service: block duplicate node jobs and avoid busy builders

This commit is contained in:
codex 2026-04-19 21:54:51 -03:00
parent 611270127f
commit db000e8e48
10 changed files with 287 additions and 4 deletions

View File

@ -210,7 +210,13 @@ func (a *App) Build(node string) (*Job, error) {
if err := a.ensureReplacementReady(node); err != nil {
return nil, err
}
job := a.newJob("build", node, "", "")
if active := a.activeJobForNode(node); active != nil {
return nil, &activeNodeJobError{Node: node, Kind: active.Kind, JobID: active.ID}
}
job, err := a.reserveJob("build", node, "", "")
if err != nil {
return nil, err
}
go a.runBuild(job, false)
return job, nil
}
@ -223,10 +229,16 @@ func (a *App) Replace(node, host, device string) (*Job, error) {
if err := a.ensureReplacementReady(node); err != nil {
return nil, err
}
if active := a.activeJobForNode(node); active != nil {
return nil, &activeNodeJobError{Node: node, Kind: active.Kind, JobID: active.ID}
}
if _, err := a.ensureDevice(host, device); err != nil {
return nil, err
}
job := a.newJob("replace", node, host, device)
job, err := a.reserveJob("replace", node, host, device)
if err != nil {
return nil, err
}
go a.runBuild(job, true)
return job, nil
}

View File

@ -34,6 +34,76 @@ func (a *App) newJob(kind, node, host, device string) *Job {
return job
}
type activeNodeJobError struct {
Node string
Kind string
JobID string
}
func (e *activeNodeJobError) Error() string {
if e == nil {
return "node already has an active metis job"
}
return fmt.Sprintf("node %s already has an active %s job (%s)", e.Node, e.Kind, e.JobID)
}
func (a *App) activeJobForNode(node string) *Job {
a.mu.RLock()
defer a.mu.RUnlock()
return a.activeJobForNodeLocked(node)
}
func (a *App) activeJobForNodeLocked(node string) *Job {
node = strings.TrimSpace(node)
if node == "" {
return nil
}
var active *Job
for _, job := range a.jobs {
if job == nil || strings.TrimSpace(job.Node) != node {
continue
}
if job.Status != JobQueued && job.Status != JobRunning {
continue
}
switch job.Kind {
case "build", "replace":
default:
continue
}
if active == nil || job.StartedAt.Before(active.StartedAt) {
active = job
}
}
if active == nil {
return nil
}
copyJob := *active
return &copyJob
}
func (a *App) reserveJob(kind, node, host, device string) (*Job, error) {
a.mu.Lock()
defer a.mu.Unlock()
if active := a.activeJobForNodeLocked(node); active != nil {
return nil, &activeNodeJobError{Node: node, Kind: active.Kind, JobID: active.ID}
}
now := time.Now().UTC()
job := &Job{
ID: fmt.Sprintf("%d", now.UnixNano()),
Kind: kind,
Node: node,
Host: host,
Device: device,
Status: JobQueued,
ProgressPct: 0,
StartedAt: now,
UpdatedAt: now,
}
a.jobs[job.ID] = job
return job, nil
}
func (a *App) job(id string) *Job {
a.mu.RLock()
defer a.mu.RUnlock()

View File

@ -158,6 +158,54 @@ func clusterNodes() []clusterNode {
return nodes
}
func clusterActiveRemotePodLoads(namespace, run string) map[string]int {
kube, err := kubeClientFactory()
if err != nil {
return nil
}
ns := url.PathEscape(strings.TrimSpace(namespace))
if ns == "" {
return nil
}
selector := "app=metis-remote"
if value := strings.TrimSpace(run); value != "" {
selector += ",metis-run=" + value
}
path := fmt.Sprintf("/api/v1/namespaces/%s/pods?labelSelector=%s", ns, url.QueryEscape(selector))
var payload struct {
Items []struct {
Metadata struct {
Labels map[string]string `json:"labels"`
} `json:"metadata"`
Spec struct {
NodeName string `json:"nodeName"`
} `json:"spec"`
Status struct {
Phase string `json:"phase"`
} `json:"status"`
} `json:"items"`
}
if err := kube.jsonRequest(http.MethodGet, path, nil, &payload); err != nil {
return nil
}
loads := map[string]int{}
for _, item := range payload.Items {
phase := strings.TrimSpace(item.Status.Phase)
if phase == "Succeeded" || phase == "Failed" {
continue
}
if value := strings.TrimSpace(run); value != "" && strings.TrimSpace(item.Metadata.Labels["metis-run"]) != value {
continue
}
nodeName := strings.TrimSpace(item.Spec.NodeName)
if nodeName == "" {
continue
}
loads[nodeName]++
}
return loads
}
func (a *App) podImageForArch(arch string) string {
switch strings.TrimSpace(arch) {
case "arm64":

View File

@ -121,3 +121,50 @@ func TestDeleteNodeObjectFallback(t *testing.T) {
t.Fatalf("deleteNodeObject fallback: %v", err)
}
}
func TestClusterActiveRemotePodLoadsCountsOnlyLivePods(t *testing.T) {
kube := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch {
case r.Method == http.MethodGet && r.URL.Path == "/api/v1/namespaces/maintenance/pods":
if got := r.URL.Query().Get("labelSelector"); got != "app=metis-remote,metis-run=build" {
t.Fatalf("unexpected labelSelector %q", got)
}
_ = json.NewEncoder(w).Encode(map[string]any{
"items": []any{
map[string]any{
"metadata": map[string]any{"labels": map[string]string{"app": "metis-remote", "metis-run": "build"}},
"spec": map[string]any{"nodeName": "titan-04"},
"status": map[string]any{"phase": "Running"},
},
map[string]any{
"metadata": map[string]any{"labels": map[string]string{"app": "metis-remote", "metis-run": "build"}},
"spec": map[string]any{"nodeName": "titan-04"},
"status": map[string]any{"phase": "Pending"},
},
map[string]any{
"metadata": map[string]any{"labels": map[string]string{"app": "metis-remote", "metis-run": "build"}},
"spec": map[string]any{"nodeName": "titan-05"},
"status": map[string]any{"phase": "Succeeded"},
},
},
})
default:
http.NotFound(w, r)
}
}))
defer kube.Close()
origFactory := kubeClientFactory
kubeClientFactory = func() (*kubeClient, error) {
return kubeClientFactoryForURL(kube.URL, kube.Client()), nil
}
t.Cleanup(func() { kubeClientFactory = origFactory })
loads := clusterActiveRemotePodLoads("maintenance", "build")
if loads["titan-04"] != 2 {
t.Fatalf("expected titan-04 load 2, got %#v", loads)
}
if _, ok := loads["titan-05"]; ok {
t.Fatalf("expected succeeded pod to be ignored, got %#v", loads)
}
}

View File

@ -6,6 +6,7 @@ import (
"os"
"path/filepath"
"reflect"
"strings"
"testing"
"time"
@ -125,6 +126,17 @@ func TestAppJobDeviceAndStateHelpers(t *testing.T) {
}
}
func TestBuildAndReplaceRejectDuplicateActiveNodeJobs(t *testing.T) {
app := newTestApp(t)
active := app.newJob("replace", "titan-15", "titan-22", "/dev/sdz")
if _, err := app.Build("titan-15"); err == nil || !strings.Contains(err.Error(), active.ID) {
t.Fatalf("expected build conflict mentioning %s, got %v", active.ID, err)
}
if _, err := app.Replace("titan-15", "titan-22", "/dev/sdz"); err == nil || !strings.Contains(err.Error(), "active replace job") {
t.Fatalf("expected replace conflict, got %v", err)
}
}
func TestAppPersistenceAndTargets(t *testing.T) {
dir := t.TempDir()
invPath := filepath.Join(dir, "inventory.yaml")

View File

@ -81,6 +81,8 @@ func (a *App) ensureDevice(host, path string) (*Device, error) {
func (a *App) selectBuilderHost(arch, flashHost string) (clusterNode, error) {
nodes := clusterNodes()
activeBuilds := clusterActiveRemotePodLoads(a.settings.Namespace, "build")
activeRemotePods := clusterActiveRemotePodLoads(a.settings.Namespace, "")
storageNodes := map[string]struct{}{}
for _, node := range a.inventory.Nodes {
if len(node.LonghornDisks) > 0 {
@ -119,6 +121,12 @@ func (a *App) selectBuilderHost(arch, flashHost string) (clusterNode, error) {
if flashHost != "" && node.Name == flashHost {
score += 5
}
if count := activeBuilds[node.Name]; count > 0 {
score -= 100 * count
}
if count := activeRemotePods[node.Name]; count > 0 {
score -= 15 * count
}
candidates = append(candidates, scored{node: node, score: score})
}
sort.Slice(candidates, func(i, j int) bool {

View File

@ -1,6 +1,9 @@
package service
import (
"encoding/json"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"
@ -66,3 +69,60 @@ func TestSelectBuilderHostErrorBranch(t *testing.T) {
t.Fatal("expected selectBuilderHost error")
}
}
func TestSelectBuilderHostAvoidsBusyBuilderWhenPeersAreFree(t *testing.T) {
kube := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch {
case r.Method == http.MethodGet && r.URL.Path == "/api/v1/nodes":
_ = json.NewEncoder(w).Encode(map[string]any{
"items": []any{
map[string]any{
"metadata": map[string]any{
"name": "titan-04",
"labels": map[string]string{
"kubernetes.io/arch": "arm64",
"hardware": "rpi5",
"node-role.kubernetes.io/worker": "true",
},
},
"spec": map[string]any{"unschedulable": false},
},
map[string]any{
"metadata": map[string]any{
"name": "titan-05",
"labels": map[string]string{
"kubernetes.io/arch": "arm64",
"hardware": "rpi5",
"node-role.kubernetes.io/worker": "true",
},
},
"spec": map[string]any{"unschedulable": false},
},
},
})
case r.Method == http.MethodGet && r.URL.Path == "/api/v1/namespaces/maintenance/pods":
_ = json.NewEncoder(w).Encode(map[string]any{
"items": []any{
map[string]any{
"metadata": map[string]any{"labels": map[string]string{"app": "metis-remote", "metis-run": "build"}},
"spec": map[string]any{"nodeName": "titan-04"},
"status": map[string]any{"phase": "Running"},
},
},
})
default:
http.NotFound(w, r)
}
}))
defer kube.Close()
installKubeFactory(t, kube)
app := newTestApp(t)
app.settings.Namespace = "maintenance"
node, err := app.selectBuilderHost("arm64", "")
if err != nil {
t.Fatalf("selectBuilderHost: %v", err)
}
if node.Name != "titan-05" {
t.Fatalf("expected titan-05 builder, got %s", node.Name)
}
}

View File

@ -2,6 +2,7 @@ package service
import (
"encoding/json"
"errors"
"html/template"
"net/http"
"strings"
@ -99,7 +100,7 @@ func (a *App) handleBuild(w http.ResponseWriter, r *http.Request) {
node := values["node"]
job, err := a.Build(node)
if err != nil {
http.Error(w, err.Error(), http.StatusBadRequest)
http.Error(w, err.Error(), statusForJobError(err))
return
}
writeJSON(w, http.StatusAccepted, job)
@ -116,12 +117,20 @@ func (a *App) handleReplace(w http.ResponseWriter, r *http.Request) {
device := values["device"]
job, err := a.Replace(node, host, device)
if err != nil {
http.Error(w, err.Error(), http.StatusBadRequest)
http.Error(w, err.Error(), statusForJobError(err))
return
}
writeJSON(w, http.StatusAccepted, job)
}
func statusForJobError(err error) int {
var conflict *activeNodeJobError
if errors.As(err, &conflict) {
return http.StatusConflict
}
return http.StatusBadRequest
}
func (a *App) handleWatch(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodPost {
http.Error(w, "method not allowed", http.StatusMethodNotAllowed)

View File

@ -229,6 +229,21 @@ func TestRequestValuesFormAndAuthHelpers(t *testing.T) {
}
}
func TestHandleBuildReturnsConflictForDuplicateActiveNodeJob(t *testing.T) {
app := newTestApp(t)
app.newJob("build", "titan-15", "", "")
handler := app.Handler()
req := httptest.NewRequest(http.MethodPost, "/api/jobs/build", strings.NewReader(`{"node":"titan-15"}`))
req.Header.Set("Content-Type", "application/json")
req.Header.Set("X-Auth-Request-User", "brad")
req.Header.Set("X-Auth-Request-Groups", "admin")
resp := httptest.NewRecorder()
handler.ServeHTTP(resp, req)
if resp.Code != http.StatusConflict {
t.Fatalf("expected conflict, got %d: %s", resp.Code, resp.Body.String())
}
}
func TestHTTPHandlersExerciseErrorBranches(t *testing.T) {
kube := fakeKubeServer(t)
installKubeFactory(t, kube)

View File

@ -166,6 +166,8 @@ func fakeKubeServer(t *testing.T) *httptest.Server {
},
},
})
case r.Method == http.MethodGet && r.URL.Path == "/api/v1/namespaces/maintenance/pods":
_ = json.NewEncoder(w).Encode(map[string]any{"items": []any{}})
case r.Method == http.MethodPost && strings.Contains(r.URL.Path, "/pods"):
w.WriteHeader(http.StatusCreated)
case r.Method == http.MethodDelete && strings.Contains(r.URL.Path, "/pods/"):