service: block duplicate node jobs and avoid busy builders
This commit is contained in:
parent
611270127f
commit
db000e8e48
@ -210,7 +210,13 @@ func (a *App) Build(node string) (*Job, error) {
|
|||||||
if err := a.ensureReplacementReady(node); err != nil {
|
if err := a.ensureReplacementReady(node); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
job := a.newJob("build", node, "", "")
|
if active := a.activeJobForNode(node); active != nil {
|
||||||
|
return nil, &activeNodeJobError{Node: node, Kind: active.Kind, JobID: active.ID}
|
||||||
|
}
|
||||||
|
job, err := a.reserveJob("build", node, "", "")
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
go a.runBuild(job, false)
|
go a.runBuild(job, false)
|
||||||
return job, nil
|
return job, nil
|
||||||
}
|
}
|
||||||
@ -223,10 +229,16 @@ func (a *App) Replace(node, host, device string) (*Job, error) {
|
|||||||
if err := a.ensureReplacementReady(node); err != nil {
|
if err := a.ensureReplacementReady(node); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
if active := a.activeJobForNode(node); active != nil {
|
||||||
|
return nil, &activeNodeJobError{Node: node, Kind: active.Kind, JobID: active.ID}
|
||||||
|
}
|
||||||
if _, err := a.ensureDevice(host, device); err != nil {
|
if _, err := a.ensureDevice(host, device); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
job := a.newJob("replace", node, host, device)
|
job, err := a.reserveJob("replace", node, host, device)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
go a.runBuild(job, true)
|
go a.runBuild(job, true)
|
||||||
return job, nil
|
return job, nil
|
||||||
}
|
}
|
||||||
|
|||||||
@ -34,6 +34,76 @@ func (a *App) newJob(kind, node, host, device string) *Job {
|
|||||||
return job
|
return job
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type activeNodeJobError struct {
|
||||||
|
Node string
|
||||||
|
Kind string
|
||||||
|
JobID string
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e *activeNodeJobError) Error() string {
|
||||||
|
if e == nil {
|
||||||
|
return "node already has an active metis job"
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("node %s already has an active %s job (%s)", e.Node, e.Kind, e.JobID)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) activeJobForNode(node string) *Job {
|
||||||
|
a.mu.RLock()
|
||||||
|
defer a.mu.RUnlock()
|
||||||
|
return a.activeJobForNodeLocked(node)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) activeJobForNodeLocked(node string) *Job {
|
||||||
|
node = strings.TrimSpace(node)
|
||||||
|
if node == "" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
var active *Job
|
||||||
|
for _, job := range a.jobs {
|
||||||
|
if job == nil || strings.TrimSpace(job.Node) != node {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if job.Status != JobQueued && job.Status != JobRunning {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
switch job.Kind {
|
||||||
|
case "build", "replace":
|
||||||
|
default:
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if active == nil || job.StartedAt.Before(active.StartedAt) {
|
||||||
|
active = job
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if active == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
copyJob := *active
|
||||||
|
return ©Job
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) reserveJob(kind, node, host, device string) (*Job, error) {
|
||||||
|
a.mu.Lock()
|
||||||
|
defer a.mu.Unlock()
|
||||||
|
if active := a.activeJobForNodeLocked(node); active != nil {
|
||||||
|
return nil, &activeNodeJobError{Node: node, Kind: active.Kind, JobID: active.ID}
|
||||||
|
}
|
||||||
|
now := time.Now().UTC()
|
||||||
|
job := &Job{
|
||||||
|
ID: fmt.Sprintf("%d", now.UnixNano()),
|
||||||
|
Kind: kind,
|
||||||
|
Node: node,
|
||||||
|
Host: host,
|
||||||
|
Device: device,
|
||||||
|
Status: JobQueued,
|
||||||
|
ProgressPct: 0,
|
||||||
|
StartedAt: now,
|
||||||
|
UpdatedAt: now,
|
||||||
|
}
|
||||||
|
a.jobs[job.ID] = job
|
||||||
|
return job, nil
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) job(id string) *Job {
|
func (a *App) job(id string) *Job {
|
||||||
a.mu.RLock()
|
a.mu.RLock()
|
||||||
defer a.mu.RUnlock()
|
defer a.mu.RUnlock()
|
||||||
|
|||||||
@ -158,6 +158,54 @@ func clusterNodes() []clusterNode {
|
|||||||
return nodes
|
return nodes
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func clusterActiveRemotePodLoads(namespace, run string) map[string]int {
|
||||||
|
kube, err := kubeClientFactory()
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
ns := url.PathEscape(strings.TrimSpace(namespace))
|
||||||
|
if ns == "" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
selector := "app=metis-remote"
|
||||||
|
if value := strings.TrimSpace(run); value != "" {
|
||||||
|
selector += ",metis-run=" + value
|
||||||
|
}
|
||||||
|
path := fmt.Sprintf("/api/v1/namespaces/%s/pods?labelSelector=%s", ns, url.QueryEscape(selector))
|
||||||
|
var payload struct {
|
||||||
|
Items []struct {
|
||||||
|
Metadata struct {
|
||||||
|
Labels map[string]string `json:"labels"`
|
||||||
|
} `json:"metadata"`
|
||||||
|
Spec struct {
|
||||||
|
NodeName string `json:"nodeName"`
|
||||||
|
} `json:"spec"`
|
||||||
|
Status struct {
|
||||||
|
Phase string `json:"phase"`
|
||||||
|
} `json:"status"`
|
||||||
|
} `json:"items"`
|
||||||
|
}
|
||||||
|
if err := kube.jsonRequest(http.MethodGet, path, nil, &payload); err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
loads := map[string]int{}
|
||||||
|
for _, item := range payload.Items {
|
||||||
|
phase := strings.TrimSpace(item.Status.Phase)
|
||||||
|
if phase == "Succeeded" || phase == "Failed" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if value := strings.TrimSpace(run); value != "" && strings.TrimSpace(item.Metadata.Labels["metis-run"]) != value {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
nodeName := strings.TrimSpace(item.Spec.NodeName)
|
||||||
|
if nodeName == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
loads[nodeName]++
|
||||||
|
}
|
||||||
|
return loads
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) podImageForArch(arch string) string {
|
func (a *App) podImageForArch(arch string) string {
|
||||||
switch strings.TrimSpace(arch) {
|
switch strings.TrimSpace(arch) {
|
||||||
case "arm64":
|
case "arm64":
|
||||||
|
|||||||
@ -121,3 +121,50 @@ func TestDeleteNodeObjectFallback(t *testing.T) {
|
|||||||
t.Fatalf("deleteNodeObject fallback: %v", err)
|
t.Fatalf("deleteNodeObject fallback: %v", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestClusterActiveRemotePodLoadsCountsOnlyLivePods(t *testing.T) {
|
||||||
|
kube := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
switch {
|
||||||
|
case r.Method == http.MethodGet && r.URL.Path == "/api/v1/namespaces/maintenance/pods":
|
||||||
|
if got := r.URL.Query().Get("labelSelector"); got != "app=metis-remote,metis-run=build" {
|
||||||
|
t.Fatalf("unexpected labelSelector %q", got)
|
||||||
|
}
|
||||||
|
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||||
|
"items": []any{
|
||||||
|
map[string]any{
|
||||||
|
"metadata": map[string]any{"labels": map[string]string{"app": "metis-remote", "metis-run": "build"}},
|
||||||
|
"spec": map[string]any{"nodeName": "titan-04"},
|
||||||
|
"status": map[string]any{"phase": "Running"},
|
||||||
|
},
|
||||||
|
map[string]any{
|
||||||
|
"metadata": map[string]any{"labels": map[string]string{"app": "metis-remote", "metis-run": "build"}},
|
||||||
|
"spec": map[string]any{"nodeName": "titan-04"},
|
||||||
|
"status": map[string]any{"phase": "Pending"},
|
||||||
|
},
|
||||||
|
map[string]any{
|
||||||
|
"metadata": map[string]any{"labels": map[string]string{"app": "metis-remote", "metis-run": "build"}},
|
||||||
|
"spec": map[string]any{"nodeName": "titan-05"},
|
||||||
|
"status": map[string]any{"phase": "Succeeded"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
})
|
||||||
|
default:
|
||||||
|
http.NotFound(w, r)
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
defer kube.Close()
|
||||||
|
|
||||||
|
origFactory := kubeClientFactory
|
||||||
|
kubeClientFactory = func() (*kubeClient, error) {
|
||||||
|
return kubeClientFactoryForURL(kube.URL, kube.Client()), nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { kubeClientFactory = origFactory })
|
||||||
|
|
||||||
|
loads := clusterActiveRemotePodLoads("maintenance", "build")
|
||||||
|
if loads["titan-04"] != 2 {
|
||||||
|
t.Fatalf("expected titan-04 load 2, got %#v", loads)
|
||||||
|
}
|
||||||
|
if _, ok := loads["titan-05"]; ok {
|
||||||
|
t.Fatalf("expected succeeded pod to be ignored, got %#v", loads)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@ -6,6 +6,7 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"reflect"
|
"reflect"
|
||||||
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@ -125,6 +126,17 @@ func TestAppJobDeviceAndStateHelpers(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestBuildAndReplaceRejectDuplicateActiveNodeJobs(t *testing.T) {
|
||||||
|
app := newTestApp(t)
|
||||||
|
active := app.newJob("replace", "titan-15", "titan-22", "/dev/sdz")
|
||||||
|
if _, err := app.Build("titan-15"); err == nil || !strings.Contains(err.Error(), active.ID) {
|
||||||
|
t.Fatalf("expected build conflict mentioning %s, got %v", active.ID, err)
|
||||||
|
}
|
||||||
|
if _, err := app.Replace("titan-15", "titan-22", "/dev/sdz"); err == nil || !strings.Contains(err.Error(), "active replace job") {
|
||||||
|
t.Fatalf("expected replace conflict, got %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestAppPersistenceAndTargets(t *testing.T) {
|
func TestAppPersistenceAndTargets(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
invPath := filepath.Join(dir, "inventory.yaml")
|
invPath := filepath.Join(dir, "inventory.yaml")
|
||||||
|
|||||||
@ -81,6 +81,8 @@ func (a *App) ensureDevice(host, path string) (*Device, error) {
|
|||||||
|
|
||||||
func (a *App) selectBuilderHost(arch, flashHost string) (clusterNode, error) {
|
func (a *App) selectBuilderHost(arch, flashHost string) (clusterNode, error) {
|
||||||
nodes := clusterNodes()
|
nodes := clusterNodes()
|
||||||
|
activeBuilds := clusterActiveRemotePodLoads(a.settings.Namespace, "build")
|
||||||
|
activeRemotePods := clusterActiveRemotePodLoads(a.settings.Namespace, "")
|
||||||
storageNodes := map[string]struct{}{}
|
storageNodes := map[string]struct{}{}
|
||||||
for _, node := range a.inventory.Nodes {
|
for _, node := range a.inventory.Nodes {
|
||||||
if len(node.LonghornDisks) > 0 {
|
if len(node.LonghornDisks) > 0 {
|
||||||
@ -119,6 +121,12 @@ func (a *App) selectBuilderHost(arch, flashHost string) (clusterNode, error) {
|
|||||||
if flashHost != "" && node.Name == flashHost {
|
if flashHost != "" && node.Name == flashHost {
|
||||||
score += 5
|
score += 5
|
||||||
}
|
}
|
||||||
|
if count := activeBuilds[node.Name]; count > 0 {
|
||||||
|
score -= 100 * count
|
||||||
|
}
|
||||||
|
if count := activeRemotePods[node.Name]; count > 0 {
|
||||||
|
score -= 15 * count
|
||||||
|
}
|
||||||
candidates = append(candidates, scored{node: node, score: score})
|
candidates = append(candidates, scored{node: node, score: score})
|
||||||
}
|
}
|
||||||
sort.Slice(candidates, func(i, j int) bool {
|
sort.Slice(candidates, func(i, j int) bool {
|
||||||
|
|||||||
@ -1,6 +1,9 @@
|
|||||||
package service
|
package service
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"net/http"
|
||||||
|
"net/http/httptest"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
@ -66,3 +69,60 @@ func TestSelectBuilderHostErrorBranch(t *testing.T) {
|
|||||||
t.Fatal("expected selectBuilderHost error")
|
t.Fatal("expected selectBuilderHost error")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestSelectBuilderHostAvoidsBusyBuilderWhenPeersAreFree(t *testing.T) {
|
||||||
|
kube := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
switch {
|
||||||
|
case r.Method == http.MethodGet && r.URL.Path == "/api/v1/nodes":
|
||||||
|
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||||
|
"items": []any{
|
||||||
|
map[string]any{
|
||||||
|
"metadata": map[string]any{
|
||||||
|
"name": "titan-04",
|
||||||
|
"labels": map[string]string{
|
||||||
|
"kubernetes.io/arch": "arm64",
|
||||||
|
"hardware": "rpi5",
|
||||||
|
"node-role.kubernetes.io/worker": "true",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"spec": map[string]any{"unschedulable": false},
|
||||||
|
},
|
||||||
|
map[string]any{
|
||||||
|
"metadata": map[string]any{
|
||||||
|
"name": "titan-05",
|
||||||
|
"labels": map[string]string{
|
||||||
|
"kubernetes.io/arch": "arm64",
|
||||||
|
"hardware": "rpi5",
|
||||||
|
"node-role.kubernetes.io/worker": "true",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"spec": map[string]any{"unschedulable": false},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
})
|
||||||
|
case r.Method == http.MethodGet && r.URL.Path == "/api/v1/namespaces/maintenance/pods":
|
||||||
|
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||||
|
"items": []any{
|
||||||
|
map[string]any{
|
||||||
|
"metadata": map[string]any{"labels": map[string]string{"app": "metis-remote", "metis-run": "build"}},
|
||||||
|
"spec": map[string]any{"nodeName": "titan-04"},
|
||||||
|
"status": map[string]any{"phase": "Running"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
})
|
||||||
|
default:
|
||||||
|
http.NotFound(w, r)
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
defer kube.Close()
|
||||||
|
installKubeFactory(t, kube)
|
||||||
|
app := newTestApp(t)
|
||||||
|
app.settings.Namespace = "maintenance"
|
||||||
|
node, err := app.selectBuilderHost("arm64", "")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("selectBuilderHost: %v", err)
|
||||||
|
}
|
||||||
|
if node.Name != "titan-05" {
|
||||||
|
t.Fatalf("expected titan-05 builder, got %s", node.Name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@ -2,6 +2,7 @@ package service
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
"html/template"
|
"html/template"
|
||||||
"net/http"
|
"net/http"
|
||||||
"strings"
|
"strings"
|
||||||
@ -99,7 +100,7 @@ func (a *App) handleBuild(w http.ResponseWriter, r *http.Request) {
|
|||||||
node := values["node"]
|
node := values["node"]
|
||||||
job, err := a.Build(node)
|
job, err := a.Build(node)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
http.Error(w, err.Error(), http.StatusBadRequest)
|
http.Error(w, err.Error(), statusForJobError(err))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
writeJSON(w, http.StatusAccepted, job)
|
writeJSON(w, http.StatusAccepted, job)
|
||||||
@ -116,12 +117,20 @@ func (a *App) handleReplace(w http.ResponseWriter, r *http.Request) {
|
|||||||
device := values["device"]
|
device := values["device"]
|
||||||
job, err := a.Replace(node, host, device)
|
job, err := a.Replace(node, host, device)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
http.Error(w, err.Error(), http.StatusBadRequest)
|
http.Error(w, err.Error(), statusForJobError(err))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
writeJSON(w, http.StatusAccepted, job)
|
writeJSON(w, http.StatusAccepted, job)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func statusForJobError(err error) int {
|
||||||
|
var conflict *activeNodeJobError
|
||||||
|
if errors.As(err, &conflict) {
|
||||||
|
return http.StatusConflict
|
||||||
|
}
|
||||||
|
return http.StatusBadRequest
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) handleWatch(w http.ResponseWriter, r *http.Request) {
|
func (a *App) handleWatch(w http.ResponseWriter, r *http.Request) {
|
||||||
if r.Method != http.MethodPost {
|
if r.Method != http.MethodPost {
|
||||||
http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
|
http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
|
||||||
|
|||||||
@ -229,6 +229,21 @@ func TestRequestValuesFormAndAuthHelpers(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestHandleBuildReturnsConflictForDuplicateActiveNodeJob(t *testing.T) {
|
||||||
|
app := newTestApp(t)
|
||||||
|
app.newJob("build", "titan-15", "", "")
|
||||||
|
handler := app.Handler()
|
||||||
|
req := httptest.NewRequest(http.MethodPost, "/api/jobs/build", strings.NewReader(`{"node":"titan-15"}`))
|
||||||
|
req.Header.Set("Content-Type", "application/json")
|
||||||
|
req.Header.Set("X-Auth-Request-User", "brad")
|
||||||
|
req.Header.Set("X-Auth-Request-Groups", "admin")
|
||||||
|
resp := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(resp, req)
|
||||||
|
if resp.Code != http.StatusConflict {
|
||||||
|
t.Fatalf("expected conflict, got %d: %s", resp.Code, resp.Body.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestHTTPHandlersExerciseErrorBranches(t *testing.T) {
|
func TestHTTPHandlersExerciseErrorBranches(t *testing.T) {
|
||||||
kube := fakeKubeServer(t)
|
kube := fakeKubeServer(t)
|
||||||
installKubeFactory(t, kube)
|
installKubeFactory(t, kube)
|
||||||
|
|||||||
@ -166,6 +166,8 @@ func fakeKubeServer(t *testing.T) *httptest.Server {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
})
|
})
|
||||||
|
case r.Method == http.MethodGet && r.URL.Path == "/api/v1/namespaces/maintenance/pods":
|
||||||
|
_ = json.NewEncoder(w).Encode(map[string]any{"items": []any{}})
|
||||||
case r.Method == http.MethodPost && strings.Contains(r.URL.Path, "/pods"):
|
case r.Method == http.MethodPost && strings.Contains(r.URL.Path, "/pods"):
|
||||||
w.WriteHeader(http.StatusCreated)
|
w.WriteHeader(http.StatusCreated)
|
||||||
case r.Method == http.MethodDelete && strings.Contains(r.URL.Path, "/pods/"):
|
case r.Method == http.MethodDelete && strings.Contains(r.URL.Path, "/pods/"):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user