metis/pkg/service/remote_error_test.go

424 lines
15 KiB
Go
Raw Normal View History

package service
import (
"encoding/json"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"testing"
"time"
)
// TestRemoteWorkflowErrorBranches covers error paths hit while the ARM64
// runner image setting is empty, then checks that a heartbeat reports non-zero
// progress for a job forced into the running "build" stage.
func TestRemoteWorkflowErrorBranches(t *testing.T) {
	kubeSrv := fakeKubeServer(t)
	installKubeFactory(t, kubeSrv)

	app := newTestApp(t)
	app.settings.Namespace = "maintenance"
	app.settings.RunnerImageARM64 = ""

	// Device refresh must report an error in this configuration.
	_, refreshErr := app.RefreshDevices("titan-22")
	if refreshErr == nil {
		t.Fatal("expected RefreshDevices to fail without runner image")
	}

	// A build job must end in the error state.
	buildJob := app.newJob("build", "titan-15", "", "")
	app.runBuild(buildJob, false)
	built := app.job(buildJob.ID)
	if built == nil || built.Status != JobError {
		t.Fatalf("expected build job error, got %#v", built)
	}

	// Flashing an artifact must fail as well.
	flashJob := app.newJob("flash", "titan-15", "titan-22", "/dev/sdz")
	flashErr := app.flashArtifact(flashJob.ID, "registry.example/metis/titan-15")
	if flashErr == nil {
		t.Fatal("expected flashArtifact error")
	}

	// Mark the flash job as running in the "build" stage, started 30 seconds
	// ago, so the heartbeat has elapsed time to report on.
	app.setJob(flashJob.ID, func(running *Job) {
		running.Status = JobRunning
		running.Stage = "build"
		running.StageStartedAt = time.Now().Add(-30 * time.Second)
	})
	app.heartbeatRemoteJob(flashJob.ID)

	beat := app.job(flashJob.ID)
	if beat == nil || beat.ProgressPct == 0 {
		t.Fatalf("expected heartbeat progress, got %#v", beat)
	}
}
// TestRemoteWorkflowMissingRunnerImageBranch runs a build with Harbor fully
// configured but the ARM64 runner image left empty, and expects the job to end
// in the error state.
func TestRemoteWorkflowMissingRunnerImageBranch(t *testing.T) {
	kubeSrv := fakeKubeServer(t)
	harborSrv := fakeHarborServer(t, true)
	installKubeFactory(t, kubeSrv)

	app := newTestApp(t)

	// Cluster settings: the runner image is deliberately blank.
	app.settings.Namespace = "maintenance"
	app.settings.RunnerImageARM64 = ""

	// Harbor settings point at the fake server.
	app.settings.HarborAPIBase = harborSrv.URL + "/api/v2.0"
	app.settings.HarborUsername = "admin"
	app.settings.HarborPassword = "pw"
	app.settings.HarborProject = "metis"
	app.settings.HarborRegistry = "registry.example"
	app.settings.ArtifactStatePath = filepath.Join(t.TempDir(), "artifacts.json")

	buildJob := app.newJob("build", "titan-15", "", "")
	app.runBuild(buildJob, false)

	result := app.job(buildJob.ID)
	if result == nil || result.Status != JobError {
		t.Fatalf("expected build job error, got %#v", result)
	}
}
// TestRefreshDevicesDefaultSortAndErrorBranches checks RefreshDevices against
// the stub Kubernetes server: ordering of a successful scan requested with an
// empty host, a failed device pod, and an undecodable device payload.
func TestRefreshDevicesDefaultSortAndErrorBranches(t *testing.T) {
	t.Run("default host and deterministic sorting", func(t *testing.T) {
		// The payload lists sdc, sdb, sda on purpose so the result must be reordered.
		unsortedPayload := `{"devices":[{"name":"sdc","path":"/dev/sdc","model":"Micro SD","transport":"usb","type":"disk","removable":true,"hotplug":true,"size_bytes":64000000000},{"name":"sdb","path":"/dev/sdb","model":"Micro SD","transport":"usb","type":"disk","removable":true,"hotplug":true,"size_bytes":32000000000},{"name":"sda","path":"/dev/sda","model":"Micro SD","transport":"usb","type":"disk","removable":true,"hotplug":true,"size_bytes":32000000000}]}`
		kubeSrv := remoteWorkflowKubeServer(t, remoteKubeOptions{
			deviceMessage: unsortedPayload,
		})
		installKubeFactory(t, kubeSrv)
		app := remoteTestApp(t, nil)

		devices, err := app.RefreshDevices("")
		if err != nil {
			t.Fatalf("RefreshDevices: %v", err)
		}
		sorted := len(devices) == 3 && devices[0].Path == "/dev/sda" && devices[1].Path == "/dev/sdb"
		if !sorted {
			t.Fatalf("unexpected sorted devices: %#v", devices)
		}
	})

	t.Run("remote pod failure records device error", func(t *testing.T) {
		kubeSrv := remoteWorkflowKubeServer(t, remoteKubeOptions{devicePhase: "Failed", deviceMessage: "device scan failed"})
		installKubeFactory(t, kubeSrv)
		app := remoteTestApp(t, nil)

		_, refreshErr := app.RefreshDevices("titan-22")
		if refreshErr == nil || !strings.Contains(refreshErr.Error(), "device scan failed") {
			t.Fatalf("expected device scan failure, got %v", refreshErr)
		}

		// The same failure must also be visible through the cached lookup.
		_, cachedErr := app.cachedDevices("titan-22")
		if cachedErr == nil || !strings.Contains(cachedErr.Error(), "device scan failed") {
			t.Fatalf("expected cached device error, got %v", cachedErr)
		}
	})

	t.Run("malformed device payload records decode error", func(t *testing.T) {
		kubeSrv := remoteWorkflowKubeServer(t, remoteKubeOptions{deviceMessage: "{"})
		installKubeFactory(t, kubeSrv)
		app := remoteTestApp(t, nil)

		_, decodeErr := app.RefreshDevices("titan-22")
		if decodeErr == nil || !strings.Contains(decodeErr.Error(), "decode remote devices") {
			t.Fatalf("expected device decode failure, got %v", decodeErr)
		}
	})
}
// TestRunBuildAdditionalRemoteBranches drives runBuild through further build
// and flash branches. Each subtest configures its own stub Kubernetes and
// Harbor servers and then inspects the final job state and recorded events.
func TestRunBuildAdditionalRemoteBranches(t *testing.T) {
	t.Run("missing inventory node", func(t *testing.T) {
		app := remoteTestApp(t, nil)
		buildJob := app.newJob("build", "missing-node", "", "")
		app.runBuild(buildJob, false)

		final := app.job(buildJob.ID)
		if final == nil || final.Status != JobError {
			t.Fatalf("expected missing-node job error, got %#v", final)
		}
	})

	t.Run("no eligible builder", func(t *testing.T) {
		// An explicitly empty (non-nil) node list makes the stub serve zero nodes.
		kubeSrv := remoteWorkflowKubeServer(t, remoteKubeOptions{nodes: []map[string]any{}})
		harborSrv := fakeHarborServer(t, true)
		installKubeFactory(t, kubeSrv)
		app := remoteTestApp(t, harborSrv)
		buildJob := app.newJob("build", "titan-15", "", "")
		app.runBuild(buildJob, false)

		final := app.job(buildJob.ID)
		failedAsExpected := final != nil && final.Status == JobError && strings.Contains(final.Error, "no build host")
		if !failedAsExpected {
			t.Fatalf("expected builder selection error, got %#v", final)
		}
	})

	t.Run("build pod failure", func(t *testing.T) {
		kubeSrv := remoteWorkflowKubeServer(t, remoteKubeOptions{buildPhase: "Failed", buildMessage: "build crashed"})
		harborSrv := fakeHarborServer(t, true)
		installKubeFactory(t, kubeSrv)
		app := remoteTestApp(t, harborSrv)
		buildJob := app.newJob("build", "titan-15", "", "")
		app.runBuild(buildJob, false)

		final := app.job(buildJob.ID)
		failedAsExpected := final != nil && final.Status == JobError && strings.Contains(final.Error, "build crashed")
		if !failedAsExpected {
			t.Fatalf("expected build pod error, got %#v", final)
		}
	})

	t.Run("build output decode failure", func(t *testing.T) {
		// "{" is not valid JSON, so the build pod's message cannot be decoded.
		kubeSrv := remoteWorkflowKubeServer(t, remoteKubeOptions{buildMessage: "{"})
		harborSrv := fakeHarborServer(t, true)
		installKubeFactory(t, kubeSrv)
		app := remoteTestApp(t, harborSrv)
		buildJob := app.newJob("build", "titan-15", "", "")
		app.runBuild(buildJob, false)

		final := app.job(buildJob.ID)
		failedAsExpected := final != nil && final.Status == JobError && strings.Contains(final.Error, "decode remote build output")
		if !failedAsExpected {
			t.Fatalf("expected build decode error, got %#v", final)
		}
	})

	t.Run("artifact persistence failure", func(t *testing.T) {
		kubeSrv := remoteWorkflowKubeServer(t, remoteKubeOptions{})
		harborSrv := fakeHarborServer(t, true)
		installKubeFactory(t, kubeSrv)
		app := remoteTestApp(t, harborSrv)
		// The state path is a directory rather than a file.
		app.settings.ArtifactStatePath = t.TempDir()
		buildJob := app.newJob("build", "titan-15", "", "")
		app.runBuild(buildJob, false)

		final := app.job(buildJob.ID)
		if final == nil || final.Status != JobError {
			t.Fatalf("expected artifact persist error, got %#v", final)
		}
	})

	t.Run("prune warning still completes build", func(t *testing.T) {
		kubeSrv := remoteWorkflowKubeServer(t, remoteKubeOptions{})
		harborSrv := harborPruneFailureServer(t)
		installKubeFactory(t, kubeSrv)
		app := remoteTestApp(t, harborSrv)
		buildJob := app.newJob("build", "titan-15", "", "")
		app.runBuild(buildJob, false)

		final := app.job(buildJob.ID)
		if final == nil || final.Status != JobDone {
			t.Fatalf("expected build to finish despite prune warning, got %#v", final)
		}

		recent := app.recentEvents(5)
		if len(recent) == 0 || recent[0].Kind != "image.build" {
			t.Fatalf("expected image build event, got %#v", recent)
		}
	})

	t.Run("flash preflight rejects stale device", func(t *testing.T) {
		kubeSrv := remoteWorkflowKubeServer(t, remoteKubeOptions{})
		harborSrv := fakeHarborServer(t, true)
		installKubeFactory(t, kubeSrv)
		app := remoteTestApp(t, harborSrv)
		// /dev/sda is not in the stub's default device list.
		replaceJob := app.newJob("replace", "titan-15", "titan-22", "/dev/sda")
		app.runBuild(replaceJob, true)

		final := app.job(replaceJob.ID)
		failedAsExpected := final != nil && final.Status == JobError && strings.Contains(final.Error, "not a current flash candidate")
		if !failedAsExpected {
			t.Fatalf("expected stale device error, got %#v", final)
		}
	})

	t.Run("flash pod failure", func(t *testing.T) {
		kubeSrv := remoteWorkflowKubeServer(t, remoteKubeOptions{flashPhase: "Failed", flashMessage: "flash failed"})
		harborSrv := fakeHarborServer(t, true)
		installKubeFactory(t, kubeSrv)
		app := remoteTestApp(t, harborSrv)
		replaceJob := app.newJob("replace", "titan-15", "titan-22", "/dev/sdz")
		app.runBuild(replaceJob, true)

		final := app.job(replaceJob.ID)
		failedAsExpected := final != nil && final.Status == JobError && strings.Contains(final.Error, "flash failed")
		if !failedAsExpected {
			t.Fatalf("expected flash pod error, got %#v", final)
		}
	})

	t.Run("host tmp flash completion message", func(t *testing.T) {
		kubeSrv := remoteWorkflowKubeServer(t, remoteKubeOptions{})
		harborSrv := fakeHarborServer(t, true)
		installKubeFactory(t, kubeSrv)
		app := remoteTestApp(t, harborSrv)
		replaceJob := app.newJob("replace", "titan-15", "titan-22", hostTmpDevicePath)
		app.runBuild(replaceJob, true)

		final := app.job(replaceJob.ID)
		completed := final != nil && final.Status == JobDone && strings.Contains(final.Message, "host /tmp")
		if !completed {
			t.Fatalf("expected hosttmp completion, got %#v", final)
		}
	})

	t.Run("node delete warning still flashes", func(t *testing.T) {
		kubeSrv := remoteWorkflowKubeServer(t, remoteKubeOptions{deleteNodeStatus: http.StatusInternalServerError})
		harborSrv := fakeHarborServer(t, true)
		installKubeFactory(t, kubeSrv)

		// Put a kubectl stub that always fails at the front of PATH.
		binDir := t.TempDir()
		stubPath := filepath.Join(binDir, "kubectl")
		stubScript := []byte("#!/usr/bin/env sh\nprintf 'delete denied' >&2\nexit 1\n")
		if err := os.WriteFile(stubPath, stubScript, 0o755); err != nil {
			t.Fatal(err)
		}
		t.Setenv("PATH", binDir+string(os.PathListSeparator)+os.Getenv("PATH"))

		app := remoteTestApp(t, harborSrv)
		replaceJob := app.newJob("replace", "titan-15", "titan-22", "/dev/sdz")
		app.runBuild(replaceJob, true)

		final := app.job(replaceJob.ID)
		if final == nil || final.Status != JobDone {
			t.Fatalf("expected flash success despite delete warning, got %#v", final)
		}

		// The failed node deletion must show up as a warning event.
		recent := app.recentEvents(10)
		warned := false
		for i := range recent {
			if recent[i].Kind == "node.delete.warning" {
				warned = true
			}
		}
		if !warned {
			t.Fatalf("expected node.delete.warning event, got %#v", app.recentEvents(10))
		}
	})
}
// TestFlashArtifactAndHeartbeatBranches checks that flashArtifact rejects a
// host the stub cluster does not list, then walks heartbeatRemoteJob through
// an empty job ID, a job that was never marked running, the "preflight" stage,
// and the "flash" stage with and without byte counters.
func TestFlashArtifactAndHeartbeatBranches(t *testing.T) {
	kubeSrv := remoteWorkflowKubeServer(t, remoteKubeOptions{})
	installKubeFactory(t, kubeSrv)
	app := remoteTestApp(t, nil)

	replaceJob := app.newJob("replace", "titan-15", "missing-host", "/dev/sdz")
	id := replaceJob.ID

	flashErr := app.flashArtifact(id, "registry.example/metis/titan-15")
	if flashErr == nil || !strings.Contains(flashErr.Error(), "not a current cluster node") {
		t.Fatalf("expected missing host flashArtifact error, got %v", flashErr)
	}

	// An empty ID and a job that is not running must both leave progress at zero.
	app.heartbeatRemoteJob("")
	app.heartbeatRemoteJob(id)
	idle := app.job(id)
	if idle == nil || idle.ProgressPct != 0 {
		t.Fatalf("queued heartbeat should be a no-op, got %#v", idle)
	}

	// Preflight stage.
	app.setJob(id, func(state *Job) {
		state.Status = JobRunning
		state.Stage = "preflight"
		state.Device = "/dev/sdz"
		state.Host = "titan-22"
		state.ProgressPct = 10
	})
	app.heartbeatRemoteJob(id)
	preflight := app.job(id)
	preflightOK := preflight != nil && preflight.ProgressPct == 80 && strings.Contains(preflight.Message, "Validating")
	if !preflightOK {
		t.Fatalf("preflight heartbeat = %#v", preflight)
	}

	// Flash stage with byte counters where Written exceeds Total.
	app.setJob(id, func(state *Job) {
		state.Stage = "flash"
		state.ProgressPct = 80
		state.Written = 120
		state.Total = 100
	})
	app.heartbeatRemoteJob(id)
	byBytes := app.job(id)
	byBytesOK := byBytes != nil && byBytes.ProgressPct == 98 && strings.Contains(byBytes.Message, "Writing")
	if !byBytesOK {
		t.Fatalf("flash byte heartbeat = %#v", byBytes)
	}

	// Flash stage without byte counters or a stage start time; only StartedAt is set.
	app.setJob(id, func(state *Job) {
		state.Stage = "flash"
		state.StageStartedAt = time.Time{}
		state.StartedAt = time.Now().Add(-20 * time.Second)
		state.ProgressPct = 80
		state.Written = 0
		state.Total = 0
	})
	app.heartbeatRemoteJob(id)
	byElapsed := app.job(id)
	byElapsedOK := byElapsed != nil && byElapsed.ProgressPct > 80 && strings.Contains(byElapsed.Message, "Writing")
	if !byElapsedOK {
		t.Fatalf("flash elapsed heartbeat = %#v", byElapsed)
	}
}
// remoteKubeOptions customises the stub Kubernetes API built by
// remoteWorkflowKubeServer. Zero values select that server's defaults.
type remoteKubeOptions struct {
	// nodes is served as the "items" list for GET /api/v1/nodes. A nil slice
	// selects the default single node; an empty non-nil slice serves no nodes.
	nodes []map[string]any

	// devicePhase and deviceMessage are the status phase and message reported
	// for pods whose name contains "devices".
	devicePhase, deviceMessage string

	// buildPhase and buildMessage are reported for pods whose name contains "build".
	buildPhase, buildMessage string

	// flashPhase and flashMessage are reported for pods whose name contains "flash".
	flashPhase, flashMessage string

	// deleteNodeStatus is the HTTP status written for DELETE requests on node
	// paths; 0 is treated as http.StatusOK.
	deleteNodeStatus int
}
// remoteTestApp returns a test App configured for the remote workflow: the
// "maintenance" namespace, an ARM64 runner image, the "metis" Harbor project
// and registry, and an artifact state file inside a fresh temp directory.
//
// When harbor is non-nil, the Harbor API base URL and credentials are pointed
// at that server; otherwise those settings are left as newTestApp set them.
func remoteTestApp(t *testing.T, harbor *httptest.Server) *App {
	t.Helper()

	app := newTestApp(t)
	app.settings.Namespace = "maintenance"
	app.settings.RunnerImageARM64 = "runner:arm64"
	app.settings.HarborProject = "metis"
	app.settings.HarborRegistry = "registry.example"
	app.settings.ArtifactStatePath = filepath.Join(t.TempDir(), "artifacts.json")

	if harbor == nil {
		return app
	}

	app.settings.HarborAPIBase = harbor.URL + "/api/v2.0"
	app.settings.HarborUsername = "admin"
	app.settings.HarborPassword = "pw"
	return app
}
// remoteWorkflowKubeServer starts a stub Kubernetes API server for remote
// workflow tests and registers its shutdown with t.Cleanup, so the listener is
// released when the calling test (or subtest) finishes.
//
// Empty or whitespace-only option strings fall back to defaults: every pod
// phase defaults to "Succeeded" and every message to a canned JSON payload. A
// nil opts.nodes selects a single schedulable arm64 worker named "titan-22",
// and a zero opts.deleteNodeStatus is treated as http.StatusOK.
//
// Routes served:
//   - GET /api/v1/nodes: the node list.
//   - GET /api/v1/namespaces/maintenance/pods: an empty pod list.
//   - POST on a path containing "/pods": 201 Created with no body.
//   - DELETE on a path containing "/nodes/": opts.deleteNodeStatus.
//   - DELETE on a path containing "/pods/": 200 OK.
//   - GET on a pod path ending in "/log": the text "remote logs".
//   - GET on any other pod path: a pod whose status phase and message are
//     chosen by whether its name contains "devices", "build" or "flash".
//
// Anything else is answered with 404.
func remoteWorkflowKubeServer(t *testing.T, opts remoteKubeOptions) *httptest.Server {
	t.Helper()

	devicePhase := defaultString(opts.devicePhase, "Succeeded")
	deviceMessage := defaultString(opts.deviceMessage, `{"devices":[{"name":"sdz","path":"/dev/sdz","model":"Micro SD","transport":"usb","type":"disk","removable":true,"hotplug":true,"size_bytes":32000000000},{"name":"tmp","path":"hosttmp:///var/tmp/metis-flash-test","model":"Host scratch","transport":"test","type":"file","note":"Test-only host write target under /var/tmp/metis-flash-test","size_bytes":1}]}`)
	buildPhase := defaultString(opts.buildPhase, "Succeeded")
	buildMessage := defaultString(opts.buildMessage, `{"local_path":"/workspace/build/titan-15.img.xz","compressed":true,"size_bytes":1234,"build_tag":"build-1"}`)
	flashPhase := defaultString(opts.flashPhase, "Succeeded")
	flashMessage := defaultString(opts.flashMessage, `{"dest_path":"/var/tmp/metis-flash-test/titan-15.img"}`)

	// Only a nil slice selects the default node; an explicitly empty slice is
	// kept so a test can simulate a cluster without nodes.
	nodes := opts.nodes
	if nodes == nil {
		nodes = []map[string]any{
			{
				"metadata": map[string]any{
					"name": "titan-22",
					"labels": map[string]string{
						"kubernetes.io/arch":             "arm64",
						"hardware":                       "rpi5",
						"node-role.kubernetes.io/worker": "true",
					},
				},
				"spec": map[string]any{"unschedulable": false},
			},
		}
	}

	deleteNodeStatus := opts.deleteNodeStatus
	if deleteNodeStatus == 0 {
		deleteNodeStatus = http.StatusOK
	}

	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Case order matters: the exact-path and "/log" cases must come before
		// the generic pod lookup that shares their prefix.
		switch {
		case r.Method == http.MethodGet && r.URL.Path == "/api/v1/nodes":
			// Encode errors are ignored: a broken client connection is not
			// something this stub can act on.
			_ = json.NewEncoder(w).Encode(map[string]any{"items": nodes})
		case r.Method == http.MethodGet && r.URL.Path == "/api/v1/namespaces/maintenance/pods":
			_ = json.NewEncoder(w).Encode(map[string]any{"items": []any{}})
		case r.Method == http.MethodPost && strings.Contains(r.URL.Path, "/pods"):
			w.WriteHeader(http.StatusCreated)
		case r.Method == http.MethodDelete && strings.Contains(r.URL.Path, "/nodes/"):
			w.WriteHeader(deleteNodeStatus)
		case r.Method == http.MethodDelete && strings.Contains(r.URL.Path, "/pods/"):
			w.WriteHeader(http.StatusOK)
		case r.Method == http.MethodGet && strings.Contains(r.URL.Path, "/pods/") && strings.HasSuffix(r.URL.Path, "/log"):
			_, _ = w.Write([]byte("remote logs"))
		case r.Method == http.MethodGet && strings.Contains(r.URL.Path, "/pods/"):
			// The last path segment is the pod name; its substring decides
			// which configured phase and message are reported.
			podName := filepath.Base(r.URL.Path)
			phase, message := "Succeeded", "{}"
			switch {
			case strings.Contains(podName, "devices"):
				phase, message = devicePhase, deviceMessage
			case strings.Contains(podName, "build"):
				phase, message = buildPhase, buildMessage
			case strings.Contains(podName, "flash"):
				phase, message = flashPhase, flashMessage
			}
			_ = json.NewEncoder(w).Encode(map[string]any{
				"metadata": map[string]any{"name": podName},
				"status": map[string]any{
					"phase":   phase,
					"reason":  "Completed",
					"message": message,
				},
			})
		default:
			http.NotFound(w, r)
		}
	}))
	// Close is safe to call more than once, so this does not conflict with a
	// caller that also closes the server.
	t.Cleanup(srv.Close)
	return srv
}
// harborPruneFailureServer starts a stub Harbor API server and registers its
// shutdown with t.Cleanup, so the listener is released when the calling test
// (or subtest) finishes.
//
// Routes served:
//   - GET on a path starting with "/api/v2.0/projects": 200 with a JSON list
//     holding the single project "metis".
//   - GET on any other path containing "/artifacts": 500 with the body
//     "artifact list failed".
//
// Anything else is answered with 404.
//
// NOTE(review): the project-prefix case is evaluated first, so an artifacts
// URL nested under /api/v2.0/projects/... is answered by it and never reaches
// the 500 branch. Confirm which path the prune code actually requests.
func harborPruneFailureServer(t *testing.T) *httptest.Server {
	t.Helper()

	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.Method == http.MethodGet && strings.HasPrefix(r.URL.Path, "/api/v2.0/projects"):
			// Encode errors are ignored: a broken client connection is not
			// something this stub can act on.
			_ = json.NewEncoder(w).Encode([]map[string]string{{"name": "metis"}})
		case r.Method == http.MethodGet && strings.Contains(r.URL.Path, "/artifacts"):
			http.Error(w, "artifact list failed", http.StatusInternalServerError)
		default:
			http.NotFound(w, r)
		}
	}))
	// Close is safe to call more than once, so this does not conflict with a
	// caller that also closes the server.
	t.Cleanup(srv.Close)
	return srv
}
// defaultString returns value unchanged unless it is empty or consists only of
// whitespace, in which case fallback is returned instead.
func defaultString(value, fallback string) string {
	if len(strings.TrimSpace(value)) > 0 {
		return value
	}
	return fallback
}