From 56d31fa61368822f3f2234d24861111958008299 Mon Sep 17 00:00:00 2001 From: codex Date: Fri, 24 Apr 2026 12:09:53 -0300 Subject: [PATCH] metis(ui): split build and flash workflows --- cmd/metis/remote_cmd.go | 70 ------- cmd/metis/remote_flash.go | 304 +++++++++++++++++++++++++++++ cmd/metis/remote_flash_test.go | 246 +++++++++++++++++++++++ pkg/service/app.go | 25 +++ pkg/service/app_helpers.go | 2 +- pkg/service/coverage_more_test.go | 2 +- pkg/service/flash_workflow_test.go | 182 +++++++++++++++++ pkg/service/remote.go | 171 +++++++++++----- pkg/service/remote_error_test.go | 8 +- pkg/service/remote_helpers.go | 31 ++- pkg/service/remote_helpers_test.go | 28 +++ pkg/service/remote_status.go | 18 ++ pkg/service/server.go | 18 ++ pkg/service/templates/metis.html | 55 +++++- pkg/service/workflow_test.go | 2 +- scripts/quality_gate.sh | 2 +- testing/coverage_policy.json | 1 + 17 files changed, 1021 insertions(+), 144 deletions(-) create mode 100644 cmd/metis/remote_flash.go create mode 100644 cmd/metis/remote_flash_test.go create mode 100644 pkg/service/flash_workflow_test.go diff --git a/cmd/metis/remote_cmd.go b/cmd/metis/remote_cmd.go index f1f33b8..372ba57 100644 --- a/cmd/metis/remote_cmd.go +++ b/cmd/metis/remote_cmd.go @@ -163,76 +163,6 @@ func remoteBuildCmd(args []string) { writeStructuredResult(summary) } -func remoteFlashCmd(args []string) { - fs := flag.NewFlagSet("remote-flash", flag.ExitOnError) - node := fs.String("node", "", "target node") - device := fs.String("device", "", "target device path or test sink") - artifactRef := fs.String("artifact-ref", "", "harbor artifact ref without tag") - workDir := fs.String("work-dir", filepath.Join(os.TempDir(), "metis-flash"), "working directory") - harborRegistry := fs.String("harbor-registry", getenvOr("METIS_HARBOR_REGISTRY", "registry.bstein.dev"), "harbor registry host") - harborUsername := fs.String("harbor-username", getenvOr("METIS_HARBOR_USERNAME", ""), "harbor username") - harborPassword := fs.String("harbor-password", getenvOr("METIS_HARBOR_PASSWORD", ""), "harbor password") - hostTmpDir := fs.String("host-tmp-dir", "/host-tmp", "mounted host tmp dir for test writes") - fs.Parse(args) - if *node == "" || *device == "" || *artifactRef == "" { - fatalf("--node, --device, and --artifact-ref are required") - } - - if err := os.MkdirAll(*workDir, 0o755); err != nil { - fatalf("mkdir workdir: %v", err) - } - emitStageProgress("flash", 84, fmt.Sprintf("Pulling the latest Harbor artifact for %s", *node)) - if err := orasLogin(*harborRegistry, *harborUsername, *harborPassword); err != nil { - fatalf("oras login: %v", err) - } - if err := orasPull(fmt.Sprintf("%s:latest", *artifactRef), *workDir); err != nil { - fatalf("oras pull: %v", err) - } - emitStageProgress("flash", 88, fmt.Sprintf("Preparing the downloaded image for %s", *node)) - imagePath, compressed, err := resolvePulledArtifact(*workDir) - if err != nil { - fatalf("resolve artifact: %v", err) - } - rawImage := imagePath - if compressed { - emitStageProgress("flash", 90, fmt.Sprintf("Decompressing the image for %s before writing", *node)) - rawImage = filepath.Join(*workDir, fmt.Sprintf("%s.img", *node)) - cmd := exec.Command("sh", "-lc", fmt.Sprintf("xz -dc '%s' > '%s'", imagePath, rawImage)) - if out, err := cmd.CombinedOutput(); err != nil { - fatalf("xz stream decompress: %v: %s", err, strings.TrimSpace(string(out))) - } - } - - destPath := *device - if strings.HasPrefix(destPath, "hosttmp://") { - if err := os.MkdirAll(*hostTmpDir, 0o755); err != nil { - fatalf("mkdir host tmp dir: %v", err) - } - destPath = filepath.Join(*hostTmpDir, fmt.Sprintf("%s.img", *node)) - } - emitStageProgress("flash", 92, fmt.Sprintf("Writing the latest image for %s to %s", *node, destPath)) - writeEmitter := newProgressEmitter("flash", 92, 98, fmt.Sprintf("Writing the latest image for %s", *node), true) - if err := writer.WriteImageWithProgress(context.Background(), rawImage, destPath, writeEmitter); err != nil { - fatalf("write image: %v", err) - } - emitStageProgress("flash", 99, fmt.Sprintf("Flushing the finished image for %s", *node)) - _ = exec.Command("sync").Run() - if strings.HasPrefix(destPath, "/dev/") { - _ = exec.Command("blockdev", "--flushbufs", destPath).Run() - } - - info, err := os.Stat(destPath) - if err != nil { - fatalf("stat destination: %v", err) - } - writeStructuredResult(map[string]any{ - "node": *node, - "device": *device, - "dest_path": destPath, - "size_bytes": info.Size(), - }) -} - func writeStructuredResult(payload any) { data, err := json.Marshal(payload) if err != nil { diff --git a/cmd/metis/remote_flash.go b/cmd/metis/remote_flash.go new file mode 100644 index 0000000..9839a71 --- /dev/null +++ b/cmd/metis/remote_flash.go @@ -0,0 +1,304 @@ +package main + +import ( + "context" + "encoding/json" + "flag" + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + "time" + + "metis/pkg/service" + "metis/pkg/writer" +) + +type flashPartitionTable struct { + PartitionTable struct { + Partitions []flashTablePartition `json:"partitions"` + } `json:"partitiontable"` +} + +type flashTablePartition struct { + Type string `json:"type"` +} + +type flashBlockDevicePayload struct { + Blockdevices []flashBlockDevice `json:"blockdevices"` +} + +type flashBlockDevice struct { + Path string `json:"path"` + Type string `json:"type"` + FSType string `json:"fstype"` + Label string `json:"label"` + Children []flashBlockDevice `json:"children"` +} + +var requiredBootFiles = []string{"config.txt", "cmdline.txt", "boot.scr"} + +func remoteFlashCmd(args []string) { + fs := flag.NewFlagSet("remote-flash", flag.ExitOnError) + node := fs.String("node", "", "target node") + device := fs.String("device", "", "target device path or test sink") + artifactRef := fs.String("artifact-ref", "", "harbor artifact ref without tag") + workDir := fs.String("work-dir", filepath.Join(os.TempDir(), "metis-flash"), "working directory") + harborRegistry := fs.String("harbor-registry", getenvOr("METIS_HARBOR_REGISTRY", "registry.bstein.dev"), "harbor registry host") + harborUsername := fs.String("harbor-username", getenvOr("METIS_HARBOR_USERNAME", ""), "harbor username") + harborPassword := fs.String("harbor-password", getenvOr("METIS_HARBOR_PASSWORD", ""), "harbor password") + hostTmpDir := fs.String("host-tmp-dir", "/host-tmp", "mounted host tmp dir for test writes") + fs.Parse(args) + if *node == "" || *device == "" || *artifactRef == "" { + fatalf("--node, --device, and --artifact-ref are required") + } + + if err := os.MkdirAll(*workDir, 0o755); err != nil { + fatalf("mkdir workdir: %v", err) + } + emitStageProgress("flash_pull", 84, fmt.Sprintf("Pulling the latest Harbor artifact for %s", *node)) + if err := orasLogin(*harborRegistry, *harborUsername, *harborPassword); err != nil { + fatalf("oras login: %v", err) + } + if err := orasPull(fmt.Sprintf("%s:latest", *artifactRef), *workDir); err != nil { + fatalf("oras pull: %v", err) + } + emitStageProgress("flash_prepare", 88, fmt.Sprintf("Preparing the downloaded image for %s", *node)) + imagePath, compressed, err := resolvePulledArtifact(*workDir) + if err != nil { + fatalf("resolve artifact: %v", err) + } + rawImage := imagePath + if compressed { + emitStageProgress("flash_unpack", 90, fmt.Sprintf("Decompressing the image for %s before writing", *node)) + rawImage = filepath.Join(*workDir, fmt.Sprintf("%s.img", *node)) + cmd := exec.Command("sh", "-lc", fmt.Sprintf("xz -dc '%s' > '%s'", imagePath, rawImage)) + if out, err := cmd.CombinedOutput(); err != nil { + fatalf("xz stream decompress: %v: %s", err, strings.TrimSpace(string(out))) + } + } + + destPath := *device + if strings.HasPrefix(destPath, "hosttmp://") { + if err := os.MkdirAll(*hostTmpDir, 0o755); err != nil { + fatalf("mkdir host tmp dir: %v", err) + } + destPath = filepath.Join(*hostTmpDir, fmt.Sprintf("%s.img", *node)) + } + info, err := os.Stat(rawImage) + if err != nil { + fatalf("stat raw image: %v", err) + } + imageSize := info.Size() + emitStageProgress("flash_write", 92, fmt.Sprintf("Writing the latest image for %s to %s", *node, destPath)) + writeEmitter := newProgressEmitter("flash_write", 92, 98, fmt.Sprintf("Writing the latest image for %s", *node), true) + if err := writer.WriteImageWithProgress(context.Background(), rawImage, destPath, writeEmitter); err != nil { + fatalf("write image: %v", err) + } + flushFlashTarget(destPath, *node) + result, err := verifyFlashDestination(destPath) + if err != nil { + fatalf("verify flash output: %v", err) + } + result.Node = *node + result.Device = *device + result.DestPath = destPath + result.SizeBytes = imageSize + writeStructuredResult(result) +} + +func flushFlashTarget(destPath, node string) { + emitStageProgress("flash_flush", 98.5, fmt.Sprintf("Flushing the finished image for %s", node)) + _ = exec.Command("sync").Run() + if !strings.HasPrefix(destPath, "/dev/") { + return + } + _ = exec.Command("blockdev", "--flushbufs", destPath).Run() + _ = exec.Command("blockdev", "--rereadpt", destPath).Run() + _ = exec.Command("partprobe", destPath).Run() + _ = exec.Command("udevadm", "settle", "--timeout=10").Run() +} + +func verifyFlashDestination(destPath string) (service.RemoteFlashResult, error) { + emitStageProgress("flash_verify", 99.2, fmt.Sprintf("Verifying the flashed recovery media at %s", destPath)) + if strings.HasPrefix(destPath, "/dev/") { + return verifyBlockDeviceFlash(destPath) + } + return verifyImageFileFlash(destPath) +} + +func verifyImageFileFlash(destPath string) (service.RemoteFlashResult, error) { + table, err := readFlashPartitionTable(destPath) + if err != nil { + return service.RemoteFlashResult{}, err + } + hasBoot := false + hasRoot := false + for _, part := range table.PartitionTable.Partitions { + if isBootPartitionType(part.Type) { + hasBoot = true + } + if isLinuxPartitionType(part.Type) { + hasRoot = true + } + } + if !hasBoot || !hasRoot { + return service.RemoteFlashResult{}, fmt.Errorf("image %s does not expose the expected boot and writable partitions", destPath) + } + return service.RemoteFlashResult{ + Verified: true, + VerificationKind: "image-file", + VerificationSummary: fmt.Sprintf("Verified image layout at %s; boot and writable partitions are present.", destPath), + }, nil +} + +func verifyBlockDeviceFlash(destPath string) (service.RemoteFlashResult, error) { + deadline := time.Now().Add(15 * time.Second) + var lastErr error + for time.Now().Before(deadline) { + parts, err := readBlockDevicePartitions(destPath) + if err == nil { + boot, root, classifyErr := classifyFlashPartitions(parts) + if classifyErr == nil { + checkedFiles, verifyErr := verifyBootPartitionFiles(boot.Path) + if verifyErr == nil { + bootLabel := firstNonEmpty(boot.Label, filepath.Base(boot.Path)) + rootLabel := firstNonEmpty(root.Label, filepath.Base(root.Path)) + return service.RemoteFlashResult{ + Verified: true, + VerificationKind: "block-device", + VerificationSummary: fmt.Sprintf("Verified %s; %s and %s are present and the boot files look correct.", destPath, bootLabel, rootLabel), + BootPartition: boot.Path, + RootPartition: root.Path, + BootLabel: boot.Label, + RootLabel: root.Label, + BootFSType: boot.FSType, + RootFSType: root.FSType, + CheckedFiles: checkedFiles, + }, nil + } + lastErr = verifyErr + } else { + lastErr = classifyErr + } + } else { + lastErr = err + } + _ = exec.Command("blockdev", "--rereadpt", destPath).Run() + _ = exec.Command("partprobe", destPath).Run() + _ = exec.Command("udevadm", "settle", "--timeout=10").Run() + time.Sleep(time.Second) + } + if lastErr == nil { + lastErr = fmt.Errorf("timed out waiting for the flashed partitions on %s", destPath) + } + return service.RemoteFlashResult{}, lastErr +} + +func readFlashPartitionTable(destPath string) (flashPartitionTable, error) { + out, err := exec.Command("sfdisk", "-J", destPath).CombinedOutput() + if err != nil { + return flashPartitionTable{}, fmt.Errorf("sfdisk -J %s: %v: %s", destPath, err, strings.TrimSpace(string(out))) + } + var table flashPartitionTable + if err := json.Unmarshal(out, &table); err != nil { + return flashPartitionTable{}, fmt.Errorf("decode partition table for %s: %w", destPath, err) + } + return table, nil +} + +func readBlockDevicePartitions(destPath string) ([]flashBlockDevice, error) { + out, err := exec.Command("lsblk", "-J", "-o", "PATH,TYPE,FSTYPE,LABEL", destPath).CombinedOutput() + if err != nil { + return nil, fmt.Errorf("lsblk %s: %v: %s", destPath, err, strings.TrimSpace(string(out))) + } + var payload flashBlockDevicePayload + if err := json.Unmarshal(out, &payload); err != nil { + return nil, fmt.Errorf("decode lsblk output for %s: %w", destPath, err) + } + for _, device := range payload.Blockdevices { + if device.Path == destPath { + return device.Children, nil + } + } + if len(payload.Blockdevices) == 1 { + return payload.Blockdevices[0].Children, nil + } + return nil, fmt.Errorf("lsblk did not report partitions for %s", destPath) +} + +func classifyFlashPartitions(parts []flashBlockDevice) (flashBlockDevice, flashBlockDevice, error) { + var boot flashBlockDevice + var root flashBlockDevice + for _, part := range parts { + if part.Type != "part" { + continue + } + normalizedLabel := strings.ToLower(strings.TrimSpace(part.Label)) + normalizedFS := strings.ToLower(strings.TrimSpace(part.FSType)) + if boot.Path == "" && (normalizedLabel == "system-boot" || normalizedFS == "vfat" || normalizedFS == "fat" || normalizedFS == "fat32") { + boot = part + continue + } + if root.Path == "" && (normalizedLabel == "writable" || normalizedFS == "ext4") { + root = part + } + } + if boot.Path == "" { + return flashBlockDevice{}, flashBlockDevice{}, fmt.Errorf("could not find a boot partition with a FAT filesystem") + } + if root.Path == "" { + return flashBlockDevice{}, flashBlockDevice{}, fmt.Errorf("could not find a writable ext4 partition") + } + return boot, root, nil +} + +func verifyBootPartitionFiles(partitionPath string) ([]string, error) { + mountDir, err := os.MkdirTemp("", "metis-boot-") + if err != nil { + return nil, err + } + defer os.RemoveAll(mountDir) + cmd := exec.Command("mount", "-o", "ro", partitionPath, mountDir) + if out, err := cmd.CombinedOutput(); err != nil { + return nil, fmt.Errorf("mount %s read-only: %v: %s", partitionPath, err, strings.TrimSpace(string(out))) + } + defer exec.Command("umount", mountDir).Run() + + checked := make([]string, 0, len(requiredBootFiles)) + for _, name := range requiredBootFiles { + if _, err := os.Stat(filepath.Join(mountDir, name)); err != nil { + return nil, fmt.Errorf("boot partition %s is missing %s", partitionPath, name) + } + checked = append(checked, name) + } + return checked, nil +} + +func isBootPartitionType(partType string) bool { + normalized := strings.ToLower(strings.TrimSpace(partType)) + switch normalized { + case "b", "c", "0b", "0c", "ef", "ef00": + return true + } + return normalized == "c12a7328-f81f-11d2-ba4b-00a0c93ec93b" +} + +func isLinuxPartitionType(partType string) bool { + normalized := strings.ToLower(strings.TrimSpace(partType)) + switch normalized { + case "83", "8300": + return true + } + return normalized == "0fc63daf-8483-4772-8e79-3d69d8477de4" +} + +func firstNonEmpty(values ...string) string { + for _, value := range values { + if trimmed := strings.TrimSpace(value); trimmed != "" { + return trimmed + } + } + return "" +} diff --git a/cmd/metis/remote_flash_test.go b/cmd/metis/remote_flash_test.go new file mode 100644 index 0000000..3514470 --- /dev/null +++ b/cmd/metis/remote_flash_test.go @@ -0,0 +1,246 @@ +package main + +import ( + "os" + "path/filepath" + "strings" + "testing" +) + +func TestVerifyFlashDestinationForImageFilesAndBlockDevices(t *testing.T) { + t.Run("image file success and helpers", func(t *testing.T) { + tools := fakeCommandDir(t, map[string]string{ + "sfdisk": `cat <<'JSON' +{"partitiontable":{"partitions":[{"type":"ef00"},{"type":"8300"}]}} +JSON`, + }) + t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH")) + + result, err := verifyFlashDestination(filepath.Join(t.TempDir(), "titan-15.img")) + if err != nil { + t.Fatalf("verifyFlashDestination(image): %v", err) + } + if !result.Verified || result.VerificationKind != "image-file" { + t.Fatalf("unexpected image verification result: %#v", result) + } + if !isBootPartitionType("ef00") || !isLinuxPartitionType("8300") { + t.Fatal("expected helper partition type recognizers to accept GPT aliases") + } + if got := firstNonEmpty("", " writable ", "ignored"); got != "writable" { + t.Fatalf("firstNonEmpty = %q", got) + } + }) + + t.Run("image file missing writable partition", func(t *testing.T) { + tools := fakeCommandDir(t, map[string]string{ + "sfdisk": `cat <<'JSON' +{"partitiontable":{"partitions":[{"type":"ef00"}]}} +JSON`, + }) + t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH")) + + if _, err := verifyImageFileFlash(filepath.Join(t.TempDir(), "broken.img")); err == nil || !strings.Contains(err.Error(), "boot and writable partitions") { + t.Fatalf("expected missing partition error, got %v", err) + } + }) + + t.Run("image file missing boot partition", func(t *testing.T) { + tools := fakeCommandDir(t, map[string]string{ + "sfdisk": `cat <<'JSON' +{"partitiontable":{"partitions":[{"type":"8300"}]}} +JSON`, + }) + t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH")) + + if _, err := verifyImageFileFlash(filepath.Join(t.TempDir(), "broken-boot.img")); err == nil || !strings.Contains(err.Error(), "boot and writable partitions") { + t.Fatalf("expected missing boot partition error, got %v", err) + } + }) + + t.Run("block device success", func(t *testing.T) { + tools := fakeCommandDir(t, map[string]string{ + "lsblk": `cat <<'JSON' +{"blockdevices":[{"path":"/dev/sdk","children":[{"path":"/dev/sdk1","type":"part","fstype":"vfat","label":"system-boot"},{"path":"/dev/sdk2","type":"part","fstype":"ext4","label":"writable"}]}]} +JSON`, + "mount": `mount_dir="${4:-}" +mkdir -p "${mount_dir}" +printf 'ok' > "${mount_dir}/config.txt" +printf 'console=ttyAMA0' > "${mount_dir}/cmdline.txt" +printf 'boot script' > "${mount_dir}/boot.scr"`, + "umount": `exit 0`, + "blockdev": `exit 0`, + "partprobe": `exit 0`, + "udevadm": `exit 0`, + }) + t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH")) + + result, err := verifyFlashDestination("/dev/sdk") + if err != nil { + t.Fatalf("verifyFlashDestination(block): %v", err) + } + if !result.Verified || result.VerificationKind != "block-device" { + t.Fatalf("unexpected block-device verification result: %#v", result) + } + if result.BootPartition != "/dev/sdk1" || result.RootPartition != "/dev/sdk2" { + t.Fatalf("unexpected partition details: %#v", result) + } + if len(result.CheckedFiles) != 3 { + t.Fatalf("expected checked boot files, got %#v", result.CheckedFiles) + } + }) + + t.Run("block device missing boot file", func(t *testing.T) { + tools := fakeCommandDir(t, map[string]string{ + "lsblk": `cat <<'JSON' +{"blockdevices":[{"path":"/dev/sdk","children":[{"path":"/dev/sdk1","type":"part","fstype":"vfat","label":"system-boot"},{"path":"/dev/sdk2","type":"part","fstype":"ext4","label":"writable"}]}]} +JSON`, + "mount": `mount_dir="${4:-}" +mkdir -p "${mount_dir}" +printf 'ok' > "${mount_dir}/config.txt" +printf 'console=ttyAMA0' > "${mount_dir}/cmdline.txt"`, + "umount": `exit 0`, + "blockdev": `exit 0`, + "partprobe": `exit 0`, + "udevadm": `exit 0`, + }) + t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH")) + + if _, err := verifyBlockDeviceFlash("/dev/sdk"); err == nil || !strings.Contains(err.Error(), "missing boot.scr") { + t.Fatalf("expected missing boot file error, got %v", err) + } + }) +} + +func TestFlushFlashTargetAndClassifyHelpers(t *testing.T) { + logPath := filepath.Join(t.TempDir(), "commands.log") + tools := fakeCommandDir(t, map[string]string{ + "sync": `printf 'sync %s\n' "$*" >> "` + logPath + `"`, + "blockdev": `printf 'blockdev %s\n' "$*" >> "` + logPath + `"`, + "partprobe": `printf 'partprobe %s\n' "$*" >> "` + logPath + `"`, + "udevadm": `printf 'udevadm %s\n' "$*" >> "` + logPath + `"`, + }) + t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH")) + + flushFlashTarget("/tmp/test.img", "titan-15") + flushFlashTarget("/dev/sdk", "titan-15") + + logged, err := os.ReadFile(logPath) + if err != nil { + t.Fatalf("read flush log: %v", err) + } + text := string(logged) + if strings.Count(text, "sync ") != 2 { + t.Fatalf("expected sync for file and block targets, got %q", text) + } + for _, want := range []string{"blockdev --flushbufs /dev/sdk", "blockdev --rereadpt /dev/sdk", "partprobe /dev/sdk", "udevadm settle --timeout=10"} { + if !strings.Contains(text, want) { + t.Fatalf("expected %q in %q", want, text) + } + } + + if _, _, err := classifyFlashPartitions([]flashBlockDevice{{Path: "/dev/sdk1", Type: "part", FSType: "vfat", Label: "system-boot"}}); err == nil || !strings.Contains(err.Error(), "writable ext4") { + t.Fatalf("expected classify failure, got %v", err) + } +} + +func TestReadFlashPartitionTableAndBlockPartitionErrors(t *testing.T) { + t.Run("partition table decode error", func(t *testing.T) { + tools := fakeCommandDir(t, map[string]string{"sfdisk": `printf '{'`}) + t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH")) + if _, err := readFlashPartitionTable(filepath.Join(t.TempDir(), "broken.img")); err == nil || !strings.Contains(err.Error(), "decode partition table") { + t.Fatalf("expected partition table decode error, got %v", err) + } + }) + + t.Run("lsblk decode error", func(t *testing.T) { + tools := fakeCommandDir(t, map[string]string{"lsblk": `printf '{'`}) + t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH")) + if _, err := readBlockDevicePartitions("/dev/sdk"); err == nil || !strings.Contains(err.Error(), "decode lsblk output") { + t.Fatalf("expected lsblk decode error, got %v", err) + } + }) + + t.Run("lsblk finds explicit device match", func(t *testing.T) { + tools := fakeCommandDir(t, map[string]string{ + "lsblk": `cat <<'JSON' +{"blockdevices":[{"path":"/dev/other","children":[{"path":"/dev/other1","type":"part"}]},{"path":"/dev/sdk","children":[{"path":"/dev/sdk1","type":"part","fstype":"vfat","label":"system-boot"}]}]} +JSON`, + }) + t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH")) + parts, err := readBlockDevicePartitions("/dev/sdk") + if err != nil { + t.Fatalf("readBlockDevicePartitions match: %v", err) + } + if len(parts) != 1 || parts[0].Path != "/dev/sdk1" { + t.Fatalf("unexpected matching partitions: %#v", parts) + } + }) + + t.Run("lsblk missing device match", func(t *testing.T) { + tools := fakeCommandDir(t, map[string]string{ + "lsblk": `cat <<'JSON' +{"blockdevices":[{"path":"/dev/other","children":[]},{"path":"/dev/another","children":[]}]} +JSON`, + }) + t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH")) + if _, err := readBlockDevicePartitions("/dev/sdk"); err == nil || !strings.Contains(err.Error(), "did not report partitions") { + t.Fatalf("expected missing block-device match error, got %v", err) + } + }) +} + +func TestVerifyBootPartitionFilesMountFailure(t *testing.T) { + tools := fakeCommandDir(t, map[string]string{ + "mount": `printf 'mount failed' >&2; exit 9`, + }) + t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH")) + if _, err := verifyBootPartitionFiles("/dev/sdk1"); err == nil || !strings.Contains(err.Error(), "mount /dev/sdk1 read-only") { + t.Fatalf("expected mount failure, got %v", err) + } +} + +func TestRemoteFlashHelperAdditionalFailureBranches(t *testing.T) { + t.Run("partition table command failure", func(t *testing.T) { + tools := fakeCommandDir(t, map[string]string{"sfdisk": `printf 'bad disk' >&2; exit 7`}) + t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH")) + if _, err := readFlashPartitionTable(filepath.Join(t.TempDir(), "broken.img")); err == nil || !strings.Contains(err.Error(), "sfdisk -J") { + t.Fatalf("expected sfdisk failure, got %v", err) + } + }) + + t.Run("lsblk command failure", func(t *testing.T) { + tools := fakeCommandDir(t, map[string]string{"lsblk": `printf 'lsblk failed' >&2; exit 4`}) + t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH")) + if _, err := readBlockDevicePartitions("/dev/sdk"); err == nil || !strings.Contains(err.Error(), "lsblk /dev/sdk") { + t.Fatalf("expected lsblk command failure, got %v", err) + } + }) + + t.Run("boot partition classification failure", func(t *testing.T) { + if _, _, err := classifyFlashPartitions([]flashBlockDevice{{Path: "/dev/sdk2", Type: "part", FSType: "ext4", Label: "writable"}}); err == nil || !strings.Contains(err.Error(), "boot partition") { + t.Fatalf("expected missing boot partition error, got %v", err) + } + }) + + t.Run("firstNonEmpty returns empty when every value is blank", func(t *testing.T) { + if got := firstNonEmpty("", " "); got != "" { + t.Fatalf("expected empty firstNonEmpty result, got %q", got) + } + }) +} + +func TestReadBlockDevicePartitionsFallsBackToSingleReturnedDevice(t *testing.T) { + tools := fakeCommandDir(t, map[string]string{ + "lsblk": `cat <<'JSON' +{"blockdevices":[{"path":"/dev/mmcblk0","children":[{"path":"/dev/mmcblk0p1","type":"part","fstype":"vfat","label":"system-boot"}]}]} +JSON`, + }) + t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH")) + parts, err := readBlockDevicePartitions("/dev/sdk") + if err != nil { + t.Fatalf("readBlockDevicePartitions fallback: %v", err) + } + if len(parts) != 1 || parts[0].Path != "/dev/mmcblk0p1" { + t.Fatalf("unexpected fallback partitions: %#v", parts) + } +} diff --git a/pkg/service/app.go b/pkg/service/app.go index a7bed80..c5c6652 100644 --- a/pkg/service/app.go +++ b/pkg/service/app.go @@ -221,6 +221,31 @@ func (a *App) Build(node string) (*Job, error) { return job, nil } +// Flash starts a background flash-only workflow for the latest published node image. +func (a *App) Flash(node, host, device string) (*Job, error) { + if host == "" { + host = a.settings.DefaultFlashHost + } + if err := a.ensureReplacementReady(node); err != nil { + return nil, err + } + if active := a.activeJobForNode(node); active != nil { + return nil, &activeNodeJobError{Node: node, Kind: active.Kind, JobID: active.ID} + } + if summary, ok := a.artifacts()[node]; !ok || strings.TrimSpace(summary.Ref) == "" { + return nil, fmt.Errorf("no published image recorded for %s yet; run a build first", node) + } + if _, err := a.ensureDevice(host, device); err != nil { + return nil, err + } + job, err := a.reserveJob("flash", node, host, device) + if err != nil { + return nil, err + } + go a.runFlash(job) + return job, nil +} + // Replace starts a background build+flash workflow for a node. func (a *App) Replace(node, host, device string) (*Job, error) { if host == "" { diff --git a/pkg/service/app_helpers.go b/pkg/service/app_helpers.go index be6f8c3..38ed3cb 100644 --- a/pkg/service/app_helpers.go +++ b/pkg/service/app_helpers.go @@ -68,7 +68,7 @@ func (a *App) activeJobForNodeLocked(node string) *Job { continue } switch job.Kind { - case "build", "replace": + case "build", "flash", "replace": default: continue } diff --git a/pkg/service/coverage_more_test.go b/pkg/service/coverage_more_test.go index 92a8120..c0fc9e3 100644 --- a/pkg/service/coverage_more_test.go +++ b/pkg/service/coverage_more_test.go @@ -407,7 +407,7 @@ func TestServiceClusterAndRemotePodBranches(t *testing.T) { "metadata": map[string]any{"name": filepath.Base(r.URL.Path)}, "status": map[string]any{ "phase": "Succeeded", - "message": `{"dest_path":"/tmp/out.img"}`, + "message": `{"dest_path":"/tmp/out.img","verified":true,"verification_kind":"image-file","verification_summary":"Verified image layout at /tmp/out.img; boot and writable partitions are present."}`, "reason": "Completed", }, }) diff --git a/pkg/service/flash_workflow_test.go b/pkg/service/flash_workflow_test.go new file mode 100644 index 0000000..bd19a6e --- /dev/null +++ b/pkg/service/flash_workflow_test.go @@ -0,0 +1,182 @@ +package service + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" +) + +func TestFlashOnlyWorkflowRequiresPublishedArtifact(t *testing.T) { + kube := remoteWorkflowKubeServer(t, remoteKubeOptions{}) + installKubeFactory(t, kube) + app := remoteTestApp(t, nil) + + if _, err := app.Flash("titan-15", "titan-22", "/dev/sdz"); err == nil || !strings.Contains(err.Error(), "run a build first") { + t.Fatalf("expected missing artifact error, got %v", err) + } +} + +func TestFlashOnlyWorkflowCompletesAndRecordsVerification(t *testing.T) { + kube := remoteWorkflowKubeServer(t, remoteKubeOptions{flashMessage: `{"dest_path":"/dev/sdz","verified":true,"verification_kind":"block-device","verification_summary":"Verified /dev/sdz; system-boot and writable are present and the boot files look correct.","boot_partition":"/dev/sdz1","root_partition":"/dev/sdz2","boot_label":"system-boot","root_label":"writable","boot_fstype":"vfat","root_fstype":"ext4","checked_files":["config.txt","cmdline.txt","boot.scr"]}`}) + installKubeFactory(t, kube) + app := remoteTestApp(t, nil) + app.artifactStore["titan-15"] = ArtifactSummary{Node: "titan-15", Ref: "registry.example/metis/titan-15:latest"} + + job, err := app.Flash("titan-15", "titan-22", "/dev/sdz") + if err != nil { + t.Fatalf("Flash: %v", err) + } + waitForJobState(t, app, job.ID, JobDone) + + got := app.job(job.ID) + if got == nil || got.Kind != "flash" || got.Status != JobDone { + t.Fatalf("unexpected flash job: %#v", got) + } + if !strings.Contains(got.Message, "Move the card into titan-15") { + t.Fatalf("unexpected completion message: %#v", got) + } + if got.ProgressPct != 100 { + t.Fatalf("expected complete flash progress, got %#v", got) + } + + events := app.recentEvents(5) + if len(events) == 0 || events[0].Kind != "image.flash" { + t.Fatalf("expected flash event, got %#v", events) + } + if verified, ok := events[0].Details["verified"].(bool); !ok || !verified { + t.Fatalf("expected verified flash details, got %#v", events[0].Details) + } + if kind, ok := events[0].Details["verification_kind"].(string); !ok || kind != "block-device" { + t.Fatalf("expected block-device verification, got %#v", events[0].Details) + } + if bootPartition, ok := events[0].Details["boot_partition"].(string); !ok || bootPartition != "/dev/sdz1" { + t.Fatalf("expected boot partition details, got %#v", events[0].Details) + } +} + +func TestHandleFlashRouteStartsFlashOnlyJob(t *testing.T) { + kube := remoteWorkflowKubeServer(t, remoteKubeOptions{}) + installKubeFactory(t, kube) + app := remoteTestApp(t, nil) + app.artifactStore["titan-15"] = ArtifactSummary{Node: "titan-15", Ref: "registry.example/metis/titan-15:latest"} + handler := app.Handler() + + req := httptest.NewRequest(http.MethodPost, "/api/jobs/flash", strings.NewReader(`{"node":"titan-15","host":"titan-22","device":"/dev/sdz"}`)) + req.Header.Set("Content-Type", "application/json") + req.Header.Set("X-Auth-Request-User", "brad") + req.Header.Set("X-Auth-Request-Groups", "admin") + resp := httptest.NewRecorder() + handler.ServeHTTP(resp, req) + if resp.Code != http.StatusAccepted { + t.Fatalf("flash route response: %d %s", resp.Code, resp.Body.String()) + } + + var job Job + if err := json.Unmarshal(resp.Body.Bytes(), &job); err != nil { + t.Fatalf("decode flash job: %v", err) + } + waitForJobState(t, app, job.ID, JobDone) + if got := app.job(job.ID); got == nil || got.Kind != "flash" { + t.Fatalf("expected flash job, got %#v", got) + } +} + +func TestFlashAndReplaceDefaultHostBranches(t *testing.T) { + kube := remoteWorkflowKubeServer(t, remoteKubeOptions{}) + harbor := fakeHarborServer(t, true) + installKubeFactory(t, kube) + app := remoteTestApp(t, harbor) + app.artifactStore["titan-15"] = ArtifactSummary{Node: "titan-15", Ref: "registry.example/metis/titan-15:latest"} + + flashJob, err := app.Flash("titan-15", "", hostTmpDevicePath) + if err != nil { + t.Fatalf("Flash default host: %v", err) + } + waitForJobState(t, app, flashJob.ID, JobDone) + if got := app.job(flashJob.ID); got == nil || got.Host != "titan-22" || !strings.Contains(got.Message, "host /tmp") { + t.Fatalf("unexpected flash default host job: %#v", got) + } + + replaceJob, err := app.Replace("titan-15", "", hostTmpDevicePath) + if err != nil { + t.Fatalf("Replace default host: %v", err) + } + waitForJobState(t, app, replaceJob.ID, JobDone) + if got := app.job(replaceJob.ID); got == nil || got.Host != "titan-22" { + t.Fatalf("unexpected replace default host job: %#v", got) + } +} + +func TestFlashRejectsDuplicateActiveNodeJob(t *testing.T) { + kube := remoteWorkflowKubeServer(t, remoteKubeOptions{}) + installKubeFactory(t, kube) + app := remoteTestApp(t, nil) + app.artifactStore["titan-15"] = ArtifactSummary{Node: "titan-15", Ref: "registry.example/metis/titan-15:latest"} + app.newJob("flash", "titan-15", "titan-22", "/dev/sdz") + + if _, err := app.Flash("titan-15", "titan-22", "/dev/sdz"); err == nil || !strings.Contains(err.Error(), "already has an active") { + t.Fatalf("expected flash conflict, got %v", err) + } +} + +func TestFlashArtifactUpdatesProgressWhenVerificationSummaryIsMissing(t *testing.T) { + kube := remoteWorkflowKubeServer(t, remoteKubeOptions{flashMessage: `{"dest_path":"/dev/sdz","size_bytes":4096,"verified":true}`}) + installKubeFactory(t, kube) + app := remoteTestApp(t, nil) + job := app.newJob("flash", "titan-15", "titan-22", "/dev/sdz") + + result, err := app.flashArtifact(job.ID, "registry.example/metis/titan-15") + if err != nil { + t.Fatalf("flashArtifact: %v", err) + } + if result.SizeBytes != 4096 { + t.Fatalf("unexpected flash result: %#v", result) + } + got := app.job(job.ID) + if got == nil || got.Written != 4096 || got.Total != 4096 || !strings.Contains(got.Message, "Verified the latest image write") { + t.Fatalf("unexpected flash artifact state: %#v", got) + } +} + +func TestFlashOnlyWorkflowFailsVerification(t *testing.T) { + kube := remoteWorkflowKubeServer(t, remoteKubeOptions{flashMessage: `{"dest_path":"/dev/sdz","verified":false}`}) + installKubeFactory(t, kube) + app := remoteTestApp(t, nil) + app.artifactStore["titan-15"] = ArtifactSummary{Node: "titan-15", Ref: "registry.example/metis/titan-15:latest"} + + job, err := app.Flash("titan-15", "titan-22", "/dev/sdz") + if err != nil { + t.Fatalf("Flash: %v", err) + } + deadline := time.Now().Add(5 * time.Second) + for time.Now().Before(deadline) { + if got := app.job(job.ID); got != nil && got.Status == JobError { + if !strings.Contains(got.Error, "verification did not succeed") { + t.Fatalf("unexpected flash verification error: %#v", got) + } + return + } + time.Sleep(10 * time.Millisecond) + } + t.Fatalf("flash job %s never failed verification: %#v", job.ID, app.job(job.ID)) +} + +func TestBuildFlashAndReplaceRejectInvalidInputs(t *testing.T) { + kube := remoteWorkflowKubeServer(t, remoteKubeOptions{}) + installKubeFactory(t, kube) + app := remoteTestApp(t, nil) + app.artifactStore["titan-15"] = ArtifactSummary{Node: "titan-15", Ref: "registry.example/metis/titan-15:latest"} + + if _, err := app.Build("missing-node"); err == nil || !strings.Contains(err.Error(), "missing-node") { + t.Fatalf("expected build inventory error, got %v", err) + } + if _, err := app.Flash("titan-15", "titan-22", "/dev/sda"); err == nil || !strings.Contains(err.Error(), "not a current flash candidate") { + t.Fatalf("expected flash device validation error, got %v", err) + } + if _, err := app.Replace("titan-15", "titan-22", "/dev/sda"); err == nil || !strings.Contains(err.Error(), "not a current flash candidate") { + t.Fatalf("expected replace device validation error, got %v", err) + } +} diff --git a/pkg/service/remote.go b/pkg/service/remote.go index b7399d9..a1b29a0 100644 --- a/pkg/service/remote.go +++ b/pkg/service/remote.go @@ -74,7 +74,7 @@ func (a *App) RefreshDevices(host string) ([]Device, error) { } func (a *App) runBuild(job *Job, flash bool) { - nodeSpec, class, err := a.inventory.FindNode(job.Node) + _, class, err := a.inventory.FindNode(job.Node) if err != nil { a.failJob(job.ID, err) a.metrics.RecordBuild(job.Node, "error") @@ -158,18 +158,42 @@ func (a *App) runBuild(job *Job, flash bool) { return } - a.setJob(job.ID, func(j *Job) { - j.Stage = "preflight" - j.StageStartedAt = time.Now().UTC() - j.Message = fmt.Sprintf("Validating %s and preparing the latest Harbor artifact for %s", prettyDeviceTarget(j.Device), j.Host) - j.ProgressPct = 78 - j.Artifact = artifactRef + ":latest" - }) - if _, err := a.ensureDevice(job.Host, job.Device); err != nil { + result, err := a.runFlashSequence(job, artifactRef) + if err != nil { a.failJob(job.ID, err) a.metrics.RecordFlash(job.Node, job.Host, "error") return } + a.metrics.RecordFlash(job.Node, job.Host, "ok") + a.appendFlashEvent(job, artifactRef, result) + a.completeFlashJob(job.ID, artifactRef, result) +} + +func (a *App) runFlash(job *Job) { + artifactRef := a.artifactRepo(job.Node) + result, err := a.runFlashSequence(job, artifactRef) + if err != nil { + a.failJob(job.ID, err) + a.metrics.RecordFlash(job.Node, job.Host, "error") + return + } + a.metrics.RecordFlash(job.Node, job.Host, "ok") + a.appendFlashEvent(job, artifactRef, result) + a.completeFlashJob(job.ID, artifactRef, result) +} + +func (a *App) runFlashSequence(job *Job, artifactRef string) (RemoteFlashResult, error) { + a.setJob(job.ID, func(j *Job) { + j.Status = JobRunning + j.Stage = "preflight" + j.StageStartedAt = time.Now().UTC() + j.Message = fmt.Sprintf("Validating %s and preparing the latest Harbor artifact for %s", prettyDeviceTarget(j.Device), j.Host) + j.ProgressPct = 80 + j.Artifact = artifactRef + ":latest" + }) + if _, err := a.ensureDevice(job.Host, job.Device); err != nil { + return RemoteFlashResult{}, err + } if !strings.HasPrefix(job.Device, "hosttmp://") { if err := deleteNodeObject(job.Node); err != nil { a.appendEvent(Event{ @@ -180,66 +204,111 @@ func (a *App) runBuild(job *Job, flash bool) { }) } } - if err := a.flashArtifact(job.ID, artifactRef); err != nil { - a.failJob(job.ID, err) - a.metrics.RecordFlash(job.Node, job.Host, "error") - return + return a.flashArtifact(job.ID, artifactRef) +} + +func (a *App) appendFlashEvent(job *Job, artifactRef string, result RemoteFlashResult) { + summary := fmt.Sprintf("Flashed and verified %s latest image on %s", job.Node, job.Host) + if strings.HasPrefix(job.Device, "hosttmp://") { + summary = fmt.Sprintf("Verified %s latest image on %s host scratch", job.Node, job.Host) + } + details := map[string]any{ + "node": job.Node, + "device": job.Device, + "host": job.Host, + "artifact": artifactRef + ":latest", + "dest_path": result.DestPath, + "verified": result.Verified, + "verification_kind": result.VerificationKind, + } + if strings.TrimSpace(result.BootPartition) != "" { + details["boot_partition"] = result.BootPartition + } + if strings.TrimSpace(result.RootPartition) != "" { + details["root_partition"] = result.RootPartition + } + if len(result.CheckedFiles) > 0 { + details["checked_files"] = append([]string{}, result.CheckedFiles...) } - a.metrics.RecordFlash(job.Node, job.Host, "ok") a.appendEvent(Event{ Time: time.Now().UTC(), Kind: "image.flash", - Summary: fmt.Sprintf("Flashed %s latest image on %s", job.Node, job.Host), - Details: map[string]any{"node": job.Node, "device": job.Device, "host": job.Host, "artifact": artifactRef + ":latest"}, + Summary: summary, + Details: details, }) - a.completeJob(job.ID, func(j *Job) { - j.Stage = "complete" - if strings.HasPrefix(j.Device, "hosttmp://") { - j.Message = fmt.Sprintf("Test flash complete on %s host /tmp.", j.Host) - } else { - j.Message = fmt.Sprintf("Flash complete on %s. Move the card into %s and power-cycle it.", j.Host, j.Node) - } - j.ProgressPct = 100 - j.Artifact = artifactRef + ":latest" - }) - - _ = nodeSpec } -func (a *App) flashArtifact(jobID, artifactRef string) error { +func (a *App) completeFlashJob(jobID, artifactRef string, result RemoteFlashResult) { + a.completeJob(jobID, func(j *Job) { + j.Stage = "complete" + j.ProgressPct = 100 + j.Artifact = artifactRef + ":latest" + if strings.HasPrefix(j.Device, "hosttmp://") { + if strings.TrimSpace(result.VerificationSummary) != "" { + j.Message = fmt.Sprintf("%s Test flash complete on %s host /tmp.", strings.TrimRight(result.VerificationSummary, "."), j.Host) + return + } + j.Message = fmt.Sprintf("Verified test flash on %s host /tmp.", j.Host) + return + } + if strings.TrimSpace(result.VerificationSummary) != "" { + j.Message = fmt.Sprintf("%s Move the card into %s and power-cycle it.", strings.TrimRight(result.VerificationSummary, "."), j.Node) + return + } + j.Message = fmt.Sprintf("Flash verified on %s. Move the card into %s and power-cycle it.", j.Host, j.Node) + }) +} + +func (a *App) flashArtifact(jobID, artifactRef string) (RemoteFlashResult, error) { + job := a.job(jobID) + if job == nil { + return RemoteFlashResult{}, fmt.Errorf("job %s no longer exists", jobID) + } nodes := clusterNodes() nodeMap := map[string]clusterNode{} for _, node := range nodes { nodeMap[node.Name] = node } - target, ok := nodeMap[a.job(jobID).Host] + target, ok := nodeMap[job.Host] if !ok { - return fmt.Errorf("flash host %s is not a current cluster node", a.job(jobID).Host) + return RemoteFlashResult{}, fmt.Errorf("flash host %s is not a current cluster node", job.Host) } image := a.podImageForArch(target.Arch) if image == "" { - return fmt.Errorf("no runner image configured for arch %s", target.Arch) + return RemoteFlashResult{}, fmt.Errorf("no runner image configured for arch %s", target.Arch) } a.setJob(jobID, func(j *Job) { - j.Stage = "flash" + j.Stage = "flash_pull" j.StageStartedAt = time.Now().UTC() - j.Message = fmt.Sprintf("Pulling %s and writing it on %s", artifactRef+":latest", j.Host) + j.Message = fmt.Sprintf("Pulling %s and preparing it for %s", artifactRef+":latest", prettyDeviceTarget(j.Device)) j.ProgressPct = 84 }) podName := fmt.Sprintf("metis-flash-%d", time.Now().UTC().UnixNano()) - logs, err := a.runRemotePod(jobID, podName, a.remoteFlashPodSpec(podName, target.Name, image, a.job(jobID).Node, a.job(jobID).Device, artifactRef)) + logs, err := a.runRemotePod(jobID, podName, a.remoteFlashPodSpec(podName, target.Name, image, job.Node, job.Device, artifactRef)) if err != nil { - return err + return RemoteFlashResult{}, err } - var payload map[string]any - if err := json.Unmarshal([]byte(strings.TrimSpace(logs)), &payload); err == nil { - a.setJob(jobID, func(j *Job) { - if dest, ok := payload["dest_path"].(string); ok && dest != "" { - j.Message = fmt.Sprintf("Wrote latest artifact to %s", dest) - } - }) + var result RemoteFlashResult + if err := json.Unmarshal([]byte(strings.TrimSpace(logs)), &result); err != nil { + return RemoteFlashResult{}, fmt.Errorf("decode remote flash output: %w: %s", err, strings.TrimSpace(logs)) } - return nil + if !result.Verified { + return RemoteFlashResult{}, fmt.Errorf("flash verification did not succeed for %s on %s", prettyDeviceTarget(job.Device), job.Host) + } + a.setJob(jobID, func(j *Job) { + if result.SizeBytes > 0 { + j.Written = result.SizeBytes + j.Total = result.SizeBytes + } + if strings.TrimSpace(result.VerificationSummary) != "" { + j.Message = result.VerificationSummary + return + } + if strings.TrimSpace(result.DestPath) != "" { + j.Message = fmt.Sprintf("Verified the latest image write at %s", result.DestPath) + } + }) + return result, nil } func (a *App) heartbeatRemoteJob(jobID string) { @@ -255,8 +324,8 @@ func (a *App) heartbeatRemoteJob(jobID string) { stageStart = j.StartedAt } elapsed := time.Since(stageStart) - switch j.Stage { - case "build": + switch { + case j.Stage == "build": progress, message := buildStageHeartbeat(j.Node, j.Builder, elapsed) if progress > j.ProgressPct { j.ProgressPct = progress @@ -264,14 +333,14 @@ func (a *App) heartbeatRemoteJob(jobID string) { if strings.TrimSpace(message) != "" { j.Message = message } - case "preflight": + case j.Stage == "preflight": if j.ProgressPct < 80 { j.ProgressPct = 80 } j.Message = fmt.Sprintf("Validating %s on %s and resolving the latest Harbor artifact", prettyDeviceTarget(j.Device), j.Host) - case "flash": - if j.Total > 0 && j.Written > 0 { - actual := 88 + (float64(j.Written)/float64(j.Total))*10 + case isFlashStage(j.Stage): + if j.Stage == "flash_write" && j.Total > 0 && j.Written > 0 { + actual := 92 + (float64(j.Written)/float64(j.Total))*6 if actual > 98 { actual = 98 } @@ -281,7 +350,7 @@ func (a *App) heartbeatRemoteJob(jobID string) { j.Message = fmt.Sprintf("Writing %s of %s on %s", humanBytes(j.Written), humanBytes(j.Total), j.Host) return } - progress, message := flashStageHeartbeat(j.Host, j.Artifact, elapsed) + progress, message := flashStagePhaseHeartbeat(j.Stage, j.Host, j.Artifact, elapsed) if progress > j.ProgressPct { j.ProgressPct = progress } diff --git a/pkg/service/remote_error_test.go b/pkg/service/remote_error_test.go index f3af837..bef73a9 100644 --- a/pkg/service/remote_error_test.go +++ b/pkg/service/remote_error_test.go @@ -29,7 +29,7 @@ func TestRemoteWorkflowErrorBranches(t *testing.T) { } job = app.newJob("flash", "titan-15", "titan-22", "/dev/sdz") - if err := app.flashArtifact(job.ID, "registry.example/metis/titan-15"); err == nil { + if _, err := app.flashArtifact(job.ID, "registry.example/metis/titan-15"); err == nil { t.Fatal("expected flashArtifact error") } @@ -260,7 +260,7 @@ func TestFlashArtifactAndHeartbeatBranches(t *testing.T) { app := remoteTestApp(t, nil) job := app.newJob("replace", "titan-15", "missing-host", "/dev/sdz") - if err := app.flashArtifact(job.ID, "registry.example/metis/titan-15"); err == nil || !strings.Contains(err.Error(), "not a current cluster node") { + if _, err := app.flashArtifact(job.ID, "registry.example/metis/titan-15"); err == nil || !strings.Contains(err.Error(), "not a current cluster node") { t.Fatalf("expected missing host flashArtifact error, got %v", err) } @@ -283,7 +283,7 @@ func TestFlashArtifactAndHeartbeatBranches(t *testing.T) { } app.setJob(job.ID, func(j *Job) { - j.Stage = "flash" + j.Stage = "flash_write" j.ProgressPct = 80 j.Written = 120 j.Total = 100 @@ -341,7 +341,7 @@ func remoteWorkflowKubeServer(t *testing.T, opts remoteKubeOptions) *httptest.Se buildPhase := defaultString(opts.buildPhase, "Succeeded") buildMessage := defaultString(opts.buildMessage, `{"local_path":"/workspace/build/titan-15.img.xz","compressed":true,"size_bytes":1234,"build_tag":"build-1"}`) flashPhase := defaultString(opts.flashPhase, "Succeeded") - flashMessage := defaultString(opts.flashMessage, `{"dest_path":"/var/tmp/metis-flash-test/titan-15.img"}`) + flashMessage := defaultString(opts.flashMessage, `{"dest_path":"/var/tmp/metis-flash-test/titan-15.img","verified":true,"verification_kind":"image-file","verification_summary":"Verified image layout at /var/tmp/metis-flash-test/titan-15.img; boot and writable partitions are present."}`) nodes := opts.nodes if nodes == nil { nodes = []map[string]any{ diff --git a/pkg/service/remote_helpers.go b/pkg/service/remote_helpers.go index cea8d5a..f520f52 100644 --- a/pkg/service/remote_helpers.go +++ b/pkg/service/remote_helpers.go @@ -28,17 +28,36 @@ func buildStageHeartbeat(node, builder string, elapsed time.Duration) (float64, } func flashStageHeartbeat(host, artifact string, elapsed time.Duration) (float64, string) { + return flashStagePhaseHeartbeat("flash", host, artifact, elapsed) +} + +func flashStagePhaseHeartbeat(stage, host, artifact string, elapsed time.Duration) (float64, string) { seconds := elapsed.Seconds() - switch { - case seconds < 10: - return ramp(seconds, 0, 10, 84, 88), fmt.Sprintf("Pulling %s from Harbor on %s", artifact, host) - case seconds < 45: - return ramp(seconds, 10, 45, 88, 96), fmt.Sprintf("Writing the latest image to the selected target on %s", host) + switch strings.TrimSpace(stage) { + case "flash_pull": + return math.Min(88, ramp(seconds, 0, 12, 84, 88)), fmt.Sprintf("Pulling %s from Harbor on %s", artifact, host) + case "flash_prepare", "flash_unpack": + return math.Min(92, ramp(seconds, 0, 30, 88, 92)), fmt.Sprintf("Preparing the latest %s artifact for writing on %s", artifact, host) + case "flash_flush": + return math.Min(99, ramp(seconds, 0, 30, 98, 99)), fmt.Sprintf("Flushing the finished image write on %s", host) + case "flash_verify": + return math.Min(99.8, ramp(seconds, 0, 30, 99, 99.8)), fmt.Sprintf("Verifying the flashed recovery media on %s", host) default: - return math.Min(98, ramp(seconds, 45, 120, 96, 98)), fmt.Sprintf("Flushing buffers and finishing the write on %s", host) + switch { + case seconds < 10: + return ramp(seconds, 0, 10, 84, 88), fmt.Sprintf("Pulling %s from Harbor on %s", artifact, host) + case seconds < 45: + return ramp(seconds, 10, 45, 88, 96), fmt.Sprintf("Writing the latest image to the selected target on %s", host) + default: + return math.Min(98, ramp(seconds, 45, 120, 96, 98)), fmt.Sprintf("Flushing buffers and finishing the write on %s", host) + } } } +func isFlashStage(stage string) bool { + return strings.HasPrefix(strings.TrimSpace(stage), "flash") +} + func prettyDeviceTarget(path string) string { switch { case strings.HasPrefix(path, "hosttmp://"): diff --git a/pkg/service/remote_helpers_test.go b/pkg/service/remote_helpers_test.go index 20149d4..f237d68 100644 --- a/pkg/service/remote_helpers_test.go +++ b/pkg/service/remote_helpers_test.go @@ -384,3 +384,31 @@ func TestSelectBuilderHostTieBreaksByName(t *testing.T) { t.Fatalf("expected alphabetical tie-breaker, got %s", node.Name) } } + +func TestFlashPhaseHeartbeatAndManagedPathHelpers(t *testing.T) { + cases := []struct { + stage string + minimum float64 + phrase string + }{ + {stage: "flash_pull", minimum: 84, phrase: "Pulling"}, + {stage: "flash_prepare", minimum: 88, phrase: "Preparing"}, + {stage: "flash_unpack", minimum: 90, phrase: "Preparing"}, + {stage: "flash_flush", minimum: 98, phrase: "Flushing"}, + {stage: "flash_verify", minimum: 99, phrase: "Verifying"}, + {stage: "flash", minimum: 90, phrase: "Writing"}, + } + for _, tc := range cases { + got, msg := flashStagePhaseHeartbeat(tc.stage, "titan-22", "registry.example/metis/titan-15:latest", 20*time.Second) + if got < tc.minimum || !strings.Contains(msg, tc.phrase) { + t.Fatalf("flashStagePhaseHeartbeat(%s) = %v %q", tc.stage, got, msg) + } + } + + if !managedPathsContain("/var/log/pods_/var/tmp", "/var/tmp") { + t.Fatal("expected managedPathsContain to match child paths") + } + if managedPathsContain("/var/log/pods", "/home/atlas") { + t.Fatal("managedPathsContain should reject unrelated paths") + } +} diff --git a/pkg/service/remote_status.go b/pkg/service/remote_status.go index 91aa742..f7eae73 100644 --- a/pkg/service/remote_status.go +++ b/pkg/service/remote_status.go @@ -19,6 +19,24 @@ type RemoteProgressUpdate struct { TotalBytes int64 `json:"total_bytes,omitempty"` } +// RemoteFlashResult captures the verified outcome of a remote flash worker run. +type RemoteFlashResult struct { + Node string `json:"node,omitempty"` + Device string `json:"device,omitempty"` + DestPath string `json:"dest_path,omitempty"` + SizeBytes int64 `json:"size_bytes,omitempty"` + Verified bool `json:"verified,omitempty"` + VerificationKind string `json:"verification_kind,omitempty"` + VerificationSummary string `json:"verification_summary,omitempty"` + BootPartition string `json:"boot_partition,omitempty"` + RootPartition string `json:"root_partition,omitempty"` + BootLabel string `json:"boot_label,omitempty"` + RootLabel string `json:"root_label,omitempty"` + BootFSType string `json:"boot_fstype,omitempty"` + RootFSType string `json:"root_fstype,omitempty"` + CheckedFiles []string `json:"checked_files,omitempty"` +} + // ProgressLogLine formats a progress update for remote worker stdout. func ProgressLogLine(update RemoteProgressUpdate) string { data, err := json.Marshal(update) diff --git a/pkg/service/server.go b/pkg/service/server.go index 6f603f6..7a29cfa 100644 --- a/pkg/service/server.go +++ b/pkg/service/server.go @@ -30,6 +30,7 @@ func (a *App) Handler() http.Handler { mux.HandleFunc("/api/state", a.withUIAuth(a.handleState)) mux.HandleFunc("/api/devices", a.withUIAuth(a.handleDevices)) mux.HandleFunc("/api/jobs/build", a.withUIAuth(a.handleBuild)) + mux.HandleFunc("/api/jobs/flash", a.withUIAuth(a.handleFlash)) mux.HandleFunc("/api/jobs/replace", a.withUIAuth(a.handleReplace)) mux.HandleFunc("/api/sentinel/watch", a.withUIAuth(a.handleWatch)) mux.HandleFunc("/", a.withUIAuth(a.handleIndex)) @@ -106,6 +107,23 @@ func (a *App) handleBuild(w http.ResponseWriter, r *http.Request) { writeJSON(w, http.StatusAccepted, job) } +func (a *App) handleFlash(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + values := requestValues(r) + node := values["node"] + host := values["host"] + device := values["device"] + job, err := a.Flash(node, host, device) + if err != nil { + http.Error(w, err.Error(), statusForJobError(err)) + return + } + writeJSON(w, http.StatusAccepted, job) +} + func (a *App) handleReplace(w http.ResponseWriter, r *http.Request) { if r.Method != http.MethodPost { http.Error(w, "method not allowed", http.StatusMethodNotAllowed) diff --git a/pkg/service/templates/metis.html b/pkg/service/templates/metis.html index 9e9049c..3734d68 100644 --- a/pkg/service/templates/metis.html +++ b/pkg/service/templates/metis.html @@ -174,7 +174,7 @@ } .actions{ display:grid; - grid-template-columns:repeat(3,minmax(0,1fr)); + grid-template-columns:repeat(2,minmax(0,1fr)); gap:.7rem; margin-top:.9rem; } @@ -277,7 +277,7 @@

Replacement Run

-

This UI is meant for the one-shot recovery path: build the node image, verify the card on the flash host, then write it and hand off only the physical swap.

+

Use the mode that matches the recovery step you are in: build a fresh image, flash the latest published image, or run the full build-and-flash path in one shot.

@@ -395,6 +396,22 @@ return seconds + 's'; } + function prettyLabel(value){ + return String(value || '') + .replaceAll('_', ' ') + .replace(/\b\w/g, (match)=>match.toUpperCase()); + } + + function prettyJobKind(kind){ + switch(kind){ + case 'build': return 'Build'; + case 'flash': return 'Flash Latest'; + case 'replace': return 'Build + Flash'; + case 'idle': return 'Idle'; + default: return prettyLabel(kind); + } + } + function banner(kind, title, text){ bannerEl.className = 'banner ' + kind; bannerTitleEl.textContent = title; @@ -445,10 +462,10 @@ const wrap = document.createElement('div'); wrap.className = 'item'; const statusClass = job.status === 'error' ? 'error' : (job.status === 'done' ? 'done' : (job.status === 'running' ? 'running' : '')); - const title = job.kind.toUpperCase() + (job.node ? ' · ' + job.node : ''); + const title = prettyJobKind(job.kind) + (job.node ? ' · ' + job.node : ''); const started = fmtTime(job.started_at) + (job.device ? ' · ' + job.device : '') + (job.host ? ' · ' + job.host : ''); const timingBits = []; - if(job.stage){ timingBits.push('stage: ' + job.stage); } + if(job.stage){ timingBits.push('stage: ' + prettyLabel(job.stage)); } const stageDuration = fmtDuration(job.stage_started_at || job.started_at, job.finished_at); if(stageDuration){ timingBits.push((job.status === 'running' ? 'stage elapsed ' : 'stage duration ') + stageDuration); @@ -538,7 +555,7 @@ } const selectedHost = hostSelect.value || state.default_flash_host; - hostNoteEl.textContent = 'Metis will inspect media and run the flash writer on ' + selectedHost + ' through a short-lived in-cluster worker. ' + state.default_flash_host + ' remains the default flash host.'; + hostNoteEl.textContent = 'Metis will inspect media and run the flash writer/verifier on ' + selectedHost + ' through a short-lived in-cluster worker. ' + state.default_flash_host + ' remains the default flash host.'; if(state.device_error){ deviceNoteEl.textContent = state.device_error; @@ -549,11 +566,13 @@ } const artifact = (state.artifacts || {})[nodeSelect.value]; - artifactNoteEl.textContent = artifact && artifact.ref - ? 'Latest published image: ' + artifact.ref + ' (Metis keeps the newest 3 builds in Harbor).' - : 'Successful build-only runs publish :latest into Harbor and keep the newest 3 builds per node.'; + const hasArtifact = !!(artifact && artifact.ref); + artifactNoteEl.textContent = hasArtifact + ? 'Latest published image: ' + artifact.ref + (artifact.build_tag ? ' · build ' + artifact.build_tag : '') + '. Flash latest writes this image directly; build and flash rebuilds first.' + : 'Build image only publishes :latest into Harbor. Flash latest stays disabled until Metis has a recorded latest image.'; document.getElementById('build-only').disabled = busy || !nodeSelect.value; + document.getElementById('flash-only').disabled = busy || !nodeSelect.value || !deviceSelect.value || !!state.device_error || !hasArtifact; document.getElementById('refresh-devices').disabled = busy; document.getElementById('replace-run').disabled = busy || !nodeSelect.value || !deviceSelect.value || !!state.device_error; document.getElementById('sentinel-watch').disabled = busy; @@ -654,6 +673,24 @@ }); }); + document.getElementById('flash-only').addEventListener('click', async ()=>{ + if(!requireValue(nodeSelect.value, 'Choose the target node whose latest published image should be flashed.')){ + return; + } + if(!requireValue(deviceSelect.value, 'Choose removable media before starting a flash-only run.')){ + return; + } + if(state.device_error){ + banner('error', 'Flash host unavailable', state.device_error); + return; + } + await runAction('Starting flash latest', 'Queueing a flash-only run from the latest published Harbor image.', async ()=>{ + await post('/api/jobs/flash', {node: nodeSelect.value, host: hostSelect.value, device: deviceSelect.value}); + await refreshState({silent:true}); + banner('success', 'Flash-only workflow queued', 'Metis is flashing the latest published image for ' + nodeSelect.value + ' onto ' + deviceSelect.value + ' and will verify it before finishing.'); + }); + }); + document.getElementById('replace-run').addEventListener('click', async ()=>{ if(!requireValue(nodeSelect.value, 'Choose the target node whose SD card image should be built and flashed.')){ return; @@ -668,7 +705,7 @@ await runAction('Starting build and flash', 'Queueing the full replacement workflow now.', async ()=>{ await post('/api/jobs/replace', {node: nodeSelect.value, host: hostSelect.value, device: deviceSelect.value}); await refreshState({silent:true}); - banner('success', 'Replacement workflow queued', 'Metis is building the image for ' + nodeSelect.value + ' and will flash ' + deviceSelect.value + '.'); + banner('success', 'Replacement workflow queued', 'Metis is rebuilding the image for ' + nodeSelect.value + ', flashing ' + deviceSelect.value + ', and verifying the media before finishing.'); }); }); diff --git a/pkg/service/workflow_test.go b/pkg/service/workflow_test.go index 1459688..17ee72a 100644 --- a/pkg/service/workflow_test.go +++ b/pkg/service/workflow_test.go @@ -185,7 +185,7 @@ func fakeKubeServer(t *testing.T) *httptest.Server { case strings.Contains(podName, "build"): message = `{"local_path":"/workspace/build/titan-15.img.xz","compressed":true,"size_bytes":1234,"build_tag":"build-1"}` case strings.Contains(podName, "flash"): - message = `{"dest_path":"/var/tmp/metis-flash-test/titan-15.img"}` + message = `{"dest_path":"/var/tmp/metis-flash-test/titan-15.img","verified":true,"verification_kind":"image-file","verification_summary":"Verified image layout at /var/tmp/metis-flash-test/titan-15.img; boot and writable partitions are present."}` } _ = json.NewEncoder(w).Encode(map[string]any{ "metadata": map[string]any{"name": podName}, diff --git a/scripts/quality_gate.sh b/scripts/quality_gate.sh index 6fcec2e..1169534 100755 --- a/scripts/quality_gate.sh +++ b/scripts/quality_gate.sh @@ -57,4 +57,4 @@ if [ "${test_rc}" -ne 0 ]; then fi cd "${repo_root}/testing" -METIS_USE_EXISTING_COVERAGE=1 go test -v ./... +METIS_USE_EXISTING_COVERAGE=1 go test -count=1 -v ./... diff --git a/testing/coverage_policy.json b/testing/coverage_policy.json index 804c1d8..246fc9d 100644 --- a/testing/coverage_policy.json +++ b/testing/coverage_policy.json @@ -8,6 +8,7 @@ "metis/cmd/metis/inject_cmd.go": 95, "metis/cmd/metis/main.go": 95, "metis/cmd/metis/remote_cmd.go": 95, + "metis/cmd/metis/remote_flash.go": 95, "metis/cmd/metis/serve_cmd.go": 95, "metis/pkg/config/config.go": 95, "metis/pkg/facts/aggregate.go": 95,