diff --git a/cmd/metis/remote_cmd.go b/cmd/metis/remote_cmd.go
index f1f33b8..372ba57 100644
--- a/cmd/metis/remote_cmd.go
+++ b/cmd/metis/remote_cmd.go
@@ -163,76 +163,6 @@ func remoteBuildCmd(args []string) {
writeStructuredResult(summary)
}
-func remoteFlashCmd(args []string) {
- fs := flag.NewFlagSet("remote-flash", flag.ExitOnError)
- node := fs.String("node", "", "target node")
- device := fs.String("device", "", "target device path or test sink")
- artifactRef := fs.String("artifact-ref", "", "harbor artifact ref without tag")
- workDir := fs.String("work-dir", filepath.Join(os.TempDir(), "metis-flash"), "working directory")
- harborRegistry := fs.String("harbor-registry", getenvOr("METIS_HARBOR_REGISTRY", "registry.bstein.dev"), "harbor registry host")
- harborUsername := fs.String("harbor-username", getenvOr("METIS_HARBOR_USERNAME", ""), "harbor username")
- harborPassword := fs.String("harbor-password", getenvOr("METIS_HARBOR_PASSWORD", ""), "harbor password")
- hostTmpDir := fs.String("host-tmp-dir", "/host-tmp", "mounted host tmp dir for test writes")
- fs.Parse(args)
- if *node == "" || *device == "" || *artifactRef == "" {
- fatalf("--node, --device, and --artifact-ref are required")
- }
-
- if err := os.MkdirAll(*workDir, 0o755); err != nil {
- fatalf("mkdir workdir: %v", err)
- }
- emitStageProgress("flash", 84, fmt.Sprintf("Pulling the latest Harbor artifact for %s", *node))
- if err := orasLogin(*harborRegistry, *harborUsername, *harborPassword); err != nil {
- fatalf("oras login: %v", err)
- }
- if err := orasPull(fmt.Sprintf("%s:latest", *artifactRef), *workDir); err != nil {
- fatalf("oras pull: %v", err)
- }
- emitStageProgress("flash", 88, fmt.Sprintf("Preparing the downloaded image for %s", *node))
- imagePath, compressed, err := resolvePulledArtifact(*workDir)
- if err != nil {
- fatalf("resolve artifact: %v", err)
- }
- rawImage := imagePath
- if compressed {
- emitStageProgress("flash", 90, fmt.Sprintf("Decompressing the image for %s before writing", *node))
- rawImage = filepath.Join(*workDir, fmt.Sprintf("%s.img", *node))
- cmd := exec.Command("sh", "-lc", fmt.Sprintf("xz -dc '%s' > '%s'", imagePath, rawImage))
- if out, err := cmd.CombinedOutput(); err != nil {
- fatalf("xz stream decompress: %v: %s", err, strings.TrimSpace(string(out)))
- }
- }
-
- destPath := *device
- if strings.HasPrefix(destPath, "hosttmp://") {
- if err := os.MkdirAll(*hostTmpDir, 0o755); err != nil {
- fatalf("mkdir host tmp dir: %v", err)
- }
- destPath = filepath.Join(*hostTmpDir, fmt.Sprintf("%s.img", *node))
- }
- emitStageProgress("flash", 92, fmt.Sprintf("Writing the latest image for %s to %s", *node, destPath))
- writeEmitter := newProgressEmitter("flash", 92, 98, fmt.Sprintf("Writing the latest image for %s", *node), true)
- if err := writer.WriteImageWithProgress(context.Background(), rawImage, destPath, writeEmitter); err != nil {
- fatalf("write image: %v", err)
- }
- emitStageProgress("flash", 99, fmt.Sprintf("Flushing the finished image for %s", *node))
- _ = exec.Command("sync").Run()
- if strings.HasPrefix(destPath, "/dev/") {
- _ = exec.Command("blockdev", "--flushbufs", destPath).Run()
- }
-
- info, err := os.Stat(destPath)
- if err != nil {
- fatalf("stat destination: %v", err)
- }
- writeStructuredResult(map[string]any{
- "node": *node,
- "device": *device,
- "dest_path": destPath,
- "size_bytes": info.Size(),
- })
-}
-
func writeStructuredResult(payload any) {
data, err := json.Marshal(payload)
if err != nil {
diff --git a/cmd/metis/remote_flash.go b/cmd/metis/remote_flash.go
new file mode 100644
index 0000000..9839a71
--- /dev/null
+++ b/cmd/metis/remote_flash.go
@@ -0,0 +1,304 @@
+package main
+
+import (
+ "context"
+ "encoding/json"
+ "flag"
+ "fmt"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "strings"
+ "time"
+
+ "metis/pkg/service"
+ "metis/pkg/writer"
+)
+
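+// flashPartitionTable models the JSON document emitted by `sfdisk -J` so the
+// verifier can inspect the partition types of a written image file.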
+type flashPartitionTable struct {
+ PartitionTable struct {
+ Partitions []flashTablePartition `json:"partitions"`
+ } `json:"partitiontable"`
+}
+
+type flashTablePartition struct {
+ Type string `json:"type"`
+}
+
+type flashBlockDevicePayload struct {
+ Blockdevices []flashBlockDevice `json:"blockdevices"`
+}
+
+type flashBlockDevice struct {
+ Path string `json:"path"`
+ Type string `json:"type"`
+ FSType string `json:"fstype"`
+ Label string `json:"label"`
+ Children []flashBlockDevice `json:"children"`
+}
+
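+// requiredBootFiles lists the files the boot partition must contain for a
+// flash to be considered verified.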
+var requiredBootFiles = []string{"config.txt", "cmdline.txt", "boot.scr"}
+
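+// remoteFlashCmd pulls the latest Harbor artifact for the node, decompresses it
+// when needed, writes it to the requested device or host-tmp test sink, flushes
+// and verifies the result, and prints a structured JSON summary on stdout.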
+func remoteFlashCmd(args []string) {
+ fs := flag.NewFlagSet("remote-flash", flag.ExitOnError)
+ node := fs.String("node", "", "target node")
+ device := fs.String("device", "", "target device path or test sink")
+ artifactRef := fs.String("artifact-ref", "", "harbor artifact ref without tag")
+ workDir := fs.String("work-dir", filepath.Join(os.TempDir(), "metis-flash"), "working directory")
+ harborRegistry := fs.String("harbor-registry", getenvOr("METIS_HARBOR_REGISTRY", "registry.bstein.dev"), "harbor registry host")
+ harborUsername := fs.String("harbor-username", getenvOr("METIS_HARBOR_USERNAME", ""), "harbor username")
+ harborPassword := fs.String("harbor-password", getenvOr("METIS_HARBOR_PASSWORD", ""), "harbor password")
+ hostTmpDir := fs.String("host-tmp-dir", "/host-tmp", "mounted host tmp dir for test writes")
+ fs.Parse(args)
+ if *node == "" || *device == "" || *artifactRef == "" {
+ fatalf("--node, --device, and --artifact-ref are required")
+ }
+
+ if err := os.MkdirAll(*workDir, 0o755); err != nil {
+ fatalf("mkdir workdir: %v", err)
+ }
+ emitStageProgress("flash_pull", 84, fmt.Sprintf("Pulling the latest Harbor artifact for %s", *node))
+ if err := orasLogin(*harborRegistry, *harborUsername, *harborPassword); err != nil {
+ fatalf("oras login: %v", err)
+ }
+ if err := orasPull(fmt.Sprintf("%s:latest", *artifactRef), *workDir); err != nil {
+ fatalf("oras pull: %v", err)
+ }
+ emitStageProgress("flash_prepare", 88, fmt.Sprintf("Preparing the downloaded image for %s", *node))
+ imagePath, compressed, err := resolvePulledArtifact(*workDir)
+ if err != nil {
+ fatalf("resolve artifact: %v", err)
+ }
+ rawImage := imagePath
+ if compressed {
+ emitStageProgress("flash_unpack", 90, fmt.Sprintf("Decompressing the image for %s before writing", *node))
+ rawImage = filepath.Join(*workDir, fmt.Sprintf("%s.img", *node))
+ cmd := exec.Command("sh", "-lc", fmt.Sprintf("xz -dc '%s' > '%s'", imagePath, rawImage))
+ if out, err := cmd.CombinedOutput(); err != nil {
+ fatalf("xz stream decompress: %v: %s", err, strings.TrimSpace(string(out)))
+ }
+ }
+
+ destPath := *device
+ if strings.HasPrefix(destPath, "hosttmp://") {
+ if err := os.MkdirAll(*hostTmpDir, 0o755); err != nil {
+ fatalf("mkdir host tmp dir: %v", err)
+ }
+ destPath = filepath.Join(*hostTmpDir, fmt.Sprintf("%s.img", *node))
+ }
+ info, err := os.Stat(rawImage)
+ if err != nil {
+ fatalf("stat raw image: %v", err)
+ }
+ imageSize := info.Size()
+ emitStageProgress("flash_write", 92, fmt.Sprintf("Writing the latest image for %s to %s", *node, destPath))
+ writeEmitter := newProgressEmitter("flash_write", 92, 98, fmt.Sprintf("Writing the latest image for %s", *node), true)
+ if err := writer.WriteImageWithProgress(context.Background(), rawImage, destPath, writeEmitter); err != nil {
+ fatalf("write image: %v", err)
+ }
+ flushFlashTarget(destPath, *node)
+ result, err := verifyFlashDestination(destPath)
+ if err != nil {
+ fatalf("verify flash output: %v", err)
+ }
+ result.Node = *node
+ result.Device = *device
+ result.DestPath = destPath
+ result.SizeBytes = imageSize
+ writeStructuredResult(result)
+}
+
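+// flushFlashTarget syncs the written image and, for real block devices, flushes
+// kernel buffers, re-reads the partition table, and waits for udev to settle.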
+func flushFlashTarget(destPath, node string) {
+ emitStageProgress("flash_flush", 98.5, fmt.Sprintf("Flushing the finished image for %s", node))
+ _ = exec.Command("sync").Run()
+ if !strings.HasPrefix(destPath, "/dev/") {
+ return
+ }
+ _ = exec.Command("blockdev", "--flushbufs", destPath).Run()
+ _ = exec.Command("blockdev", "--rereadpt", destPath).Run()
+ _ = exec.Command("partprobe", destPath).Run()
+ _ = exec.Command("udevadm", "settle", "--timeout=10").Run()
+}
+
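+// verifyFlashDestination dispatches to block-device verification for /dev paths
+// and image-file verification for everything else.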
+func verifyFlashDestination(destPath string) (service.RemoteFlashResult, error) {
+ emitStageProgress("flash_verify", 99.2, fmt.Sprintf("Verifying the flashed recovery media at %s", destPath))
+ if strings.HasPrefix(destPath, "/dev/") {
+ return verifyBlockDeviceFlash(destPath)
+ }
+ return verifyImageFileFlash(destPath)
+}
+
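+// verifyImageFileFlash checks an image file's partition table for a boot
+// partition and a Linux root partition.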
+func verifyImageFileFlash(destPath string) (service.RemoteFlashResult, error) {
+ table, err := readFlashPartitionTable(destPath)
+ if err != nil {
+ return service.RemoteFlashResult{}, err
+ }
+ hasBoot := false
+ hasRoot := false
+ for _, part := range table.PartitionTable.Partitions {
+ if isBootPartitionType(part.Type) {
+ hasBoot = true
+ }
+ if isLinuxPartitionType(part.Type) {
+ hasRoot = true
+ }
+ }
+ if !hasBoot || !hasRoot {
+ return service.RemoteFlashResult{}, fmt.Errorf("image %s does not expose the expected boot and writable partitions", destPath)
+ }
+ return service.RemoteFlashResult{
+ Verified: true,
+ VerificationKind: "image-file",
+ VerificationSummary: fmt.Sprintf("Verified image layout at %s; boot and writable partitions are present.", destPath),
+ }, nil
+}
+
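+// verifyBlockDeviceFlash polls the flashed block device for up to 15 seconds
+// until the expected boot and writable partitions appear and the required boot
+// files are readable.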
+func verifyBlockDeviceFlash(destPath string) (service.RemoteFlashResult, error) {
+ deadline := time.Now().Add(15 * time.Second)
+ var lastErr error
+ for time.Now().Before(deadline) {
+ parts, err := readBlockDevicePartitions(destPath)
+ if err == nil {
+ boot, root, classifyErr := classifyFlashPartitions(parts)
+ if classifyErr == nil {
+ checkedFiles, verifyErr := verifyBootPartitionFiles(boot.Path)
+ if verifyErr == nil {
+ bootLabel := firstNonEmpty(boot.Label, filepath.Base(boot.Path))
+ rootLabel := firstNonEmpty(root.Label, filepath.Base(root.Path))
+ return service.RemoteFlashResult{
+ Verified: true,
+ VerificationKind: "block-device",
+ VerificationSummary: fmt.Sprintf("Verified %s; %s and %s are present and the boot files look correct.", destPath, bootLabel, rootLabel),
+ BootPartition: boot.Path,
+ RootPartition: root.Path,
+ BootLabel: boot.Label,
+ RootLabel: root.Label,
+ BootFSType: boot.FSType,
+ RootFSType: root.FSType,
+ CheckedFiles: checkedFiles,
+ }, nil
+ }
+ lastErr = verifyErr
+ } else {
+ lastErr = classifyErr
+ }
+ } else {
+ lastErr = err
+ }
+ _ = exec.Command("blockdev", "--rereadpt", destPath).Run()
+ _ = exec.Command("partprobe", destPath).Run()
+ _ = exec.Command("udevadm", "settle", "--timeout=10").Run()
+ time.Sleep(time.Second)
+ }
+ if lastErr == nil {
+ lastErr = fmt.Errorf("timed out waiting for the flashed partitions on %s", destPath)
+ }
+ return service.RemoteFlashResult{}, lastErr
+}
+
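+// readFlashPartitionTable runs `sfdisk -J` and decodes the partition table it reports.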
+func readFlashPartitionTable(destPath string) (flashPartitionTable, error) {
+ out, err := exec.Command("sfdisk", "-J", destPath).CombinedOutput()
+ if err != nil {
+ return flashPartitionTable{}, fmt.Errorf("sfdisk -J %s: %v: %s", destPath, err, strings.TrimSpace(string(out)))
+ }
+ var table flashPartitionTable
+ if err := json.Unmarshal(out, &table); err != nil {
+ return flashPartitionTable{}, fmt.Errorf("decode partition table for %s: %w", destPath, err)
+ }
+ return table, nil
+}
+
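+// readBlockDevicePartitions lists the partitions of destPath via `lsblk -J`,
+// falling back to the single reported device when the path does not match exactly.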
+func readBlockDevicePartitions(destPath string) ([]flashBlockDevice, error) {
+ out, err := exec.Command("lsblk", "-J", "-o", "PATH,TYPE,FSTYPE,LABEL", destPath).CombinedOutput()
+ if err != nil {
+ return nil, fmt.Errorf("lsblk %s: %v: %s", destPath, err, strings.TrimSpace(string(out)))
+ }
+ var payload flashBlockDevicePayload
+ if err := json.Unmarshal(out, &payload); err != nil {
+ return nil, fmt.Errorf("decode lsblk output for %s: %w", destPath, err)
+ }
+ for _, device := range payload.Blockdevices {
+ if device.Path == destPath {
+ return device.Children, nil
+ }
+ }
+ if len(payload.Blockdevices) == 1 {
+ return payload.Blockdevices[0].Children, nil
+ }
+ return nil, fmt.Errorf("lsblk did not report partitions for %s", destPath)
+}
+
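+// classifyFlashPartitions picks the boot (system-boot/FAT) and writable root
+// (ext4) partitions out of the lsblk output.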
+func classifyFlashPartitions(parts []flashBlockDevice) (flashBlockDevice, flashBlockDevice, error) {
+ var boot flashBlockDevice
+ var root flashBlockDevice
+ for _, part := range parts {
+ if part.Type != "part" {
+ continue
+ }
+ normalizedLabel := strings.ToLower(strings.TrimSpace(part.Label))
+ normalizedFS := strings.ToLower(strings.TrimSpace(part.FSType))
+ if boot.Path == "" && (normalizedLabel == "system-boot" || normalizedFS == "vfat" || normalizedFS == "fat" || normalizedFS == "fat32") {
+ boot = part
+ continue
+ }
+ if root.Path == "" && (normalizedLabel == "writable" || normalizedFS == "ext4") {
+ root = part
+ }
+ }
+ if boot.Path == "" {
+ return flashBlockDevice{}, flashBlockDevice{}, fmt.Errorf("could not find a boot partition with a FAT filesystem")
+ }
+ if root.Path == "" {
+ return flashBlockDevice{}, flashBlockDevice{}, fmt.Errorf("could not find a writable ext4 partition")
+ }
+ return boot, root, nil
+}
+
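+// verifyBootPartitionFiles mounts the boot partition read-only in a temporary
+// directory and confirms every required boot file exists, returning the list of
+// files it checked.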
+func verifyBootPartitionFiles(partitionPath string) ([]string, error) {
+ mountDir, err := os.MkdirTemp("", "metis-boot-")
+ if err != nil {
+ return nil, err
+ }
+ defer os.RemoveAll(mountDir)
+ cmd := exec.Command("mount", "-o", "ro", partitionPath, mountDir)
+ if out, err := cmd.CombinedOutput(); err != nil {
+ return nil, fmt.Errorf("mount %s read-only: %v: %s", partitionPath, err, strings.TrimSpace(string(out)))
+ }
+ defer exec.Command("umount", mountDir).Run()
+
+ checked := make([]string, 0, len(requiredBootFiles))
+ for _, name := range requiredBootFiles {
+ if _, err := os.Stat(filepath.Join(mountDir, name)); err != nil {
+ return nil, fmt.Errorf("boot partition %s is missing %s", partitionPath, name)
+ }
+ checked = append(checked, name)
+ }
+ return checked, nil
+}
+
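+// isBootPartitionType reports whether a partition type id or GUID identifies a
+// FAT or EFI boot partition.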
+func isBootPartitionType(partType string) bool {
+ normalized := strings.ToLower(strings.TrimSpace(partType))
+ switch normalized {
+ case "b", "c", "0b", "0c", "ef", "ef00":
+ return true
+ }
+ return normalized == "c12a7328-f81f-11d2-ba4b-00a0c93ec93b"
+}
+
+func isLinuxPartitionType(partType string) bool {
+ normalized := strings.ToLower(strings.TrimSpace(partType))
+ switch normalized {
+ case "83", "8300":
+ return true
+ }
+ return normalized == "0fc63daf-8483-4772-8e79-3d69d8477de4"
+}
+
+func firstNonEmpty(values ...string) string {
+ for _, value := range values {
+ if trimmed := strings.TrimSpace(value); trimmed != "" {
+ return trimmed
+ }
+ }
+ return ""
+}
diff --git a/cmd/metis/remote_flash_test.go b/cmd/metis/remote_flash_test.go
new file mode 100644
index 0000000..3514470
--- /dev/null
+++ b/cmd/metis/remote_flash_test.go
@@ -0,0 +1,246 @@
+package main
+
+import (
+ "os"
+ "path/filepath"
+ "strings"
+ "testing"
+)
+
+func TestVerifyFlashDestinationForImageFilesAndBlockDevices(t *testing.T) {
+ t.Run("image file success and helpers", func(t *testing.T) {
+ tools := fakeCommandDir(t, map[string]string{
+ "sfdisk": `cat <<'JSON'
+{"partitiontable":{"partitions":[{"type":"ef00"},{"type":"8300"}]}}
+JSON`,
+ })
+ t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
+
+ result, err := verifyFlashDestination(filepath.Join(t.TempDir(), "titan-15.img"))
+ if err != nil {
+ t.Fatalf("verifyFlashDestination(image): %v", err)
+ }
+ if !result.Verified || result.VerificationKind != "image-file" {
+ t.Fatalf("unexpected image verification result: %#v", result)
+ }
+ if !isBootPartitionType("ef00") || !isLinuxPartitionType("8300") {
+ t.Fatal("expected helper partition type recognizers to accept GPT aliases")
+ }
+ if got := firstNonEmpty("", " writable ", "ignored"); got != "writable" {
+ t.Fatalf("firstNonEmpty = %q", got)
+ }
+ })
+
+ t.Run("image file missing writable partition", func(t *testing.T) {
+ tools := fakeCommandDir(t, map[string]string{
+ "sfdisk": `cat <<'JSON'
+{"partitiontable":{"partitions":[{"type":"ef00"}]}}
+JSON`,
+ })
+ t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
+
+ if _, err := verifyImageFileFlash(filepath.Join(t.TempDir(), "broken.img")); err == nil || !strings.Contains(err.Error(), "boot and writable partitions") {
+ t.Fatalf("expected missing partition error, got %v", err)
+ }
+ })
+
+ t.Run("image file missing boot partition", func(t *testing.T) {
+ tools := fakeCommandDir(t, map[string]string{
+ "sfdisk": `cat <<'JSON'
+{"partitiontable":{"partitions":[{"type":"8300"}]}}
+JSON`,
+ })
+ t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
+
+ if _, err := verifyImageFileFlash(filepath.Join(t.TempDir(), "broken-boot.img")); err == nil || !strings.Contains(err.Error(), "boot and writable partitions") {
+ t.Fatalf("expected missing boot partition error, got %v", err)
+ }
+ })
+
+ t.Run("block device success", func(t *testing.T) {
+ tools := fakeCommandDir(t, map[string]string{
+ "lsblk": `cat <<'JSON'
+{"blockdevices":[{"path":"/dev/sdk","children":[{"path":"/dev/sdk1","type":"part","fstype":"vfat","label":"system-boot"},{"path":"/dev/sdk2","type":"part","fstype":"ext4","label":"writable"}]}]}
+JSON`,
+ "mount": `mount_dir="${4:-}"
+mkdir -p "${mount_dir}"
+printf 'ok' > "${mount_dir}/config.txt"
+printf 'console=ttyAMA0' > "${mount_dir}/cmdline.txt"
+printf 'boot script' > "${mount_dir}/boot.scr"`,
+ "umount": `exit 0`,
+ "blockdev": `exit 0`,
+ "partprobe": `exit 0`,
+ "udevadm": `exit 0`,
+ })
+ t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
+
+ result, err := verifyFlashDestination("/dev/sdk")
+ if err != nil {
+ t.Fatalf("verifyFlashDestination(block): %v", err)
+ }
+ if !result.Verified || result.VerificationKind != "block-device" {
+ t.Fatalf("unexpected block-device verification result: %#v", result)
+ }
+ if result.BootPartition != "/dev/sdk1" || result.RootPartition != "/dev/sdk2" {
+ t.Fatalf("unexpected partition details: %#v", result)
+ }
+ if len(result.CheckedFiles) != 3 {
+ t.Fatalf("expected checked boot files, got %#v", result.CheckedFiles)
+ }
+ })
+
+ t.Run("block device missing boot file", func(t *testing.T) {
+ tools := fakeCommandDir(t, map[string]string{
+ "lsblk": `cat <<'JSON'
+{"blockdevices":[{"path":"/dev/sdk","children":[{"path":"/dev/sdk1","type":"part","fstype":"vfat","label":"system-boot"},{"path":"/dev/sdk2","type":"part","fstype":"ext4","label":"writable"}]}]}
+JSON`,
+ "mount": `mount_dir="${4:-}"
+mkdir -p "${mount_dir}"
+printf 'ok' > "${mount_dir}/config.txt"
+printf 'console=ttyAMA0' > "${mount_dir}/cmdline.txt"`,
+ "umount": `exit 0`,
+ "blockdev": `exit 0`,
+ "partprobe": `exit 0`,
+ "udevadm": `exit 0`,
+ })
+ t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
+
+ if _, err := verifyBlockDeviceFlash("/dev/sdk"); err == nil || !strings.Contains(err.Error(), "missing boot.scr") {
+ t.Fatalf("expected missing boot file error, got %v", err)
+ }
+ })
+}
+
+func TestFlushFlashTargetAndClassifyHelpers(t *testing.T) {
+ logPath := filepath.Join(t.TempDir(), "commands.log")
+ tools := fakeCommandDir(t, map[string]string{
+ "sync": `printf 'sync %s\n' "$*" >> "` + logPath + `"`,
+ "blockdev": `printf 'blockdev %s\n' "$*" >> "` + logPath + `"`,
+ "partprobe": `printf 'partprobe %s\n' "$*" >> "` + logPath + `"`,
+ "udevadm": `printf 'udevadm %s\n' "$*" >> "` + logPath + `"`,
+ })
+ t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
+
+ flushFlashTarget("/tmp/test.img", "titan-15")
+ flushFlashTarget("/dev/sdk", "titan-15")
+
+ logged, err := os.ReadFile(logPath)
+ if err != nil {
+ t.Fatalf("read flush log: %v", err)
+ }
+ text := string(logged)
+ if strings.Count(text, "sync ") != 2 {
+ t.Fatalf("expected sync for file and block targets, got %q", text)
+ }
+ for _, want := range []string{"blockdev --flushbufs /dev/sdk", "blockdev --rereadpt /dev/sdk", "partprobe /dev/sdk", "udevadm settle --timeout=10"} {
+ if !strings.Contains(text, want) {
+ t.Fatalf("expected %q in %q", want, text)
+ }
+ }
+
+ if _, _, err := classifyFlashPartitions([]flashBlockDevice{{Path: "/dev/sdk1", Type: "part", FSType: "vfat", Label: "system-boot"}}); err == nil || !strings.Contains(err.Error(), "writable ext4") {
+ t.Fatalf("expected classify failure, got %v", err)
+ }
+}
+
+func TestReadFlashPartitionTableAndBlockPartitionErrors(t *testing.T) {
+ t.Run("partition table decode error", func(t *testing.T) {
+ tools := fakeCommandDir(t, map[string]string{"sfdisk": `printf '{'`})
+ t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
+ if _, err := readFlashPartitionTable(filepath.Join(t.TempDir(), "broken.img")); err == nil || !strings.Contains(err.Error(), "decode partition table") {
+ t.Fatalf("expected partition table decode error, got %v", err)
+ }
+ })
+
+ t.Run("lsblk decode error", func(t *testing.T) {
+ tools := fakeCommandDir(t, map[string]string{"lsblk": `printf '{'`})
+ t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
+ if _, err := readBlockDevicePartitions("/dev/sdk"); err == nil || !strings.Contains(err.Error(), "decode lsblk output") {
+ t.Fatalf("expected lsblk decode error, got %v", err)
+ }
+ })
+
+ t.Run("lsblk finds explicit device match", func(t *testing.T) {
+ tools := fakeCommandDir(t, map[string]string{
+ "lsblk": `cat <<'JSON'
+{"blockdevices":[{"path":"/dev/other","children":[{"path":"/dev/other1","type":"part"}]},{"path":"/dev/sdk","children":[{"path":"/dev/sdk1","type":"part","fstype":"vfat","label":"system-boot"}]}]}
+JSON`,
+ })
+ t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
+ parts, err := readBlockDevicePartitions("/dev/sdk")
+ if err != nil {
+ t.Fatalf("readBlockDevicePartitions match: %v", err)
+ }
+ if len(parts) != 1 || parts[0].Path != "/dev/sdk1" {
+ t.Fatalf("unexpected matching partitions: %#v", parts)
+ }
+ })
+
+ t.Run("lsblk missing device match", func(t *testing.T) {
+ tools := fakeCommandDir(t, map[string]string{
+ "lsblk": `cat <<'JSON'
+{"blockdevices":[{"path":"/dev/other","children":[]},{"path":"/dev/another","children":[]}]}
+JSON`,
+ })
+ t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
+ if _, err := readBlockDevicePartitions("/dev/sdk"); err == nil || !strings.Contains(err.Error(), "did not report partitions") {
+ t.Fatalf("expected missing block-device match error, got %v", err)
+ }
+ })
+}
+
+func TestVerifyBootPartitionFilesMountFailure(t *testing.T) {
+ tools := fakeCommandDir(t, map[string]string{
+ "mount": `printf 'mount failed' >&2; exit 9`,
+ })
+ t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
+ if _, err := verifyBootPartitionFiles("/dev/sdk1"); err == nil || !strings.Contains(err.Error(), "mount /dev/sdk1 read-only") {
+ t.Fatalf("expected mount failure, got %v", err)
+ }
+}
+
+func TestRemoteFlashHelperAdditionalFailureBranches(t *testing.T) {
+ t.Run("partition table command failure", func(t *testing.T) {
+ tools := fakeCommandDir(t, map[string]string{"sfdisk": `printf 'bad disk' >&2; exit 7`})
+ t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
+ if _, err := readFlashPartitionTable(filepath.Join(t.TempDir(), "broken.img")); err == nil || !strings.Contains(err.Error(), "sfdisk -J") {
+ t.Fatalf("expected sfdisk failure, got %v", err)
+ }
+ })
+
+ t.Run("lsblk command failure", func(t *testing.T) {
+ tools := fakeCommandDir(t, map[string]string{"lsblk": `printf 'lsblk failed' >&2; exit 4`})
+ t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
+ if _, err := readBlockDevicePartitions("/dev/sdk"); err == nil || !strings.Contains(err.Error(), "lsblk /dev/sdk") {
+ t.Fatalf("expected lsblk command failure, got %v", err)
+ }
+ })
+
+ t.Run("boot partition classification failure", func(t *testing.T) {
+ if _, _, err := classifyFlashPartitions([]flashBlockDevice{{Path: "/dev/sdk2", Type: "part", FSType: "ext4", Label: "writable"}}); err == nil || !strings.Contains(err.Error(), "boot partition") {
+ t.Fatalf("expected missing boot partition error, got %v", err)
+ }
+ })
+
+ t.Run("firstNonEmpty returns empty when every value is blank", func(t *testing.T) {
+ if got := firstNonEmpty("", " "); got != "" {
+ t.Fatalf("expected empty firstNonEmpty result, got %q", got)
+ }
+ })
+}
+
+func TestReadBlockDevicePartitionsFallsBackToSingleReturnedDevice(t *testing.T) {
+ tools := fakeCommandDir(t, map[string]string{
+ "lsblk": `cat <<'JSON'
+{"blockdevices":[{"path":"/dev/mmcblk0","children":[{"path":"/dev/mmcblk0p1","type":"part","fstype":"vfat","label":"system-boot"}]}]}
+JSON`,
+ })
+ t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
+ parts, err := readBlockDevicePartitions("/dev/sdk")
+ if err != nil {
+ t.Fatalf("readBlockDevicePartitions fallback: %v", err)
+ }
+ if len(parts) != 1 || parts[0].Path != "/dev/mmcblk0p1" {
+ t.Fatalf("unexpected fallback partitions: %#v", parts)
+ }
+}
diff --git a/pkg/service/app.go b/pkg/service/app.go
index a7bed80..c5c6652 100644
--- a/pkg/service/app.go
+++ b/pkg/service/app.go
@@ -221,6 +221,31 @@ func (a *App) Build(node string) (*Job, error) {
return job, nil
}
+// Flash starts a background flash-only workflow for the latest published node image.
+func (a *App) Flash(node, host, device string) (*Job, error) {
+ if host == "" {
+ host = a.settings.DefaultFlashHost
+ }
+ if err := a.ensureReplacementReady(node); err != nil {
+ return nil, err
+ }
+ if active := a.activeJobForNode(node); active != nil {
+ return nil, &activeNodeJobError{Node: node, Kind: active.Kind, JobID: active.ID}
+ }
+ if summary, ok := a.artifacts()[node]; !ok || strings.TrimSpace(summary.Ref) == "" {
+ return nil, fmt.Errorf("no published image recorded for %s yet; run a build first", node)
+ }
+ if _, err := a.ensureDevice(host, device); err != nil {
+ return nil, err
+ }
+ job, err := a.reserveJob("flash", node, host, device)
+ if err != nil {
+ return nil, err
+ }
+ go a.runFlash(job)
+ return job, nil
+}
+
// Replace starts a background build+flash workflow for a node.
func (a *App) Replace(node, host, device string) (*Job, error) {
if host == "" {
diff --git a/pkg/service/app_helpers.go b/pkg/service/app_helpers.go
index be6f8c3..38ed3cb 100644
--- a/pkg/service/app_helpers.go
+++ b/pkg/service/app_helpers.go
@@ -68,7 +68,7 @@ func (a *App) activeJobForNodeLocked(node string) *Job {
continue
}
switch job.Kind {
- case "build", "replace":
+ case "build", "flash", "replace":
default:
continue
}
diff --git a/pkg/service/coverage_more_test.go b/pkg/service/coverage_more_test.go
index 92a8120..c0fc9e3 100644
--- a/pkg/service/coverage_more_test.go
+++ b/pkg/service/coverage_more_test.go
@@ -407,7 +407,7 @@ func TestServiceClusterAndRemotePodBranches(t *testing.T) {
"metadata": map[string]any{"name": filepath.Base(r.URL.Path)},
"status": map[string]any{
"phase": "Succeeded",
- "message": `{"dest_path":"/tmp/out.img"}`,
+ "message": `{"dest_path":"/tmp/out.img","verified":true,"verification_kind":"image-file","verification_summary":"Verified image layout at /tmp/out.img; boot and writable partitions are present."}`,
"reason": "Completed",
},
})
diff --git a/pkg/service/flash_workflow_test.go b/pkg/service/flash_workflow_test.go
new file mode 100644
index 0000000..bd19a6e
--- /dev/null
+++ b/pkg/service/flash_workflow_test.go
@@ -0,0 +1,182 @@
+package service
+
+import (
+ "encoding/json"
+ "net/http"
+ "net/http/httptest"
+ "strings"
+ "testing"
+ "time"
+)
+
+func TestFlashOnlyWorkflowRequiresPublishedArtifact(t *testing.T) {
+ kube := remoteWorkflowKubeServer(t, remoteKubeOptions{})
+ installKubeFactory(t, kube)
+ app := remoteTestApp(t, nil)
+
+ if _, err := app.Flash("titan-15", "titan-22", "/dev/sdz"); err == nil || !strings.Contains(err.Error(), "run a build first") {
+ t.Fatalf("expected missing artifact error, got %v", err)
+ }
+}
+
+func TestFlashOnlyWorkflowCompletesAndRecordsVerification(t *testing.T) {
+ kube := remoteWorkflowKubeServer(t, remoteKubeOptions{flashMessage: `{"dest_path":"/dev/sdz","verified":true,"verification_kind":"block-device","verification_summary":"Verified /dev/sdz; system-boot and writable are present and the boot files look correct.","boot_partition":"/dev/sdz1","root_partition":"/dev/sdz2","boot_label":"system-boot","root_label":"writable","boot_fstype":"vfat","root_fstype":"ext4","checked_files":["config.txt","cmdline.txt","boot.scr"]}`})
+ installKubeFactory(t, kube)
+ app := remoteTestApp(t, nil)
+ app.artifactStore["titan-15"] = ArtifactSummary{Node: "titan-15", Ref: "registry.example/metis/titan-15:latest"}
+
+ job, err := app.Flash("titan-15", "titan-22", "/dev/sdz")
+ if err != nil {
+ t.Fatalf("Flash: %v", err)
+ }
+ waitForJobState(t, app, job.ID, JobDone)
+
+ got := app.job(job.ID)
+ if got == nil || got.Kind != "flash" || got.Status != JobDone {
+ t.Fatalf("unexpected flash job: %#v", got)
+ }
+ if !strings.Contains(got.Message, "Move the card into titan-15") {
+ t.Fatalf("unexpected completion message: %#v", got)
+ }
+ if got.ProgressPct != 100 {
+ t.Fatalf("expected complete flash progress, got %#v", got)
+ }
+
+ events := app.recentEvents(5)
+ if len(events) == 0 || events[0].Kind != "image.flash" {
+ t.Fatalf("expected flash event, got %#v", events)
+ }
+ if verified, ok := events[0].Details["verified"].(bool); !ok || !verified {
+ t.Fatalf("expected verified flash details, got %#v", events[0].Details)
+ }
+ if kind, ok := events[0].Details["verification_kind"].(string); !ok || kind != "block-device" {
+ t.Fatalf("expected block-device verification, got %#v", events[0].Details)
+ }
+ if bootPartition, ok := events[0].Details["boot_partition"].(string); !ok || bootPartition != "/dev/sdz1" {
+ t.Fatalf("expected boot partition details, got %#v", events[0].Details)
+ }
+}
+
+func TestHandleFlashRouteStartsFlashOnlyJob(t *testing.T) {
+ kube := remoteWorkflowKubeServer(t, remoteKubeOptions{})
+ installKubeFactory(t, kube)
+ app := remoteTestApp(t, nil)
+ app.artifactStore["titan-15"] = ArtifactSummary{Node: "titan-15", Ref: "registry.example/metis/titan-15:latest"}
+ handler := app.Handler()
+
+ req := httptest.NewRequest(http.MethodPost, "/api/jobs/flash", strings.NewReader(`{"node":"titan-15","host":"titan-22","device":"/dev/sdz"}`))
+ req.Header.Set("Content-Type", "application/json")
+ req.Header.Set("X-Auth-Request-User", "brad")
+ req.Header.Set("X-Auth-Request-Groups", "admin")
+ resp := httptest.NewRecorder()
+ handler.ServeHTTP(resp, req)
+ if resp.Code != http.StatusAccepted {
+ t.Fatalf("flash route response: %d %s", resp.Code, resp.Body.String())
+ }
+
+ var job Job
+ if err := json.Unmarshal(resp.Body.Bytes(), &job); err != nil {
+ t.Fatalf("decode flash job: %v", err)
+ }
+ waitForJobState(t, app, job.ID, JobDone)
+ if got := app.job(job.ID); got == nil || got.Kind != "flash" {
+ t.Fatalf("expected flash job, got %#v", got)
+ }
+}
+
+func TestFlashAndReplaceDefaultHostBranches(t *testing.T) {
+ kube := remoteWorkflowKubeServer(t, remoteKubeOptions{})
+ harbor := fakeHarborServer(t, true)
+ installKubeFactory(t, kube)
+ app := remoteTestApp(t, harbor)
+ app.artifactStore["titan-15"] = ArtifactSummary{Node: "titan-15", Ref: "registry.example/metis/titan-15:latest"}
+
+ flashJob, err := app.Flash("titan-15", "", hostTmpDevicePath)
+ if err != nil {
+ t.Fatalf("Flash default host: %v", err)
+ }
+ waitForJobState(t, app, flashJob.ID, JobDone)
+ if got := app.job(flashJob.ID); got == nil || got.Host != "titan-22" || !strings.Contains(got.Message, "host /tmp") {
+ t.Fatalf("unexpected flash default host job: %#v", got)
+ }
+
+ replaceJob, err := app.Replace("titan-15", "", hostTmpDevicePath)
+ if err != nil {
+ t.Fatalf("Replace default host: %v", err)
+ }
+ waitForJobState(t, app, replaceJob.ID, JobDone)
+ if got := app.job(replaceJob.ID); got == nil || got.Host != "titan-22" {
+ t.Fatalf("unexpected replace default host job: %#v", got)
+ }
+}
+
+func TestFlashRejectsDuplicateActiveNodeJob(t *testing.T) {
+ kube := remoteWorkflowKubeServer(t, remoteKubeOptions{})
+ installKubeFactory(t, kube)
+ app := remoteTestApp(t, nil)
+ app.artifactStore["titan-15"] = ArtifactSummary{Node: "titan-15", Ref: "registry.example/metis/titan-15:latest"}
+ app.newJob("flash", "titan-15", "titan-22", "/dev/sdz")
+
+ if _, err := app.Flash("titan-15", "titan-22", "/dev/sdz"); err == nil || !strings.Contains(err.Error(), "already has an active") {
+ t.Fatalf("expected flash conflict, got %v", err)
+ }
+}
+
+func TestFlashArtifactUpdatesProgressWhenVerificationSummaryIsMissing(t *testing.T) {
+ kube := remoteWorkflowKubeServer(t, remoteKubeOptions{flashMessage: `{"dest_path":"/dev/sdz","size_bytes":4096,"verified":true}`})
+ installKubeFactory(t, kube)
+ app := remoteTestApp(t, nil)
+ job := app.newJob("flash", "titan-15", "titan-22", "/dev/sdz")
+
+ result, err := app.flashArtifact(job.ID, "registry.example/metis/titan-15")
+ if err != nil {
+ t.Fatalf("flashArtifact: %v", err)
+ }
+ if result.SizeBytes != 4096 {
+ t.Fatalf("unexpected flash result: %#v", result)
+ }
+ got := app.job(job.ID)
+ if got == nil || got.Written != 4096 || got.Total != 4096 || !strings.Contains(got.Message, "Verified the latest image write") {
+ t.Fatalf("unexpected flash artifact state: %#v", got)
+ }
+}
+
+func TestFlashOnlyWorkflowFailsVerification(t *testing.T) {
+ kube := remoteWorkflowKubeServer(t, remoteKubeOptions{flashMessage: `{"dest_path":"/dev/sdz","verified":false}`})
+ installKubeFactory(t, kube)
+ app := remoteTestApp(t, nil)
+ app.artifactStore["titan-15"] = ArtifactSummary{Node: "titan-15", Ref: "registry.example/metis/titan-15:latest"}
+
+ job, err := app.Flash("titan-15", "titan-22", "/dev/sdz")
+ if err != nil {
+ t.Fatalf("Flash: %v", err)
+ }
+ deadline := time.Now().Add(5 * time.Second)
+ for time.Now().Before(deadline) {
+ if got := app.job(job.ID); got != nil && got.Status == JobError {
+ if !strings.Contains(got.Error, "verification did not succeed") {
+ t.Fatalf("unexpected flash verification error: %#v", got)
+ }
+ return
+ }
+ time.Sleep(10 * time.Millisecond)
+ }
+ t.Fatalf("flash job %s never failed verification: %#v", job.ID, app.job(job.ID))
+}
+
+func TestBuildFlashAndReplaceRejectInvalidInputs(t *testing.T) {
+ kube := remoteWorkflowKubeServer(t, remoteKubeOptions{})
+ installKubeFactory(t, kube)
+ app := remoteTestApp(t, nil)
+ app.artifactStore["titan-15"] = ArtifactSummary{Node: "titan-15", Ref: "registry.example/metis/titan-15:latest"}
+
+ if _, err := app.Build("missing-node"); err == nil || !strings.Contains(err.Error(), "missing-node") {
+ t.Fatalf("expected build inventory error, got %v", err)
+ }
+ if _, err := app.Flash("titan-15", "titan-22", "/dev/sda"); err == nil || !strings.Contains(err.Error(), "not a current flash candidate") {
+ t.Fatalf("expected flash device validation error, got %v", err)
+ }
+ if _, err := app.Replace("titan-15", "titan-22", "/dev/sda"); err == nil || !strings.Contains(err.Error(), "not a current flash candidate") {
+ t.Fatalf("expected replace device validation error, got %v", err)
+ }
+}
diff --git a/pkg/service/remote.go b/pkg/service/remote.go
index b7399d9..a1b29a0 100644
--- a/pkg/service/remote.go
+++ b/pkg/service/remote.go
@@ -74,7 +74,7 @@ func (a *App) RefreshDevices(host string) ([]Device, error) {
}
func (a *App) runBuild(job *Job, flash bool) {
- nodeSpec, class, err := a.inventory.FindNode(job.Node)
+ _, class, err := a.inventory.FindNode(job.Node)
if err != nil {
a.failJob(job.ID, err)
a.metrics.RecordBuild(job.Node, "error")
@@ -158,18 +158,42 @@ func (a *App) runBuild(job *Job, flash bool) {
return
}
- a.setJob(job.ID, func(j *Job) {
- j.Stage = "preflight"
- j.StageStartedAt = time.Now().UTC()
- j.Message = fmt.Sprintf("Validating %s and preparing the latest Harbor artifact for %s", prettyDeviceTarget(j.Device), j.Host)
- j.ProgressPct = 78
- j.Artifact = artifactRef + ":latest"
- })
- if _, err := a.ensureDevice(job.Host, job.Device); err != nil {
+ result, err := a.runFlashSequence(job, artifactRef)
+ if err != nil {
a.failJob(job.ID, err)
a.metrics.RecordFlash(job.Node, job.Host, "error")
return
}
+ a.metrics.RecordFlash(job.Node, job.Host, "ok")
+ a.appendFlashEvent(job, artifactRef, result)
+ a.completeFlashJob(job.ID, artifactRef, result)
+}
+
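+// runFlash drives the flash-only job: it reuses the shared flash sequence and
+// records metrics, the flash event, and job completion.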
+func (a *App) runFlash(job *Job) {
+ artifactRef := a.artifactRepo(job.Node)
+ result, err := a.runFlashSequence(job, artifactRef)
+ if err != nil {
+ a.failJob(job.ID, err)
+ a.metrics.RecordFlash(job.Node, job.Host, "error")
+ return
+ }
+ a.metrics.RecordFlash(job.Node, job.Host, "ok")
+ a.appendFlashEvent(job, artifactRef, result)
+ a.completeFlashJob(job.ID, artifactRef, result)
+}
+
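+// runFlashSequence moves the job into preflight, validates the target device,
+// clears the existing node object when flashing a real device, and runs the
+// remote flash pod, returning its verified result.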
+func (a *App) runFlashSequence(job *Job, artifactRef string) (RemoteFlashResult, error) {
+ a.setJob(job.ID, func(j *Job) {
+ j.Status = JobRunning
+ j.Stage = "preflight"
+ j.StageStartedAt = time.Now().UTC()
+ j.Message = fmt.Sprintf("Validating %s and preparing the latest Harbor artifact for %s", prettyDeviceTarget(j.Device), j.Host)
+ j.ProgressPct = 80
+ j.Artifact = artifactRef + ":latest"
+ })
+ if _, err := a.ensureDevice(job.Host, job.Device); err != nil {
+ return RemoteFlashResult{}, err
+ }
if !strings.HasPrefix(job.Device, "hosttmp://") {
if err := deleteNodeObject(job.Node); err != nil {
a.appendEvent(Event{
@@ -180,66 +204,111 @@ func (a *App) runBuild(job *Job, flash bool) {
})
}
}
- if err := a.flashArtifact(job.ID, artifactRef); err != nil {
- a.failJob(job.ID, err)
- a.metrics.RecordFlash(job.Node, job.Host, "error")
- return
+ return a.flashArtifact(job.ID, artifactRef)
+}
+
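+// appendFlashEvent records an image.flash event carrying the verification
+// details reported by the remote worker.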
+func (a *App) appendFlashEvent(job *Job, artifactRef string, result RemoteFlashResult) {
+ summary := fmt.Sprintf("Flashed and verified %s latest image on %s", job.Node, job.Host)
+ if strings.HasPrefix(job.Device, "hosttmp://") {
+ summary = fmt.Sprintf("Verified %s latest image on %s host scratch", job.Node, job.Host)
+ }
+ details := map[string]any{
+ "node": job.Node,
+ "device": job.Device,
+ "host": job.Host,
+ "artifact": artifactRef + ":latest",
+ "dest_path": result.DestPath,
+ "verified": result.Verified,
+ "verification_kind": result.VerificationKind,
+ }
+ if strings.TrimSpace(result.BootPartition) != "" {
+ details["boot_partition"] = result.BootPartition
+ }
+ if strings.TrimSpace(result.RootPartition) != "" {
+ details["root_partition"] = result.RootPartition
+ }
+ if len(result.CheckedFiles) > 0 {
+ details["checked_files"] = append([]string{}, result.CheckedFiles...)
}
- a.metrics.RecordFlash(job.Node, job.Host, "ok")
a.appendEvent(Event{
Time: time.Now().UTC(),
Kind: "image.flash",
- Summary: fmt.Sprintf("Flashed %s latest image on %s", job.Node, job.Host),
- Details: map[string]any{"node": job.Node, "device": job.Device, "host": job.Host, "artifact": artifactRef + ":latest"},
+ Summary: summary,
+ Details: details,
})
- a.completeJob(job.ID, func(j *Job) {
- j.Stage = "complete"
- if strings.HasPrefix(j.Device, "hosttmp://") {
- j.Message = fmt.Sprintf("Test flash complete on %s host /tmp.", j.Host)
- } else {
- j.Message = fmt.Sprintf("Flash complete on %s. Move the card into %s and power-cycle it.", j.Host, j.Node)
- }
- j.ProgressPct = 100
- j.Artifact = artifactRef + ":latest"
- })
-
- _ = nodeSpec
}
-func (a *App) flashArtifact(jobID, artifactRef string) error {
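+// completeFlashJob marks the job complete and composes the final user-facing
+// message from the worker's verification summary.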
+func (a *App) completeFlashJob(jobID, artifactRef string, result RemoteFlashResult) {
+ a.completeJob(jobID, func(j *Job) {
+ j.Stage = "complete"
+ j.ProgressPct = 100
+ j.Artifact = artifactRef + ":latest"
+ if strings.HasPrefix(j.Device, "hosttmp://") {
+ if strings.TrimSpace(result.VerificationSummary) != "" {
+ j.Message = fmt.Sprintf("%s Test flash complete on %s host /tmp.", strings.TrimRight(result.VerificationSummary, "."), j.Host)
+ return
+ }
+ j.Message = fmt.Sprintf("Verified test flash on %s host /tmp.", j.Host)
+ return
+ }
+ if strings.TrimSpace(result.VerificationSummary) != "" {
+ j.Message = fmt.Sprintf("%s Move the card into %s and power-cycle it.", strings.TrimRight(result.VerificationSummary, "."), j.Node)
+ return
+ }
+ j.Message = fmt.Sprintf("Flash verified on %s. Move the card into %s and power-cycle it.", j.Host, j.Node)
+ })
+}
+
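+// flashArtifact resolves the flash host's runner image, launches the remote
+// flash pod, decodes its JSON result, and fails when verification did not succeed.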
+func (a *App) flashArtifact(jobID, artifactRef string) (RemoteFlashResult, error) {
+ job := a.job(jobID)
+ if job == nil {
+ return RemoteFlashResult{}, fmt.Errorf("job %s no longer exists", jobID)
+ }
nodes := clusterNodes()
nodeMap := map[string]clusterNode{}
for _, node := range nodes {
nodeMap[node.Name] = node
}
- target, ok := nodeMap[a.job(jobID).Host]
+ target, ok := nodeMap[job.Host]
if !ok {
- return fmt.Errorf("flash host %s is not a current cluster node", a.job(jobID).Host)
+ return RemoteFlashResult{}, fmt.Errorf("flash host %s is not a current cluster node", job.Host)
}
image := a.podImageForArch(target.Arch)
if image == "" {
- return fmt.Errorf("no runner image configured for arch %s", target.Arch)
+ return RemoteFlashResult{}, fmt.Errorf("no runner image configured for arch %s", target.Arch)
}
a.setJob(jobID, func(j *Job) {
- j.Stage = "flash"
+ j.Stage = "flash_pull"
j.StageStartedAt = time.Now().UTC()
- j.Message = fmt.Sprintf("Pulling %s and writing it on %s", artifactRef+":latest", j.Host)
+ j.Message = fmt.Sprintf("Pulling %s and preparing it for %s", artifactRef+":latest", prettyDeviceTarget(j.Device))
j.ProgressPct = 84
})
podName := fmt.Sprintf("metis-flash-%d", time.Now().UTC().UnixNano())
- logs, err := a.runRemotePod(jobID, podName, a.remoteFlashPodSpec(podName, target.Name, image, a.job(jobID).Node, a.job(jobID).Device, artifactRef))
+ logs, err := a.runRemotePod(jobID, podName, a.remoteFlashPodSpec(podName, target.Name, image, job.Node, job.Device, artifactRef))
if err != nil {
- return err
+ return RemoteFlashResult{}, err
}
- var payload map[string]any
- if err := json.Unmarshal([]byte(strings.TrimSpace(logs)), &payload); err == nil {
- a.setJob(jobID, func(j *Job) {
- if dest, ok := payload["dest_path"].(string); ok && dest != "" {
- j.Message = fmt.Sprintf("Wrote latest artifact to %s", dest)
- }
- })
+ var result RemoteFlashResult
+ if err := json.Unmarshal([]byte(strings.TrimSpace(logs)), &result); err != nil {
+ return RemoteFlashResult{}, fmt.Errorf("decode remote flash output: %w: %s", err, strings.TrimSpace(logs))
}
- return nil
+ if !result.Verified {
+ return RemoteFlashResult{}, fmt.Errorf("flash verification did not succeed for %s on %s", prettyDeviceTarget(job.Device), job.Host)
+ }
+ a.setJob(jobID, func(j *Job) {
+ if result.SizeBytes > 0 {
+ j.Written = result.SizeBytes
+ j.Total = result.SizeBytes
+ }
+ if strings.TrimSpace(result.VerificationSummary) != "" {
+ j.Message = result.VerificationSummary
+ return
+ }
+ if strings.TrimSpace(result.DestPath) != "" {
+ j.Message = fmt.Sprintf("Verified the latest image write at %s", result.DestPath)
+ }
+ })
+ return result, nil
}
func (a *App) heartbeatRemoteJob(jobID string) {
@@ -255,8 +324,8 @@ func (a *App) heartbeatRemoteJob(jobID string) {
stageStart = j.StartedAt
}
elapsed := time.Since(stageStart)
- switch j.Stage {
- case "build":
+ switch {
+ case j.Stage == "build":
progress, message := buildStageHeartbeat(j.Node, j.Builder, elapsed)
if progress > j.ProgressPct {
j.ProgressPct = progress
@@ -264,14 +333,14 @@ func (a *App) heartbeatRemoteJob(jobID string) {
if strings.TrimSpace(message) != "" {
j.Message = message
}
- case "preflight":
+ case j.Stage == "preflight":
if j.ProgressPct < 80 {
j.ProgressPct = 80
}
j.Message = fmt.Sprintf("Validating %s on %s and resolving the latest Harbor artifact", prettyDeviceTarget(j.Device), j.Host)
- case "flash":
- if j.Total > 0 && j.Written > 0 {
- actual := 88 + (float64(j.Written)/float64(j.Total))*10
+ case isFlashStage(j.Stage):
+ if j.Stage == "flash_write" && j.Total > 0 && j.Written > 0 {
+ actual := 92 + (float64(j.Written)/float64(j.Total))*6
if actual > 98 {
actual = 98
}
@@ -281,7 +350,7 @@ func (a *App) heartbeatRemoteJob(jobID string) {
j.Message = fmt.Sprintf("Writing %s of %s on %s", humanBytes(j.Written), humanBytes(j.Total), j.Host)
return
}
- progress, message := flashStageHeartbeat(j.Host, j.Artifact, elapsed)
+ progress, message := flashStagePhaseHeartbeat(j.Stage, j.Host, j.Artifact, elapsed)
if progress > j.ProgressPct {
j.ProgressPct = progress
}
diff --git a/pkg/service/remote_error_test.go b/pkg/service/remote_error_test.go
index f3af837..bef73a9 100644
--- a/pkg/service/remote_error_test.go
+++ b/pkg/service/remote_error_test.go
@@ -29,7 +29,7 @@ func TestRemoteWorkflowErrorBranches(t *testing.T) {
}
job = app.newJob("flash", "titan-15", "titan-22", "/dev/sdz")
- if err := app.flashArtifact(job.ID, "registry.example/metis/titan-15"); err == nil {
+ if _, err := app.flashArtifact(job.ID, "registry.example/metis/titan-15"); err == nil {
t.Fatal("expected flashArtifact error")
}
@@ -260,7 +260,7 @@ func TestFlashArtifactAndHeartbeatBranches(t *testing.T) {
app := remoteTestApp(t, nil)
job := app.newJob("replace", "titan-15", "missing-host", "/dev/sdz")
- if err := app.flashArtifact(job.ID, "registry.example/metis/titan-15"); err == nil || !strings.Contains(err.Error(), "not a current cluster node") {
+ if _, err := app.flashArtifact(job.ID, "registry.example/metis/titan-15"); err == nil || !strings.Contains(err.Error(), "not a current cluster node") {
t.Fatalf("expected missing host flashArtifact error, got %v", err)
}
@@ -283,7 +283,7 @@ func TestFlashArtifactAndHeartbeatBranches(t *testing.T) {
}
app.setJob(job.ID, func(j *Job) {
- j.Stage = "flash"
+ j.Stage = "flash_write"
j.ProgressPct = 80
j.Written = 120
j.Total = 100
@@ -341,7 +341,7 @@ func remoteWorkflowKubeServer(t *testing.T, opts remoteKubeOptions) *httptest.Se
buildPhase := defaultString(opts.buildPhase, "Succeeded")
buildMessage := defaultString(opts.buildMessage, `{"local_path":"/workspace/build/titan-15.img.xz","compressed":true,"size_bytes":1234,"build_tag":"build-1"}`)
flashPhase := defaultString(opts.flashPhase, "Succeeded")
- flashMessage := defaultString(opts.flashMessage, `{"dest_path":"/var/tmp/metis-flash-test/titan-15.img"}`)
+ flashMessage := defaultString(opts.flashMessage, `{"dest_path":"/var/tmp/metis-flash-test/titan-15.img","verified":true,"verification_kind":"image-file","verification_summary":"Verified image layout at /var/tmp/metis-flash-test/titan-15.img; boot and writable partitions are present."}`)
nodes := opts.nodes
if nodes == nil {
nodes = []map[string]any{
diff --git a/pkg/service/remote_helpers.go b/pkg/service/remote_helpers.go
index cea8d5a..f520f52 100644
--- a/pkg/service/remote_helpers.go
+++ b/pkg/service/remote_helpers.go
@@ -28,17 +28,36 @@ func buildStageHeartbeat(node, builder string, elapsed time.Duration) (float64,
}
func flashStageHeartbeat(host, artifact string, elapsed time.Duration) (float64, string) {
+ return flashStagePhaseHeartbeat("flash", host, artifact, elapsed)
+}
+
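+// flashStagePhaseHeartbeat maps each fine-grained flash stage to a progress
+// ramp and status message, keeping the original single-stage ramp as the fallback.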
+func flashStagePhaseHeartbeat(stage, host, artifact string, elapsed time.Duration) (float64, string) {
seconds := elapsed.Seconds()
- switch {
- case seconds < 10:
- return ramp(seconds, 0, 10, 84, 88), fmt.Sprintf("Pulling %s from Harbor on %s", artifact, host)
- case seconds < 45:
- return ramp(seconds, 10, 45, 88, 96), fmt.Sprintf("Writing the latest image to the selected target on %s", host)
+ switch strings.TrimSpace(stage) {
+ case "flash_pull":
+ return math.Min(88, ramp(seconds, 0, 12, 84, 88)), fmt.Sprintf("Pulling %s from Harbor on %s", artifact, host)
+ case "flash_prepare", "flash_unpack":
+ return math.Min(92, ramp(seconds, 0, 30, 88, 92)), fmt.Sprintf("Preparing the latest %s artifact for writing on %s", artifact, host)
+ case "flash_flush":
+ return math.Min(99, ramp(seconds, 0, 30, 98, 99)), fmt.Sprintf("Flushing the finished image write on %s", host)
+ case "flash_verify":
+ return math.Min(99.8, ramp(seconds, 0, 30, 99, 99.8)), fmt.Sprintf("Verifying the flashed recovery media on %s", host)
default:
- return math.Min(98, ramp(seconds, 45, 120, 96, 98)), fmt.Sprintf("Flushing buffers and finishing the write on %s", host)
+ switch {
+ case seconds < 10:
+ return ramp(seconds, 0, 10, 84, 88), fmt.Sprintf("Pulling %s from Harbor on %s", artifact, host)
+ case seconds < 45:
+ return ramp(seconds, 10, 45, 88, 96), fmt.Sprintf("Writing the latest image to the selected target on %s", host)
+ default:
+ return math.Min(98, ramp(seconds, 45, 120, 96, 98)), fmt.Sprintf("Flushing buffers and finishing the write on %s", host)
+ }
}
}
+func isFlashStage(stage string) bool {
+ return strings.HasPrefix(strings.TrimSpace(stage), "flash")
+}
+
func prettyDeviceTarget(path string) string {
switch {
case strings.HasPrefix(path, "hosttmp://"):
diff --git a/pkg/service/remote_helpers_test.go b/pkg/service/remote_helpers_test.go
index 20149d4..f237d68 100644
--- a/pkg/service/remote_helpers_test.go
+++ b/pkg/service/remote_helpers_test.go
@@ -384,3 +384,31 @@ func TestSelectBuilderHostTieBreaksByName(t *testing.T) {
t.Fatalf("expected alphabetical tie-breaker, got %s", node.Name)
}
}
+
+func TestFlashPhaseHeartbeatAndManagedPathHelpers(t *testing.T) {
+ cases := []struct {
+ stage string
+ minimum float64
+ phrase string
+ }{
+ {stage: "flash_pull", minimum: 84, phrase: "Pulling"},
+ {stage: "flash_prepare", minimum: 88, phrase: "Preparing"},
+ {stage: "flash_unpack", minimum: 90, phrase: "Preparing"},
+ {stage: "flash_flush", minimum: 98, phrase: "Flushing"},
+ {stage: "flash_verify", minimum: 99, phrase: "Verifying"},
+ {stage: "flash", minimum: 90, phrase: "Writing"},
+ }
+ for _, tc := range cases {
+ got, msg := flashStagePhaseHeartbeat(tc.stage, "titan-22", "registry.example/metis/titan-15:latest", 20*time.Second)
+ if got < tc.minimum || !strings.Contains(msg, tc.phrase) {
+ t.Fatalf("flashStagePhaseHeartbeat(%s) = %v %q", tc.stage, got, msg)
+ }
+ }
+
+ if !managedPathsContain("/var/log/pods_/var/tmp", "/var/tmp") {
+ t.Fatal("expected managedPathsContain to match child paths")
+ }
+ if managedPathsContain("/var/log/pods", "/home/atlas") {
+ t.Fatal("managedPathsContain should reject unrelated paths")
+ }
+}
diff --git a/pkg/service/remote_status.go b/pkg/service/remote_status.go
index 91aa742..f7eae73 100644
--- a/pkg/service/remote_status.go
+++ b/pkg/service/remote_status.go
@@ -19,6 +19,24 @@ type RemoteProgressUpdate struct {
TotalBytes int64 `json:"total_bytes,omitempty"`
}
+// RemoteFlashResult captures the verified outcome of a remote flash worker run.
+type RemoteFlashResult struct {
+ Node string `json:"node,omitempty"`
+ Device string `json:"device,omitempty"`
+ DestPath string `json:"dest_path,omitempty"`
+ SizeBytes int64 `json:"size_bytes,omitempty"`
+ Verified bool `json:"verified,omitempty"`
+ VerificationKind string `json:"verification_kind,omitempty"`
+ VerificationSummary string `json:"verification_summary,omitempty"`
+ BootPartition string `json:"boot_partition,omitempty"`
+ RootPartition string `json:"root_partition,omitempty"`
+ BootLabel string `json:"boot_label,omitempty"`
+ RootLabel string `json:"root_label,omitempty"`
+ BootFSType string `json:"boot_fstype,omitempty"`
+ RootFSType string `json:"root_fstype,omitempty"`
+ CheckedFiles []string `json:"checked_files,omitempty"`
+}
+
// ProgressLogLine formats a progress update for remote worker stdout.
func ProgressLogLine(update RemoteProgressUpdate) string {
data, err := json.Marshal(update)
diff --git a/pkg/service/server.go b/pkg/service/server.go
index 6f603f6..7a29cfa 100644
--- a/pkg/service/server.go
+++ b/pkg/service/server.go
@@ -30,6 +30,7 @@ func (a *App) Handler() http.Handler {
mux.HandleFunc("/api/state", a.withUIAuth(a.handleState))
mux.HandleFunc("/api/devices", a.withUIAuth(a.handleDevices))
mux.HandleFunc("/api/jobs/build", a.withUIAuth(a.handleBuild))
+ mux.HandleFunc("/api/jobs/flash", a.withUIAuth(a.handleFlash))
mux.HandleFunc("/api/jobs/replace", a.withUIAuth(a.handleReplace))
mux.HandleFunc("/api/sentinel/watch", a.withUIAuth(a.handleWatch))
mux.HandleFunc("/", a.withUIAuth(a.handleIndex))
@@ -106,6 +107,23 @@ func (a *App) handleBuild(w http.ResponseWriter, r *http.Request) {
writeJSON(w, http.StatusAccepted, job)
}
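+// handleFlash accepts a POST with node, host, and device and starts a flash-only job.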
+func (a *App) handleFlash(w http.ResponseWriter, r *http.Request) {
+ if r.Method != http.MethodPost {
+ http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+ values := requestValues(r)
+ node := values["node"]
+ host := values["host"]
+ device := values["device"]
+ job, err := a.Flash(node, host, device)
+ if err != nil {
+ http.Error(w, err.Error(), statusForJobError(err))
+ return
+ }
+ writeJSON(w, http.StatusAccepted, job)
+}
+
func (a *App) handleReplace(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodPost {
http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
diff --git a/pkg/service/templates/metis.html b/pkg/service/templates/metis.html
index 9e9049c..3734d68 100644
--- a/pkg/service/templates/metis.html
+++ b/pkg/service/templates/metis.html
@@ -174,7 +174,7 @@
}
.actions{
display:grid;
- grid-template-columns:repeat(3,minmax(0,1fr));
+ grid-template-columns:repeat(2,minmax(0,1fr));
gap:.7rem;
margin-top:.9rem;
}
@@ -277,7 +277,7 @@
Replacement Run
- This UI is meant for the one-shot recovery path: build the node image, verify the card on the flash host, then write it and hand off only the physical swap.
+ Use the mode that matches the recovery step you are in: build a fresh image, flash the latest published image, or run the full build-and-flash path in one shot.