metis(ui): split build and flash workflows

This commit is contained in:
codex 2026-04-24 12:09:53 -03:00
parent bf9c514fd0
commit 56d31fa613
17 changed files with 1021 additions and 144 deletions

View File

@ -163,76 +163,6 @@ func remoteBuildCmd(args []string) {
writeStructuredResult(summary)
}
// remoteFlashCmd implements the "remote-flash" subcommand: it pulls the
// latest published image artifact for the node from Harbor, decompresses it
// if needed, writes it to the requested destination, and emits coarse
// progress along the way.
//
// Required flags: --node, --device, and --artifact-ref. A device of the form
// "hosttmp://..." redirects the write to a file under --host-tmp-dir (used by
// tests instead of a real block device). fatalf presumably logs and exits —
// helper defined elsewhere in this package.
func remoteFlashCmd(args []string) {
	fs := flag.NewFlagSet("remote-flash", flag.ExitOnError)
	node := fs.String("node", "", "target node")
	device := fs.String("device", "", "target device path or test sink")
	artifactRef := fs.String("artifact-ref", "", "harbor artifact ref without tag")
	workDir := fs.String("work-dir", filepath.Join(os.TempDir(), "metis-flash"), "working directory")
	harborRegistry := fs.String("harbor-registry", getenvOr("METIS_HARBOR_REGISTRY", "registry.bstein.dev"), "harbor registry host")
	harborUsername := fs.String("harbor-username", getenvOr("METIS_HARBOR_USERNAME", ""), "harbor username")
	harborPassword := fs.String("harbor-password", getenvOr("METIS_HARBOR_PASSWORD", ""), "harbor password")
	hostTmpDir := fs.String("host-tmp-dir", "/host-tmp", "mounted host tmp dir for test writes")
	fs.Parse(args)
	if *node == "" || *device == "" || *artifactRef == "" {
		fatalf("--node, --device, and --artifact-ref are required")
	}
	if err := os.MkdirAll(*workDir, 0o755); err != nil {
		fatalf("mkdir workdir: %v", err)
	}
	// Progress percentages (84..99) slot into a larger workflow; the earlier
	// build stages presumably occupy 0..84 — confirm against the caller.
	emitStageProgress("flash", 84, fmt.Sprintf("Pulling the latest Harbor artifact for %s", *node))
	if err := orasLogin(*harborRegistry, *harborUsername, *harborPassword); err != nil {
		fatalf("oras login: %v", err)
	}
	if err := orasPull(fmt.Sprintf("%s:latest", *artifactRef), *workDir); err != nil {
		fatalf("oras pull: %v", err)
	}
	emitStageProgress("flash", 88, fmt.Sprintf("Preparing the downloaded image for %s", *node))
	imagePath, compressed, err := resolvePulledArtifact(*workDir)
	if err != nil {
		fatalf("resolve artifact: %v", err)
	}
	rawImage := imagePath
	if compressed {
		// Stream-decompress the pulled .xz artifact into <workDir>/<node>.img.
		// NOTE(review): paths are interpolated into a shell command line with
		// single quotes; a quote in either path would break the command.
		emitStageProgress("flash", 90, fmt.Sprintf("Decompressing the image for %s before writing", *node))
		rawImage = filepath.Join(*workDir, fmt.Sprintf("%s.img", *node))
		cmd := exec.Command("sh", "-lc", fmt.Sprintf("xz -dc '%s' > '%s'", imagePath, rawImage))
		if out, err := cmd.CombinedOutput(); err != nil {
			fatalf("xz stream decompress: %v: %s", err, strings.TrimSpace(string(out)))
		}
	}
	destPath := *device
	if strings.HasPrefix(destPath, "hosttmp://") {
		// Test sink: write to a file on the mounted host tmp dir instead of
		// a real device.
		if err := os.MkdirAll(*hostTmpDir, 0o755); err != nil {
			fatalf("mkdir host tmp dir: %v", err)
		}
		destPath = filepath.Join(*hostTmpDir, fmt.Sprintf("%s.img", *node))
	}
	emitStageProgress("flash", 92, fmt.Sprintf("Writing the latest image for %s to %s", *node, destPath))
	writeEmitter := newProgressEmitter("flash", 92, 98, fmt.Sprintf("Writing the latest image for %s", *node), true)
	if err := writer.WriteImageWithProgress(context.Background(), rawImage, destPath, writeEmitter); err != nil {
		fatalf("write image: %v", err)
	}
	// Best-effort flush; errors are deliberately ignored.
	emitStageProgress("flash", 99, fmt.Sprintf("Flushing the finished image for %s", *node))
	_ = exec.Command("sync").Run()
	if strings.HasPrefix(destPath, "/dev/") {
		_ = exec.Command("blockdev", "--flushbufs", destPath).Run()
	}
	info, err := os.Stat(destPath)
	if err != nil {
		fatalf("stat destination: %v", err)
	}
	// Report the outcome as structured JSON for the orchestrating service.
	writeStructuredResult(map[string]any{
		"node":       *node,
		"device":     *device,
		"dest_path":  destPath,
		"size_bytes": info.Size(),
	})
}
func writeStructuredResult(payload any) {
data, err := json.Marshal(payload)
if err != nil {

304
cmd/metis/remote_flash.go Normal file
View File

@ -0,0 +1,304 @@
package main
import (
"context"
"encoding/json"
"flag"
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
"metis/pkg/service"
"metis/pkg/writer"
)
// flashPartitionTable mirrors the JSON emitted by `sfdisk -J`, keeping only
// the partition list needed for verification.
type flashPartitionTable struct {
	PartitionTable struct {
		Partitions []flashTablePartition `json:"partitions"`
	} `json:"partitiontable"`
}

// flashTablePartition is a single sfdisk partition entry; only the type id
// (MBR hex id or GPT GUID) is inspected.
type flashTablePartition struct {
	Type string `json:"type"`
}

// flashBlockDevicePayload mirrors the top-level JSON object from `lsblk -J`.
type flashBlockDevicePayload struct {
	Blockdevices []flashBlockDevice `json:"blockdevices"`
}

// flashBlockDevice is one lsblk node (disk or partition) with only the fields
// the verifier reads; Children holds the partitions of a disk entry.
type flashBlockDevice struct {
	Path     string             `json:"path"`
	Type     string             `json:"type"`
	FSType   string             `json:"fstype"`
	Label    string             `json:"label"`
	Children []flashBlockDevice `json:"children"`
}

// requiredBootFiles lists the files that must exist on the boot partition for
// a flashed card to pass verification.
var requiredBootFiles = []string{"config.txt", "cmdline.txt", "boot.scr"}
// remoteFlashCmd implements the "remote-flash" subcommand: it pulls the
// latest published image artifact for the node from Harbor, decompresses it
// if needed, writes it to the requested destination, flushes it, verifies the
// result, and reports a structured JSON result.
//
// Required flags: --node, --device, and --artifact-ref. A device of the form
// "hosttmp://..." redirects the write to a file under --host-tmp-dir (used by
// tests instead of a real block device).
func remoteFlashCmd(args []string) {
	fs := flag.NewFlagSet("remote-flash", flag.ExitOnError)
	node := fs.String("node", "", "target node")
	device := fs.String("device", "", "target device path or test sink")
	artifactRef := fs.String("artifact-ref", "", "harbor artifact ref without tag")
	workDir := fs.String("work-dir", filepath.Join(os.TempDir(), "metis-flash"), "working directory")
	harborRegistry := fs.String("harbor-registry", getenvOr("METIS_HARBOR_REGISTRY", "registry.bstein.dev"), "harbor registry host")
	harborUsername := fs.String("harbor-username", getenvOr("METIS_HARBOR_USERNAME", ""), "harbor username")
	harborPassword := fs.String("harbor-password", getenvOr("METIS_HARBOR_PASSWORD", ""), "harbor password")
	hostTmpDir := fs.String("host-tmp-dir", "/host-tmp", "mounted host tmp dir for test writes")
	fs.Parse(args)
	if *node == "" || *device == "" || *artifactRef == "" {
		fatalf("--node, --device, and --artifact-ref are required")
	}
	if err := os.MkdirAll(*workDir, 0o755); err != nil {
		fatalf("mkdir workdir: %v", err)
	}
	emitStageProgress("flash_pull", 84, fmt.Sprintf("Pulling the latest Harbor artifact for %s", *node))
	if err := orasLogin(*harborRegistry, *harborUsername, *harborPassword); err != nil {
		fatalf("oras login: %v", err)
	}
	if err := orasPull(fmt.Sprintf("%s:latest", *artifactRef), *workDir); err != nil {
		fatalf("oras pull: %v", err)
	}
	emitStageProgress("flash_prepare", 88, fmt.Sprintf("Preparing the downloaded image for %s", *node))
	imagePath, compressed, err := resolvePulledArtifact(*workDir)
	if err != nil {
		fatalf("resolve artifact: %v", err)
	}
	rawImage := imagePath
	if compressed {
		emitStageProgress("flash_unpack", 90, fmt.Sprintf("Decompressing the image for %s before writing", *node))
		rawImage = filepath.Join(*workDir, fmt.Sprintf("%s.img", *node))
		// Invoke xz directly instead of via `sh -lc` with sprintf-interpolated
		// paths: the shell form broke (and was injectable) whenever a path
		// contained a single quote or other shell metacharacters.
		if err := decompressXZ(imagePath, rawImage); err != nil {
			fatalf("xz stream decompress: %v", err)
		}
	}
	destPath := *device
	if strings.HasPrefix(destPath, "hosttmp://") {
		// Test sink: write to a file on the mounted host tmp dir instead of a
		// real device.
		if err := os.MkdirAll(*hostTmpDir, 0o755); err != nil {
			fatalf("mkdir host tmp dir: %v", err)
		}
		destPath = filepath.Join(*hostTmpDir, fmt.Sprintf("%s.img", *node))
	}
	info, err := os.Stat(rawImage)
	if err != nil {
		fatalf("stat raw image: %v", err)
	}
	imageSize := info.Size()
	emitStageProgress("flash_write", 92, fmt.Sprintf("Writing the latest image for %s to %s", *node, destPath))
	writeEmitter := newProgressEmitter("flash_write", 92, 98, fmt.Sprintf("Writing the latest image for %s", *node), true)
	if err := writer.WriteImageWithProgress(context.Background(), rawImage, destPath, writeEmitter); err != nil {
		fatalf("write image: %v", err)
	}
	flushFlashTarget(destPath, *node)
	result, err := verifyFlashDestination(destPath)
	if err != nil {
		fatalf("verify flash output: %v", err)
	}
	// Report the original device spec (possibly hosttmp://) alongside the
	// resolved destination path and decompressed image size.
	result.Node = *node
	result.Device = *device
	result.DestPath = destPath
	result.SizeBytes = imageSize
	writeStructuredResult(result)
}

// decompressXZ streams src through `xz -dc` into a freshly created dst file.
// It avoids a shell entirely so path quoting can never corrupt the command.
func decompressXZ(src, dst string) error {
	out, err := os.Create(dst)
	if err != nil {
		return fmt.Errorf("create %s: %w", dst, err)
	}
	cmd := exec.Command("xz", "-dc", src)
	cmd.Stdout = out
	var stderr strings.Builder
	cmd.Stderr = &stderr
	if err := cmd.Run(); err != nil {
		out.Close()
		return fmt.Errorf("xz -dc %s: %v: %s", src, err, strings.TrimSpace(stderr.String()))
	}
	// The close error matters: the decompressed image must be fully flushed
	// before it is written to the destination.
	return out.Close()
}
// flushFlashTarget forces buffered data for destPath out to stable storage.
// For block devices it additionally asks the kernel and udev to pick up the
// freshly written partition table. All commands are best-effort and their
// errors are deliberately ignored.
func flushFlashTarget(destPath, node string) {
	emitStageProgress("flash_flush", 98.5, fmt.Sprintf("Flushing the finished image for %s", node))
	_ = exec.Command("sync").Run()
	if !strings.HasPrefix(destPath, "/dev/") {
		// Plain files only need the sync above.
		return
	}
	rereadCommands := [][]string{
		{"blockdev", "--flushbufs", destPath},
		{"blockdev", "--rereadpt", destPath},
		{"partprobe", destPath},
		{"udevadm", "settle", "--timeout=10"},
	}
	for _, argv := range rereadCommands {
		_ = exec.Command(argv[0], argv[1:]...).Run()
	}
}
// verifyFlashDestination dispatches post-flash verification: real block
// devices (paths under /dev/) are checked live, anything else is treated as
// an image file and its partition table is inspected instead.
func verifyFlashDestination(destPath string) (service.RemoteFlashResult, error) {
	emitStageProgress("flash_verify", 99.2, fmt.Sprintf("Verifying the flashed recovery media at %s", destPath))
	if !strings.HasPrefix(destPath, "/dev/") {
		return verifyImageFileFlash(destPath)
	}
	return verifyBlockDeviceFlash(destPath)
}
// verifyImageFileFlash inspects the partition table of an image file (via
// sfdisk) and confirms it exposes at least one boot-typed and one
// Linux-typed partition.
func verifyImageFileFlash(destPath string) (service.RemoteFlashResult, error) {
	table, err := readFlashPartitionTable(destPath)
	if err != nil {
		return service.RemoteFlashResult{}, err
	}
	var bootFound, rootFound bool
	for _, entry := range table.PartitionTable.Partitions {
		bootFound = bootFound || isBootPartitionType(entry.Type)
		rootFound = rootFound || isLinuxPartitionType(entry.Type)
	}
	if !(bootFound && rootFound) {
		return service.RemoteFlashResult{}, fmt.Errorf("image %s does not expose the expected boot and writable partitions", destPath)
	}
	return service.RemoteFlashResult{
		Verified:            true,
		VerificationKind:    "image-file",
		VerificationSummary: fmt.Sprintf("Verified image layout at %s; boot and writable partitions are present.", destPath),
	}, nil
}
// verifyBlockDeviceFlash polls a freshly flashed block device until its
// partition table is visible and the boot partition contains the required
// files, or a 15-second deadline expires. After each failed attempt it asks
// the kernel/udev to re-read the partition table, because new partitions can
// take a moment to surface after a raw image write.
func verifyBlockDeviceFlash(destPath string) (service.RemoteFlashResult, error) {
	deadline := time.Now().Add(15 * time.Second)
	var lastErr error
	for time.Now().Before(deadline) {
		parts, err := readBlockDevicePartitions(destPath)
		if err == nil {
			boot, root, classifyErr := classifyFlashPartitions(parts)
			if classifyErr == nil {
				checkedFiles, verifyErr := verifyBootPartitionFiles(boot.Path)
				if verifyErr == nil {
					// Prefer filesystem labels in the summary, falling back
					// to the device basename when a label is missing.
					bootLabel := firstNonEmpty(boot.Label, filepath.Base(boot.Path))
					rootLabel := firstNonEmpty(root.Label, filepath.Base(root.Path))
					return service.RemoteFlashResult{
						Verified:            true,
						VerificationKind:    "block-device",
						VerificationSummary: fmt.Sprintf("Verified %s; %s and %s are present and the boot files look correct.", destPath, bootLabel, rootLabel),
						BootPartition:       boot.Path,
						RootPartition:       root.Path,
						BootLabel:           boot.Label,
						RootLabel:           root.Label,
						BootFSType:          boot.FSType,
						RootFSType:          root.FSType,
						CheckedFiles:        checkedFiles,
					}, nil
				}
				lastErr = verifyErr
			} else {
				lastErr = classifyErr
			}
		} else {
			lastErr = err
		}
		// Best-effort refresh of kernel/udev partition state before retrying.
		_ = exec.Command("blockdev", "--rereadpt", destPath).Run()
		_ = exec.Command("partprobe", destPath).Run()
		_ = exec.Command("udevadm", "settle", "--timeout=10").Run()
		time.Sleep(time.Second)
	}
	// Preserve the most recent concrete error; only report a timeout when no
	// attempt ever produced one.
	if lastErr == nil {
		lastErr = fmt.Errorf("timed out waiting for the flashed partitions on %s", destPath)
	}
	return service.RemoteFlashResult{}, lastErr
}
// readFlashPartitionTable runs `sfdisk -J` against destPath and decodes the
// JSON partition table it prints.
func readFlashPartitionTable(destPath string) (flashPartitionTable, error) {
	raw, err := exec.Command("sfdisk", "-J", destPath).CombinedOutput()
	if err != nil {
		return flashPartitionTable{}, fmt.Errorf("sfdisk -J %s: %v: %s", destPath, err, strings.TrimSpace(string(raw)))
	}
	var table flashPartitionTable
	if unmarshalErr := json.Unmarshal(raw, &table); unmarshalErr != nil {
		return flashPartitionTable{}, fmt.Errorf("decode partition table for %s: %w", destPath, unmarshalErr)
	}
	return table, nil
}
// readBlockDevicePartitions lists the partitions of destPath via `lsblk -J`
// and returns the children of the device entry whose path matches. When lsblk
// reports exactly one device under a different path, that single entry is
// used as a fallback.
func readBlockDevicePartitions(destPath string) ([]flashBlockDevice, error) {
	raw, err := exec.Command("lsblk", "-J", "-o", "PATH,TYPE,FSTYPE,LABEL", destPath).CombinedOutput()
	if err != nil {
		return nil, fmt.Errorf("lsblk %s: %v: %s", destPath, err, strings.TrimSpace(string(raw)))
	}
	var payload flashBlockDevicePayload
	if decodeErr := json.Unmarshal(raw, &payload); decodeErr != nil {
		return nil, fmt.Errorf("decode lsblk output for %s: %w", destPath, decodeErr)
	}
	for _, entry := range payload.Blockdevices {
		if entry.Path == destPath {
			return entry.Children, nil
		}
	}
	if len(payload.Blockdevices) == 1 {
		return payload.Blockdevices[0].Children, nil
	}
	return nil, fmt.Errorf("lsblk did not report partitions for %s", destPath)
}
// classifyFlashPartitions picks the boot (FAT or "system-boot" label) and
// root ("writable" label or ext4) partitions out of parts. The first match
// wins for each role, and a partition claimed as boot is never reconsidered
// for root. Both roles must be filled or an error is returned.
func classifyFlashPartitions(parts []flashBlockDevice) (flashBlockDevice, flashBlockDevice, error) {
	var boot, root flashBlockDevice
	for _, candidate := range parts {
		if candidate.Type != "part" {
			continue
		}
		label := strings.ToLower(strings.TrimSpace(candidate.Label))
		fsType := strings.ToLower(strings.TrimSpace(candidate.FSType))
		fatLike := fsType == "vfat" || fsType == "fat" || fsType == "fat32"
		if boot.Path == "" && (label == "system-boot" || fatLike) {
			boot = candidate
			continue
		}
		if root.Path == "" && (label == "writable" || fsType == "ext4") {
			root = candidate
		}
	}
	if boot.Path == "" {
		return flashBlockDevice{}, flashBlockDevice{}, fmt.Errorf("could not find a boot partition with a FAT filesystem")
	}
	if root.Path == "" {
		return flashBlockDevice{}, flashBlockDevice{}, fmt.Errorf("could not find a writable ext4 partition")
	}
	return boot, root, nil
}
// verifyBootPartitionFiles mounts partitionPath read-only at a temporary
// directory and confirms every file in requiredBootFiles exists there,
// returning the list of files it checked. The mount and temp dir are cleaned
// up via deferred calls.
func verifyBootPartitionFiles(partitionPath string) ([]string, error) {
	mountDir, err := os.MkdirTemp("", "metis-boot-")
	if err != nil {
		return nil, err
	}
	defer os.RemoveAll(mountDir)
	if out, mountErr := exec.Command("mount", "-o", "ro", partitionPath, mountDir).CombinedOutput(); mountErr != nil {
		return nil, fmt.Errorf("mount %s read-only: %v: %s", partitionPath, mountErr, strings.TrimSpace(string(out)))
	}
	// Best-effort unmount on the way out.
	defer exec.Command("umount", mountDir).Run()
	checked := make([]string, 0, len(requiredBootFiles))
	for _, required := range requiredBootFiles {
		if _, statErr := os.Stat(filepath.Join(mountDir, required)); statErr != nil {
			return nil, fmt.Errorf("boot partition %s is missing %s", partitionPath, required)
		}
		checked = append(checked, required)
	}
	return checked, nil
}
// isBootPartitionType reports whether partType names a boot/EFI partition.
// It accepts MBR hex ids (b, c, 0b, 0c, ef), the gptfdisk alias ef00, and the
// EFI System Partition GUID, matching case-insensitively after trimming.
func isBootPartitionType(partType string) bool {
	switch strings.ToLower(strings.TrimSpace(partType)) {
	case "b", "c", "0b", "0c", "ef", "ef00",
		"c12a7328-f81f-11d2-ba4b-00a0c93ec93b":
		return true
	default:
		return false
	}
}
// isLinuxPartitionType reports whether partType names a Linux data partition:
// the MBR id 83, the gptfdisk alias 8300, or the Linux filesystem GUID,
// matched case-insensitively after trimming.
func isLinuxPartitionType(partType string) bool {
	switch strings.ToLower(strings.TrimSpace(partType)) {
	case "83", "8300",
		"0fc63daf-8483-4772-8e79-3d69d8477de4":
		return true
	default:
		return false
	}
}
// firstNonEmpty returns the first value that is non-blank after trimming
// whitespace (returning the trimmed form), or "" when every value is blank.
func firstNonEmpty(values ...string) string {
	for _, candidate := range values {
		candidate = strings.TrimSpace(candidate)
		if candidate != "" {
			return candidate
		}
	}
	return ""
}

View File

@ -0,0 +1,246 @@
package main
import (
"os"
"path/filepath"
"strings"
"testing"
)
// TestVerifyFlashDestinationForImageFilesAndBlockDevices drives
// verifyFlashDestination against stubbed system tools. fakeCommandDir
// (helper defined elsewhere) presumably materializes each map entry as an
// executable shell stub; prepending its dir to PATH makes the production
// code call the stubs instead of real sfdisk/lsblk/mount — confirm against
// the helper's definition.
func TestVerifyFlashDestinationForImageFilesAndBlockDevices(t *testing.T) {
	t.Run("image file success and helpers", func(t *testing.T) {
		// sfdisk reports both a boot (ef00) and a Linux (8300) partition.
		tools := fakeCommandDir(t, map[string]string{
			"sfdisk": `cat <<'JSON'
{"partitiontable":{"partitions":[{"type":"ef00"},{"type":"8300"}]}}
JSON`,
		})
		t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
		// A non-/dev path routes verification down the image-file branch.
		result, err := verifyFlashDestination(filepath.Join(t.TempDir(), "titan-15.img"))
		if err != nil {
			t.Fatalf("verifyFlashDestination(image): %v", err)
		}
		if !result.Verified || result.VerificationKind != "image-file" {
			t.Fatalf("unexpected image verification result: %#v", result)
		}
		// Spot-check the small pure helpers alongside the main flow.
		if !isBootPartitionType("ef00") || !isLinuxPartitionType("8300") {
			t.Fatal("expected helper partition type recognizers to accept GPT aliases")
		}
		if got := firstNonEmpty("", " writable ", "ignored"); got != "writable" {
			t.Fatalf("firstNonEmpty = %q", got)
		}
	})
	t.Run("image file missing writable partition", func(t *testing.T) {
		// Only a boot partition is reported, so verification must fail.
		tools := fakeCommandDir(t, map[string]string{
			"sfdisk": `cat <<'JSON'
{"partitiontable":{"partitions":[{"type":"ef00"}]}}
JSON`,
		})
		t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
		if _, err := verifyImageFileFlash(filepath.Join(t.TempDir(), "broken.img")); err == nil || !strings.Contains(err.Error(), "boot and writable partitions") {
			t.Fatalf("expected missing partition error, got %v", err)
		}
	})
	t.Run("image file missing boot partition", func(t *testing.T) {
		// Only a Linux partition is reported; same error is expected.
		tools := fakeCommandDir(t, map[string]string{
			"sfdisk": `cat <<'JSON'
{"partitiontable":{"partitions":[{"type":"8300"}]}}
JSON`,
		})
		t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
		if _, err := verifyImageFileFlash(filepath.Join(t.TempDir(), "broken-boot.img")); err == nil || !strings.Contains(err.Error(), "boot and writable partitions") {
			t.Fatalf("expected missing boot partition error, got %v", err)
		}
	})
	t.Run("block device success", func(t *testing.T) {
		// lsblk reports boot and root partitions; the mount stub fakes a
		// boot filesystem containing every required boot file.
		tools := fakeCommandDir(t, map[string]string{
			"lsblk": `cat <<'JSON'
{"blockdevices":[{"path":"/dev/sdk","children":[{"path":"/dev/sdk1","type":"part","fstype":"vfat","label":"system-boot"},{"path":"/dev/sdk2","type":"part","fstype":"ext4","label":"writable"}]}]}
JSON`,
			"mount": `mount_dir="${4:-}"
mkdir -p "${mount_dir}"
printf 'ok' > "${mount_dir}/config.txt"
printf 'console=ttyAMA0' > "${mount_dir}/cmdline.txt"
printf 'boot script' > "${mount_dir}/boot.scr"`,
			"umount":    `exit 0`,
			"blockdev":  `exit 0`,
			"partprobe": `exit 0`,
			"udevadm":   `exit 0`,
		})
		t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
		result, err := verifyFlashDestination("/dev/sdk")
		if err != nil {
			t.Fatalf("verifyFlashDestination(block): %v", err)
		}
		if !result.Verified || result.VerificationKind != "block-device" {
			t.Fatalf("unexpected block-device verification result: %#v", result)
		}
		if result.BootPartition != "/dev/sdk1" || result.RootPartition != "/dev/sdk2" {
			t.Fatalf("unexpected partition details: %#v", result)
		}
		if len(result.CheckedFiles) != 3 {
			t.Fatalf("expected checked boot files, got %#v", result.CheckedFiles)
		}
	})
	t.Run("block device missing boot file", func(t *testing.T) {
		// Same stubs as the success case, except boot.scr is never created,
		// so verification must fail naming the missing file.
		tools := fakeCommandDir(t, map[string]string{
			"lsblk": `cat <<'JSON'
{"blockdevices":[{"path":"/dev/sdk","children":[{"path":"/dev/sdk1","type":"part","fstype":"vfat","label":"system-boot"},{"path":"/dev/sdk2","type":"part","fstype":"ext4","label":"writable"}]}]}
JSON`,
			"mount": `mount_dir="${4:-}"
mkdir -p "${mount_dir}"
printf 'ok' > "${mount_dir}/config.txt"
printf 'console=ttyAMA0' > "${mount_dir}/cmdline.txt"`,
			"umount":    `exit 0`,
			"blockdev":  `exit 0`,
			"partprobe": `exit 0`,
			"udevadm":   `exit 0`,
		})
		t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
		if _, err := verifyBlockDeviceFlash("/dev/sdk"); err == nil || !strings.Contains(err.Error(), "missing boot.scr") {
			t.Fatalf("expected missing boot file error, got %v", err)
		}
	})
}
// TestFlushFlashTargetAndClassifyHelpers records every external command that
// flushFlashTarget runs (via logging stubs on PATH) and checks that block
// devices get the extra partition re-read commands while plain files only
// get sync. It also exercises a classifyFlashPartitions failure branch.
func TestFlushFlashTargetAndClassifyHelpers(t *testing.T) {
	logPath := filepath.Join(t.TempDir(), "commands.log")
	// Each stub appends its name and arguments to the shared log file.
	tools := fakeCommandDir(t, map[string]string{
		"sync":      `printf 'sync %s\n' "$*" >> "` + logPath + `"`,
		"blockdev":  `printf 'blockdev %s\n' "$*" >> "` + logPath + `"`,
		"partprobe": `printf 'partprobe %s\n' "$*" >> "` + logPath + `"`,
		"udevadm":   `printf 'udevadm %s\n' "$*" >> "` + logPath + `"`,
	})
	t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
	flushFlashTarget("/tmp/test.img", "titan-15")
	flushFlashTarget("/dev/sdk", "titan-15")
	logged, err := os.ReadFile(logPath)
	if err != nil {
		t.Fatalf("read flush log: %v", err)
	}
	text := string(logged)
	// sync runs for both targets; the device-only commands run once each.
	if strings.Count(text, "sync ") != 2 {
		t.Fatalf("expected sync for file and block targets, got %q", text)
	}
	for _, want := range []string{"blockdev --flushbufs /dev/sdk", "blockdev --rereadpt /dev/sdk", "partprobe /dev/sdk", "udevadm settle --timeout=10"} {
		if !strings.Contains(text, want) {
			t.Fatalf("expected %q in %q", want, text)
		}
	}
	// A partition list with only a boot candidate must fail classification.
	if _, _, err := classifyFlashPartitions([]flashBlockDevice{{Path: "/dev/sdk1", Type: "part", FSType: "vfat", Label: "system-boot"}}); err == nil || !strings.Contains(err.Error(), "writable ext4") {
		t.Fatalf("expected classify failure, got %v", err)
	}
}
// TestReadFlashPartitionTableAndBlockPartitionErrors covers the JSON-decode
// and device-matching branches of the sfdisk/lsblk readers using stubbed
// commands on PATH.
func TestReadFlashPartitionTableAndBlockPartitionErrors(t *testing.T) {
	t.Run("partition table decode error", func(t *testing.T) {
		// sfdisk succeeds but emits truncated JSON.
		tools := fakeCommandDir(t, map[string]string{"sfdisk": `printf '{'`})
		t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
		if _, err := readFlashPartitionTable(filepath.Join(t.TempDir(), "broken.img")); err == nil || !strings.Contains(err.Error(), "decode partition table") {
			t.Fatalf("expected partition table decode error, got %v", err)
		}
	})
	t.Run("lsblk decode error", func(t *testing.T) {
		// lsblk succeeds but emits truncated JSON.
		tools := fakeCommandDir(t, map[string]string{"lsblk": `printf '{'`})
		t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
		if _, err := readBlockDevicePartitions("/dev/sdk"); err == nil || !strings.Contains(err.Error(), "decode lsblk output") {
			t.Fatalf("expected lsblk decode error, got %v", err)
		}
	})
	t.Run("lsblk finds explicit device match", func(t *testing.T) {
		// Two devices reported; the reader must pick /dev/sdk by path.
		tools := fakeCommandDir(t, map[string]string{
			"lsblk": `cat <<'JSON'
{"blockdevices":[{"path":"/dev/other","children":[{"path":"/dev/other1","type":"part"}]},{"path":"/dev/sdk","children":[{"path":"/dev/sdk1","type":"part","fstype":"vfat","label":"system-boot"}]}]}
JSON`,
		})
		t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
		parts, err := readBlockDevicePartitions("/dev/sdk")
		if err != nil {
			t.Fatalf("readBlockDevicePartitions match: %v", err)
		}
		if len(parts) != 1 || parts[0].Path != "/dev/sdk1" {
			t.Fatalf("unexpected matching partitions: %#v", parts)
		}
	})
	t.Run("lsblk missing device match", func(t *testing.T) {
		// Multiple devices, none matching — the single-device fallback does
		// not apply, so an error is expected.
		tools := fakeCommandDir(t, map[string]string{
			"lsblk": `cat <<'JSON'
{"blockdevices":[{"path":"/dev/other","children":[]},{"path":"/dev/another","children":[]}]}
JSON`,
		})
		t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
		if _, err := readBlockDevicePartitions("/dev/sdk"); err == nil || !strings.Contains(err.Error(), "did not report partitions") {
			t.Fatalf("expected missing block-device match error, got %v", err)
		}
	})
}
// TestVerifyBootPartitionFilesMountFailure stubs mount with a script that
// always fails and checks that the returned error names the partition and
// the read-only mount attempt.
func TestVerifyBootPartitionFilesMountFailure(t *testing.T) {
	stubDir := fakeCommandDir(t, map[string]string{
		"mount": `printf 'mount failed' >&2; exit 9`,
	})
	t.Setenv("PATH", stubDir+string(os.PathListSeparator)+os.Getenv("PATH"))
	if _, err := verifyBootPartitionFiles("/dev/sdk1"); err == nil || !strings.Contains(err.Error(), "mount /dev/sdk1 read-only") {
		t.Fatalf("expected mount failure, got %v", err)
	}
}
// TestRemoteFlashHelperAdditionalFailureBranches covers the remaining error
// paths of the flash helpers: command exit failures (as opposed to bad
// output), a missing boot partition during classification, and
// firstNonEmpty's all-blank fallback.
func TestRemoteFlashHelperAdditionalFailureBranches(t *testing.T) {
	t.Run("partition table command failure", func(t *testing.T) {
		tools := fakeCommandDir(t, map[string]string{"sfdisk": `printf 'bad disk' >&2; exit 7`})
		t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
		if _, err := readFlashPartitionTable(filepath.Join(t.TempDir(), "broken.img")); err == nil || !strings.Contains(err.Error(), "sfdisk -J") {
			t.Fatalf("expected sfdisk failure, got %v", err)
		}
	})
	t.Run("lsblk command failure", func(t *testing.T) {
		tools := fakeCommandDir(t, map[string]string{"lsblk": `printf 'lsblk failed' >&2; exit 4`})
		t.Setenv("PATH", tools+string(os.PathListSeparator)+os.Getenv("PATH"))
		if _, err := readBlockDevicePartitions("/dev/sdk"); err == nil || !strings.Contains(err.Error(), "lsblk /dev/sdk") {
			t.Fatalf("expected lsblk command failure, got %v", err)
		}
	})
	t.Run("boot partition classification failure", func(t *testing.T) {
		// Only a root candidate is present, so boot classification fails.
		if _, _, err := classifyFlashPartitions([]flashBlockDevice{{Path: "/dev/sdk2", Type: "part", FSType: "ext4", Label: "writable"}}); err == nil || !strings.Contains(err.Error(), "boot partition") {
			t.Fatalf("expected missing boot partition error, got %v", err)
		}
	})
	t.Run("firstNonEmpty returns empty when every value is blank", func(t *testing.T) {
		if got := firstNonEmpty("", " "); got != "" {
			t.Fatalf("expected empty firstNonEmpty result, got %q", got)
		}
	})
}
// TestReadBlockDevicePartitionsFallsBackToSingleReturnedDevice makes lsblk
// report exactly one device whose path differs from the requested one, and
// checks that the reader falls back to that single entry's children.
func TestReadBlockDevicePartitionsFallsBackToSingleReturnedDevice(t *testing.T) {
	stubDir := fakeCommandDir(t, map[string]string{
		"lsblk": `cat <<'JSON'
{"blockdevices":[{"path":"/dev/mmcblk0","children":[{"path":"/dev/mmcblk0p1","type":"part","fstype":"vfat","label":"system-boot"}]}]}
JSON`,
	})
	t.Setenv("PATH", stubDir+string(os.PathListSeparator)+os.Getenv("PATH"))
	children, err := readBlockDevicePartitions("/dev/sdk")
	if err != nil {
		t.Fatalf("readBlockDevicePartitions fallback: %v", err)
	}
	if len(children) != 1 || children[0].Path != "/dev/mmcblk0p1" {
		t.Fatalf("unexpected fallback partitions: %#v", children)
	}
}

View File

@ -221,6 +221,31 @@ func (a *App) Build(node string) (*Job, error) {
return job, nil
}
// Flash starts a background flash-only workflow that writes the most recently
// published image for node onto device via host. An empty host falls back to
// the configured default flash host. It fails fast when the node is not
// ready for replacement, already has an active job, has no published
// artifact, or the device is not a valid flash target.
func (a *App) Flash(node, host, device string) (*Job, error) {
	if host == "" {
		host = a.settings.DefaultFlashHost
	}
	if err := a.ensureReplacementReady(node); err != nil {
		return nil, err
	}
	running := a.activeJobForNode(node)
	if running != nil {
		return nil, &activeNodeJobError{Node: node, Kind: running.Kind, JobID: running.ID}
	}
	summary, ok := a.artifacts()[node]
	if !ok || strings.TrimSpace(summary.Ref) == "" {
		return nil, fmt.Errorf("no published image recorded for %s yet; run a build first", node)
	}
	if _, err := a.ensureDevice(host, device); err != nil {
		return nil, err
	}
	reserved, err := a.reserveJob("flash", node, host, device)
	if err != nil {
		return nil, err
	}
	// The reserved job runs asynchronously; callers poll its state by ID.
	go a.runFlash(reserved)
	return reserved, nil
}
// Replace starts a background build+flash workflow for a node.
func (a *App) Replace(node, host, device string) (*Job, error) {
if host == "" {

View File

@ -68,7 +68,7 @@ func (a *App) activeJobForNodeLocked(node string) *Job {
continue
}
switch job.Kind {
case "build", "replace":
case "build", "flash", "replace":
default:
continue
}

View File

@ -407,7 +407,7 @@ func TestServiceClusterAndRemotePodBranches(t *testing.T) {
"metadata": map[string]any{"name": filepath.Base(r.URL.Path)},
"status": map[string]any{
"phase": "Succeeded",
"message": `{"dest_path":"/tmp/out.img"}`,
"message": `{"dest_path":"/tmp/out.img","verified":true,"verification_kind":"image-file","verification_summary":"Verified image layout at /tmp/out.img; boot and writable partitions are present."}`,
"reason": "Completed",
},
})

View File

@ -0,0 +1,182 @@
package service
import (
"encoding/json"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"
)
// TestFlashOnlyWorkflowRequiresPublishedArtifact verifies that Flash refuses
// to start when no image artifact has been recorded for the node.
func TestFlashOnlyWorkflowRequiresPublishedArtifact(t *testing.T) {
	fakeKube := remoteWorkflowKubeServer(t, remoteKubeOptions{})
	installKubeFactory(t, fakeKube)
	svc := remoteTestApp(t, nil)
	_, err := svc.Flash("titan-15", "titan-22", "/dev/sdz")
	if err == nil || !strings.Contains(err.Error(), "run a build first") {
		t.Fatalf("expected missing artifact error, got %v", err)
	}
}
// TestFlashOnlyWorkflowCompletesAndRecordsVerification runs a flash-only job
// against a stubbed kube server whose pod reports a successful block-device
// verification payload, then checks the job kind/state, completion message,
// progress, and the details attached to the emitted image.flash event.
func TestFlashOnlyWorkflowCompletesAndRecordsVerification(t *testing.T) {
	kube := remoteWorkflowKubeServer(t, remoteKubeOptions{flashMessage: `{"dest_path":"/dev/sdz","verified":true,"verification_kind":"block-device","verification_summary":"Verified /dev/sdz; system-boot and writable are present and the boot files look correct.","boot_partition":"/dev/sdz1","root_partition":"/dev/sdz2","boot_label":"system-boot","root_label":"writable","boot_fstype":"vfat","root_fstype":"ext4","checked_files":["config.txt","cmdline.txt","boot.scr"]}`})
	installKubeFactory(t, kube)
	app := remoteTestApp(t, nil)
	// Seed a published artifact so the flash-only precondition passes.
	app.artifactStore["titan-15"] = ArtifactSummary{Node: "titan-15", Ref: "registry.example/metis/titan-15:latest"}
	job, err := app.Flash("titan-15", "titan-22", "/dev/sdz")
	if err != nil {
		t.Fatalf("Flash: %v", err)
	}
	waitForJobState(t, app, job.ID, JobDone)
	got := app.job(job.ID)
	if got == nil || got.Kind != "flash" || got.Status != JobDone {
		t.Fatalf("unexpected flash job: %#v", got)
	}
	if !strings.Contains(got.Message, "Move the card into titan-15") {
		t.Fatalf("unexpected completion message: %#v", got)
	}
	if got.ProgressPct != 100 {
		t.Fatalf("expected complete flash progress, got %#v", got)
	}
	// The most recent event should be the flash event carrying the
	// verification details from the remote result.
	events := app.recentEvents(5)
	if len(events) == 0 || events[0].Kind != "image.flash" {
		t.Fatalf("expected flash event, got %#v", events)
	}
	if verified, ok := events[0].Details["verified"].(bool); !ok || !verified {
		t.Fatalf("expected verified flash details, got %#v", events[0].Details)
	}
	if kind, ok := events[0].Details["verification_kind"].(string); !ok || kind != "block-device" {
		t.Fatalf("expected block-device verification, got %#v", events[0].Details)
	}
	if bootPartition, ok := events[0].Details["boot_partition"].(string); !ok || bootPartition != "/dev/sdz1" {
		t.Fatalf("expected boot partition details, got %#v", events[0].Details)
	}
}
// TestHandleFlashRouteStartsFlashOnlyJob posts to /api/jobs/flash through the
// HTTP handler with admin auth headers and verifies the request is accepted,
// the response decodes into a Job, and the job runs to completion as a
// "flash" job.
func TestHandleFlashRouteStartsFlashOnlyJob(t *testing.T) {
	kube := remoteWorkflowKubeServer(t, remoteKubeOptions{})
	installKubeFactory(t, kube)
	app := remoteTestApp(t, nil)
	// Seed a published artifact so the flash-only precondition passes.
	app.artifactStore["titan-15"] = ArtifactSummary{Node: "titan-15", Ref: "registry.example/metis/titan-15:latest"}
	handler := app.Handler()
	req := httptest.NewRequest(http.MethodPost, "/api/jobs/flash", strings.NewReader(`{"node":"titan-15","host":"titan-22","device":"/dev/sdz"}`))
	req.Header.Set("Content-Type", "application/json")
	// Auth headers presumably mimic an auth proxy in front of the service.
	req.Header.Set("X-Auth-Request-User", "brad")
	req.Header.Set("X-Auth-Request-Groups", "admin")
	resp := httptest.NewRecorder()
	handler.ServeHTTP(resp, req)
	if resp.Code != http.StatusAccepted {
		t.Fatalf("flash route response: %d %s", resp.Code, resp.Body.String())
	}
	var job Job
	if err := json.Unmarshal(resp.Body.Bytes(), &job); err != nil {
		t.Fatalf("decode flash job: %v", err)
	}
	waitForJobState(t, app, job.ID, JobDone)
	if got := app.job(job.ID); got == nil || got.Kind != "flash" {
		t.Fatalf("expected flash job, got %#v", got)
	}
}
// TestFlashAndReplaceDefaultHostBranches passes an empty host to both Flash
// and Replace and verifies each falls back to the default flash host
// ("titan-22"), using the hosttmp device sink so no real block device is
// touched.
func TestFlashAndReplaceDefaultHostBranches(t *testing.T) {
	kube := remoteWorkflowKubeServer(t, remoteKubeOptions{})
	harbor := fakeHarborServer(t, true)
	installKubeFactory(t, kube)
	app := remoteTestApp(t, harbor)
	app.artifactStore["titan-15"] = ArtifactSummary{Node: "titan-15", Ref: "registry.example/metis/titan-15:latest"}
	flashJob, err := app.Flash("titan-15", "", hostTmpDevicePath)
	if err != nil {
		t.Fatalf("Flash default host: %v", err)
	}
	waitForJobState(t, app, flashJob.ID, JobDone)
	// The completed job must carry the defaulted host and a message that
	// reflects the host-tmp destination.
	if got := app.job(flashJob.ID); got == nil || got.Host != "titan-22" || !strings.Contains(got.Message, "host /tmp") {
		t.Fatalf("unexpected flash default host job: %#v", got)
	}
	replaceJob, err := app.Replace("titan-15", "", hostTmpDevicePath)
	if err != nil {
		t.Fatalf("Replace default host: %v", err)
	}
	waitForJobState(t, app, replaceJob.ID, JobDone)
	if got := app.job(replaceJob.ID); got == nil || got.Host != "titan-22" {
		t.Fatalf("unexpected replace default host job: %#v", got)
	}
}
// TestFlashRejectsDuplicateActiveNodeJob pre-creates an active flash job for
// the node and verifies a second Flash call for the same node is refused.
func TestFlashRejectsDuplicateActiveNodeJob(t *testing.T) {
	fakeKube := remoteWorkflowKubeServer(t, remoteKubeOptions{})
	installKubeFactory(t, fakeKube)
	svc := remoteTestApp(t, nil)
	svc.artifactStore["titan-15"] = ArtifactSummary{Node: "titan-15", Ref: "registry.example/metis/titan-15:latest"}
	svc.newJob("flash", "titan-15", "titan-22", "/dev/sdz")
	_, err := svc.Flash("titan-15", "titan-22", "/dev/sdz")
	if err == nil || !strings.Contains(err.Error(), "already has an active") {
		t.Fatalf("expected flash conflict, got %v", err)
	}
}
// TestFlashArtifactUpdatesProgressWhenVerificationSummaryIsMissing calls
// flashArtifact directly with a remote result that has a size but no
// verification summary, and checks the job still records byte counts and the
// fallback "Verified the latest image write" message.
func TestFlashArtifactUpdatesProgressWhenVerificationSummaryIsMissing(t *testing.T) {
	kube := remoteWorkflowKubeServer(t, remoteKubeOptions{flashMessage: `{"dest_path":"/dev/sdz","size_bytes":4096,"verified":true}`})
	installKubeFactory(t, kube)
	app := remoteTestApp(t, nil)
	job := app.newJob("flash", "titan-15", "titan-22", "/dev/sdz")
	result, err := app.flashArtifact(job.ID, "registry.example/metis/titan-15")
	if err != nil {
		t.Fatalf("flashArtifact: %v", err)
	}
	if result.SizeBytes != 4096 {
		t.Fatalf("unexpected flash result: %#v", result)
	}
	// Written/Total mirror the reported size even without a summary.
	got := app.job(job.ID)
	if got == nil || got.Written != 4096 || got.Total != 4096 || !strings.Contains(got.Message, "Verified the latest image write") {
		t.Fatalf("unexpected flash artifact state: %#v", got)
	}
}
// TestFlashOnlyWorkflowFailsVerification makes the remote flash pod report
// verified:false, then polls for up to five seconds until the job lands in
// JobError with a "verification did not succeed" message.
func TestFlashOnlyWorkflowFailsVerification(t *testing.T) {
	kube := remoteWorkflowKubeServer(t, remoteKubeOptions{flashMessage: `{"dest_path":"/dev/sdz","verified":false}`})
	installKubeFactory(t, kube)
	app := remoteTestApp(t, nil)
	app.artifactStore["titan-15"] = ArtifactSummary{Node: "titan-15", Ref: "registry.example/metis/titan-15:latest"}
	job, err := app.Flash("titan-15", "titan-22", "/dev/sdz")
	if err != nil {
		t.Fatalf("Flash: %v", err)
	}
	// Poll rather than block: the job runs on a background goroutine.
	deadline := time.Now().Add(5 * time.Second)
	for time.Now().Before(deadline) {
		if got := app.job(job.ID); got != nil && got.Status == JobError {
			if !strings.Contains(got.Error, "verification did not succeed") {
				t.Fatalf("unexpected flash verification error: %#v", got)
			}
			return
		}
		time.Sleep(10 * time.Millisecond)
	}
	t.Fatalf("flash job %s never failed verification: %#v", job.ID, app.job(job.ID))
}
func TestBuildFlashAndReplaceRejectInvalidInputs(t *testing.T) {
	// Build, Flash, and Replace must all reject inputs that fail inventory or
	// device validation before any remote work is scheduled.
	server := remoteWorkflowKubeServer(t, remoteKubeOptions{})
	installKubeFactory(t, server)
	app := remoteTestApp(t, nil)
	app.artifactStore["titan-15"] = ArtifactSummary{Node: "titan-15", Ref: "registry.example/metis/titan-15:latest"}
	expectErr := func(label, want string, err error) {
		t.Helper()
		if err == nil || !strings.Contains(err.Error(), want) {
			t.Fatalf("expected %s error, got %v", label, err)
		}
	}
	_, buildErr := app.Build("missing-node")
	expectErr("build inventory", "missing-node", buildErr)
	_, flashErr := app.Flash("titan-15", "titan-22", "/dev/sda")
	expectErr("flash device validation", "not a current flash candidate", flashErr)
	_, replaceErr := app.Replace("titan-15", "titan-22", "/dev/sda")
	expectErr("replace device validation", "not a current flash candidate", replaceErr)
}

View File

@ -74,7 +74,7 @@ func (a *App) RefreshDevices(host string) ([]Device, error) {
}
func (a *App) runBuild(job *Job, flash bool) {
nodeSpec, class, err := a.inventory.FindNode(job.Node)
_, class, err := a.inventory.FindNode(job.Node)
if err != nil {
a.failJob(job.ID, err)
a.metrics.RecordBuild(job.Node, "error")
@ -158,18 +158,42 @@ func (a *App) runBuild(job *Job, flash bool) {
return
}
a.setJob(job.ID, func(j *Job) {
j.Stage = "preflight"
j.StageStartedAt = time.Now().UTC()
j.Message = fmt.Sprintf("Validating %s and preparing the latest Harbor artifact for %s", prettyDeviceTarget(j.Device), j.Host)
j.ProgressPct = 78
j.Artifact = artifactRef + ":latest"
})
if _, err := a.ensureDevice(job.Host, job.Device); err != nil {
result, err := a.runFlashSequence(job, artifactRef)
if err != nil {
a.failJob(job.ID, err)
a.metrics.RecordFlash(job.Node, job.Host, "error")
return
}
a.metrics.RecordFlash(job.Node, job.Host, "ok")
a.appendFlashEvent(job, artifactRef, result)
a.completeFlashJob(job.ID, artifactRef, result)
}
// runFlash drives a flash-only job: it reuses the shared flash sequence
// against the node's latest published artifact repository, recording flash
// metrics and appending the completion event on success.
func (a *App) runFlash(job *Job) {
	ref := a.artifactRepo(job.Node)
	res, err := a.runFlashSequence(job, ref)
	if err != nil {
		a.failJob(job.ID, err)
		a.metrics.RecordFlash(job.Node, job.Host, "error")
		return
	}
	a.metrics.RecordFlash(job.Node, job.Host, "ok")
	a.appendFlashEvent(job, ref, res)
	a.completeFlashJob(job.ID, ref, res)
}
func (a *App) runFlashSequence(job *Job, artifactRef string) (RemoteFlashResult, error) {
a.setJob(job.ID, func(j *Job) {
j.Status = JobRunning
j.Stage = "preflight"
j.StageStartedAt = time.Now().UTC()
j.Message = fmt.Sprintf("Validating %s and preparing the latest Harbor artifact for %s", prettyDeviceTarget(j.Device), j.Host)
j.ProgressPct = 80
j.Artifact = artifactRef + ":latest"
})
if _, err := a.ensureDevice(job.Host, job.Device); err != nil {
return RemoteFlashResult{}, err
}
if !strings.HasPrefix(job.Device, "hosttmp://") {
if err := deleteNodeObject(job.Node); err != nil {
a.appendEvent(Event{
@ -180,66 +204,111 @@ func (a *App) runBuild(job *Job, flash bool) {
})
}
}
if err := a.flashArtifact(job.ID, artifactRef); err != nil {
a.failJob(job.ID, err)
a.metrics.RecordFlash(job.Node, job.Host, "error")
return
return a.flashArtifact(job.ID, artifactRef)
}
func (a *App) appendFlashEvent(job *Job, artifactRef string, result RemoteFlashResult) {
summary := fmt.Sprintf("Flashed and verified %s latest image on %s", job.Node, job.Host)
if strings.HasPrefix(job.Device, "hosttmp://") {
summary = fmt.Sprintf("Verified %s latest image on %s host scratch", job.Node, job.Host)
}
details := map[string]any{
"node": job.Node,
"device": job.Device,
"host": job.Host,
"artifact": artifactRef + ":latest",
"dest_path": result.DestPath,
"verified": result.Verified,
"verification_kind": result.VerificationKind,
}
if strings.TrimSpace(result.BootPartition) != "" {
details["boot_partition"] = result.BootPartition
}
if strings.TrimSpace(result.RootPartition) != "" {
details["root_partition"] = result.RootPartition
}
if len(result.CheckedFiles) > 0 {
details["checked_files"] = append([]string{}, result.CheckedFiles...)
}
a.metrics.RecordFlash(job.Node, job.Host, "ok")
a.appendEvent(Event{
Time: time.Now().UTC(),
Kind: "image.flash",
Summary: fmt.Sprintf("Flashed %s latest image on %s", job.Node, job.Host),
Details: map[string]any{"node": job.Node, "device": job.Device, "host": job.Host, "artifact": artifactRef + ":latest"},
Summary: summary,
Details: details,
})
a.completeJob(job.ID, func(j *Job) {
j.Stage = "complete"
if strings.HasPrefix(j.Device, "hosttmp://") {
j.Message = fmt.Sprintf("Test flash complete on %s host /tmp.", j.Host)
} else {
j.Message = fmt.Sprintf("Flash complete on %s. Move the card into %s and power-cycle it.", j.Host, j.Node)
}
func (a *App) completeFlashJob(jobID, artifactRef string, result RemoteFlashResult) {
a.completeJob(jobID, func(j *Job) {
j.Stage = "complete"
j.ProgressPct = 100
j.Artifact = artifactRef + ":latest"
if strings.HasPrefix(j.Device, "hosttmp://") {
if strings.TrimSpace(result.VerificationSummary) != "" {
j.Message = fmt.Sprintf("%s Test flash complete on %s host /tmp.", strings.TrimRight(result.VerificationSummary, "."), j.Host)
return
}
j.Message = fmt.Sprintf("Verified test flash on %s host /tmp.", j.Host)
return
}
if strings.TrimSpace(result.VerificationSummary) != "" {
j.Message = fmt.Sprintf("%s Move the card into %s and power-cycle it.", strings.TrimRight(result.VerificationSummary, "."), j.Node)
return
}
j.Message = fmt.Sprintf("Flash verified on %s. Move the card into %s and power-cycle it.", j.Host, j.Node)
})
_ = nodeSpec
}
func (a *App) flashArtifact(jobID, artifactRef string) error {
func (a *App) flashArtifact(jobID, artifactRef string) (RemoteFlashResult, error) {
job := a.job(jobID)
if job == nil {
return RemoteFlashResult{}, fmt.Errorf("job %s no longer exists", jobID)
}
nodes := clusterNodes()
nodeMap := map[string]clusterNode{}
for _, node := range nodes {
nodeMap[node.Name] = node
}
target, ok := nodeMap[a.job(jobID).Host]
target, ok := nodeMap[job.Host]
if !ok {
return fmt.Errorf("flash host %s is not a current cluster node", a.job(jobID).Host)
return RemoteFlashResult{}, fmt.Errorf("flash host %s is not a current cluster node", job.Host)
}
image := a.podImageForArch(target.Arch)
if image == "" {
return fmt.Errorf("no runner image configured for arch %s", target.Arch)
return RemoteFlashResult{}, fmt.Errorf("no runner image configured for arch %s", target.Arch)
}
a.setJob(jobID, func(j *Job) {
j.Stage = "flash"
j.Stage = "flash_pull"
j.StageStartedAt = time.Now().UTC()
j.Message = fmt.Sprintf("Pulling %s and writing it on %s", artifactRef+":latest", j.Host)
j.Message = fmt.Sprintf("Pulling %s and preparing it for %s", artifactRef+":latest", prettyDeviceTarget(j.Device))
j.ProgressPct = 84
})
podName := fmt.Sprintf("metis-flash-%d", time.Now().UTC().UnixNano())
logs, err := a.runRemotePod(jobID, podName, a.remoteFlashPodSpec(podName, target.Name, image, a.job(jobID).Node, a.job(jobID).Device, artifactRef))
logs, err := a.runRemotePod(jobID, podName, a.remoteFlashPodSpec(podName, target.Name, image, job.Node, job.Device, artifactRef))
if err != nil {
return err
return RemoteFlashResult{}, err
}
var result RemoteFlashResult
if err := json.Unmarshal([]byte(strings.TrimSpace(logs)), &result); err != nil {
return RemoteFlashResult{}, fmt.Errorf("decode remote flash output: %w: %s", err, strings.TrimSpace(logs))
}
if !result.Verified {
return RemoteFlashResult{}, fmt.Errorf("flash verification did not succeed for %s on %s", prettyDeviceTarget(job.Device), job.Host)
}
var payload map[string]any
if err := json.Unmarshal([]byte(strings.TrimSpace(logs)), &payload); err == nil {
a.setJob(jobID, func(j *Job) {
if dest, ok := payload["dest_path"].(string); ok && dest != "" {
j.Message = fmt.Sprintf("Wrote latest artifact to %s", dest)
if result.SizeBytes > 0 {
j.Written = result.SizeBytes
j.Total = result.SizeBytes
}
if strings.TrimSpace(result.VerificationSummary) != "" {
j.Message = result.VerificationSummary
return
}
if strings.TrimSpace(result.DestPath) != "" {
j.Message = fmt.Sprintf("Verified the latest image write at %s", result.DestPath)
}
})
}
return nil
return result, nil
}
func (a *App) heartbeatRemoteJob(jobID string) {
@ -255,8 +324,8 @@ func (a *App) heartbeatRemoteJob(jobID string) {
stageStart = j.StartedAt
}
elapsed := time.Since(stageStart)
switch j.Stage {
case "build":
switch {
case j.Stage == "build":
progress, message := buildStageHeartbeat(j.Node, j.Builder, elapsed)
if progress > j.ProgressPct {
j.ProgressPct = progress
@ -264,14 +333,14 @@ func (a *App) heartbeatRemoteJob(jobID string) {
if strings.TrimSpace(message) != "" {
j.Message = message
}
case "preflight":
case j.Stage == "preflight":
if j.ProgressPct < 80 {
j.ProgressPct = 80
}
j.Message = fmt.Sprintf("Validating %s on %s and resolving the latest Harbor artifact", prettyDeviceTarget(j.Device), j.Host)
case "flash":
if j.Total > 0 && j.Written > 0 {
actual := 88 + (float64(j.Written)/float64(j.Total))*10
case isFlashStage(j.Stage):
if j.Stage == "flash_write" && j.Total > 0 && j.Written > 0 {
actual := 92 + (float64(j.Written)/float64(j.Total))*6
if actual > 98 {
actual = 98
}
@ -281,7 +350,7 @@ func (a *App) heartbeatRemoteJob(jobID string) {
j.Message = fmt.Sprintf("Writing %s of %s on %s", humanBytes(j.Written), humanBytes(j.Total), j.Host)
return
}
progress, message := flashStageHeartbeat(j.Host, j.Artifact, elapsed)
progress, message := flashStagePhaseHeartbeat(j.Stage, j.Host, j.Artifact, elapsed)
if progress > j.ProgressPct {
j.ProgressPct = progress
}

View File

@ -29,7 +29,7 @@ func TestRemoteWorkflowErrorBranches(t *testing.T) {
}
job = app.newJob("flash", "titan-15", "titan-22", "/dev/sdz")
if err := app.flashArtifact(job.ID, "registry.example/metis/titan-15"); err == nil {
if _, err := app.flashArtifact(job.ID, "registry.example/metis/titan-15"); err == nil {
t.Fatal("expected flashArtifact error")
}
@ -260,7 +260,7 @@ func TestFlashArtifactAndHeartbeatBranches(t *testing.T) {
app := remoteTestApp(t, nil)
job := app.newJob("replace", "titan-15", "missing-host", "/dev/sdz")
if err := app.flashArtifact(job.ID, "registry.example/metis/titan-15"); err == nil || !strings.Contains(err.Error(), "not a current cluster node") {
if _, err := app.flashArtifact(job.ID, "registry.example/metis/titan-15"); err == nil || !strings.Contains(err.Error(), "not a current cluster node") {
t.Fatalf("expected missing host flashArtifact error, got %v", err)
}
@ -283,7 +283,7 @@ func TestFlashArtifactAndHeartbeatBranches(t *testing.T) {
}
app.setJob(job.ID, func(j *Job) {
j.Stage = "flash"
j.Stage = "flash_write"
j.ProgressPct = 80
j.Written = 120
j.Total = 100
@ -341,7 +341,7 @@ func remoteWorkflowKubeServer(t *testing.T, opts remoteKubeOptions) *httptest.Se
buildPhase := defaultString(opts.buildPhase, "Succeeded")
buildMessage := defaultString(opts.buildMessage, `{"local_path":"/workspace/build/titan-15.img.xz","compressed":true,"size_bytes":1234,"build_tag":"build-1"}`)
flashPhase := defaultString(opts.flashPhase, "Succeeded")
flashMessage := defaultString(opts.flashMessage, `{"dest_path":"/var/tmp/metis-flash-test/titan-15.img"}`)
flashMessage := defaultString(opts.flashMessage, `{"dest_path":"/var/tmp/metis-flash-test/titan-15.img","verified":true,"verification_kind":"image-file","verification_summary":"Verified image layout at /var/tmp/metis-flash-test/titan-15.img; boot and writable partitions are present."}`)
nodes := opts.nodes
if nodes == nil {
nodes = []map[string]any{

View File

@ -28,7 +28,21 @@ func buildStageHeartbeat(node, builder string, elapsed time.Duration) (float64,
}
// flashStageHeartbeat reports synthetic progress for the legacy single-phase
// "flash" stage by delegating to the phase-aware heartbeat helper.
func flashStageHeartbeat(host, artifact string, elapsed time.Duration) (float64, string) {
	const legacyStage = "flash"
	return flashStagePhaseHeartbeat(legacyStage, host, artifact, elapsed)
}
func flashStagePhaseHeartbeat(stage, host, artifact string, elapsed time.Duration) (float64, string) {
seconds := elapsed.Seconds()
switch strings.TrimSpace(stage) {
case "flash_pull":
return math.Min(88, ramp(seconds, 0, 12, 84, 88)), fmt.Sprintf("Pulling %s from Harbor on %s", artifact, host)
case "flash_prepare", "flash_unpack":
return math.Min(92, ramp(seconds, 0, 30, 88, 92)), fmt.Sprintf("Preparing the latest %s artifact for writing on %s", artifact, host)
case "flash_flush":
return math.Min(99, ramp(seconds, 0, 30, 98, 99)), fmt.Sprintf("Flushing the finished image write on %s", host)
case "flash_verify":
return math.Min(99.8, ramp(seconds, 0, 30, 99, 99.8)), fmt.Sprintf("Verifying the flashed recovery media on %s", host)
default:
switch {
case seconds < 10:
return ramp(seconds, 0, 10, 84, 88), fmt.Sprintf("Pulling %s from Harbor on %s", artifact, host)
@ -38,6 +52,11 @@ func flashStageHeartbeat(host, artifact string, elapsed time.Duration) (float64,
return math.Min(98, ramp(seconds, 45, 120, 96, 98)), fmt.Sprintf("Flushing buffers and finishing the write on %s", host)
}
}
}
// isFlashStage reports whether the given job stage name belongs to the flash
// phase family (e.g. "flash", "flash_pull", "flash_write"), ignoring
// surrounding whitespace.
func isFlashStage(stage string) bool {
	trimmed := strings.TrimSpace(stage)
	return strings.HasPrefix(trimmed, "flash")
}
func prettyDeviceTarget(path string) string {
switch {

View File

@ -384,3 +384,31 @@ func TestSelectBuilderHostTieBreaksByName(t *testing.T) {
t.Fatalf("expected alphabetical tie-breaker, got %s", node.Name)
}
}
func TestFlashPhaseHeartbeatAndManagedPathHelpers(t *testing.T) {
	// Each flash phase must report at least its floor progress value and a
	// message containing the expected verb at 20s elapsed.
	checks := []struct {
		stage   string
		minimum float64
		phrase  string
	}{
		{stage: "flash_pull", minimum: 84, phrase: "Pulling"},
		{stage: "flash_prepare", minimum: 88, phrase: "Preparing"},
		{stage: "flash_unpack", minimum: 90, phrase: "Preparing"},
		{stage: "flash_flush", minimum: 98, phrase: "Flushing"},
		{stage: "flash_verify", minimum: 99, phrase: "Verifying"},
		{stage: "flash", minimum: 90, phrase: "Writing"},
	}
	for _, check := range checks {
		progress, message := flashStagePhaseHeartbeat(check.stage, "titan-22", "registry.example/metis/titan-15:latest", 20*time.Second)
		if progress < check.minimum || !strings.Contains(message, check.phrase) {
			t.Fatalf("flashStagePhaseHeartbeat(%s) = %v %q", check.stage, progress, message)
		}
	}
	// managedPathsContain should match managed children and reject unrelated roots.
	if !managedPathsContain("/var/log/pods_/var/tmp", "/var/tmp") {
		t.Fatal("expected managedPathsContain to match child paths")
	}
	if managedPathsContain("/var/log/pods", "/home/atlas") {
		t.Fatal("managedPathsContain should reject unrelated paths")
	}
}

View File

@ -19,6 +19,24 @@ type RemoteProgressUpdate struct {
TotalBytes int64 `json:"total_bytes,omitempty"`
}
// RemoteFlashResult captures the verified outcome of a remote flash worker run.
// It is decoded from the JSON line the flash worker pod prints on completion;
// all fields are optional except that consumers treat Verified=false as failure.
type RemoteFlashResult struct {
	Node                string   `json:"node,omitempty"`                 // target node whose image was flashed
	Device              string   `json:"device,omitempty"`               // device path or test sink that was written
	DestPath            string   `json:"dest_path,omitempty"`            // final path the image bytes were written to
	SizeBytes           int64    `json:"size_bytes,omitempty"`           // total bytes written, when the worker reports it
	Verified            bool     `json:"verified,omitempty"`             // true only when post-write verification succeeded
	VerificationKind    string   `json:"verification_kind,omitempty"`    // verification strategy label (e.g. "image-file")
	VerificationSummary string   `json:"verification_summary,omitempty"` // human-readable verification outcome for UI messages
	BootPartition       string   `json:"boot_partition,omitempty"`       // detected boot partition device/path, if any
	RootPartition       string   `json:"root_partition,omitempty"`       // detected root partition device/path, if any
	BootLabel           string   `json:"boot_label,omitempty"`           // filesystem label observed on the boot partition
	RootLabel           string   `json:"root_label,omitempty"`           // filesystem label observed on the root partition
	BootFSType          string   `json:"boot_fstype,omitempty"`          // filesystem type observed on the boot partition
	RootFSType          string   `json:"root_fstype,omitempty"`          // filesystem type observed on the root partition
	CheckedFiles        []string `json:"checked_files,omitempty"`        // files the verifier inspected on the flashed media
}
// ProgressLogLine formats a progress update for remote worker stdout.
func ProgressLogLine(update RemoteProgressUpdate) string {
data, err := json.Marshal(update)

View File

@ -30,6 +30,7 @@ func (a *App) Handler() http.Handler {
mux.HandleFunc("/api/state", a.withUIAuth(a.handleState))
mux.HandleFunc("/api/devices", a.withUIAuth(a.handleDevices))
mux.HandleFunc("/api/jobs/build", a.withUIAuth(a.handleBuild))
mux.HandleFunc("/api/jobs/flash", a.withUIAuth(a.handleFlash))
mux.HandleFunc("/api/jobs/replace", a.withUIAuth(a.handleReplace))
mux.HandleFunc("/api/sentinel/watch", a.withUIAuth(a.handleWatch))
mux.HandleFunc("/", a.withUIAuth(a.handleIndex))
@ -106,6 +107,23 @@ func (a *App) handleBuild(w http.ResponseWriter, r *http.Request) {
writeJSON(w, http.StatusAccepted, job)
}
// handleFlash accepts POST requests that queue a flash-only job writing the
// node's latest published image onto the chosen device on the flash host.
// Responds 202 with the queued job on success.
func (a *App) handleFlash(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
		return
	}
	params := requestValues(r)
	job, err := a.Flash(params["node"], params["host"], params["device"])
	if err != nil {
		http.Error(w, err.Error(), statusForJobError(err))
		return
	}
	writeJSON(w, http.StatusAccepted, job)
}
func (a *App) handleReplace(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodPost {
http.Error(w, "method not allowed", http.StatusMethodNotAllowed)

View File

@ -174,7 +174,7 @@
}
.actions{
display:grid;
grid-template-columns:repeat(3,minmax(0,1fr));
grid-template-columns:repeat(2,minmax(0,1fr));
gap:.7rem;
margin-top:.9rem;
}
@ -277,7 +277,7 @@
<div class="stack">
<article class="card">
<h2>Replacement Run</h2>
<p class="hint">This UI is meant for the one-shot recovery path: build the node image, verify the card on the flash host, then write it and hand off only the physical swap.</p>
<p class="hint">Use the mode that matches the recovery step you are in: build a fresh image, flash the latest published image, or run the full build-and-flash path in one shot.</p>
<div class="form-grid">
<label>Target node
<select id="node-select"></select>
@ -296,6 +296,7 @@
<div class="actions">
<button class="secondary" id="refresh-devices">Refresh media</button>
<button class="secondary" id="build-only">Build image only</button>
<button class="secondary" id="flash-only">Flash latest image</button>
<button id="replace-run">Build and flash</button>
</div>
</article>
@ -395,6 +396,22 @@
return seconds + 's';
}
// Turn a machine label like "flash_pull" into display text like "Flash Pull":
// underscores become spaces and each word is capitalized. Nullish input
// yields an empty string.
function prettyLabel(value){
  const text = String(value || '');
  return text.split('_').join(' ').replace(/\b\w/g, function(ch){ return ch.toUpperCase(); });
}
// Map a job kind to its display name; unknown kinds fall back to the generic
// prettyLabel formatting.
function prettyJobKind(kind){
  const labels = {build: 'Build', flash: 'Flash Latest', replace: 'Build + Flash', idle: 'Idle'};
  if (Object.prototype.hasOwnProperty.call(labels, kind)) {
    return labels[kind];
  }
  return prettyLabel(kind);
}
function banner(kind, title, text){
bannerEl.className = 'banner ' + kind;
bannerTitleEl.textContent = title;
@ -445,10 +462,10 @@
const wrap = document.createElement('div');
wrap.className = 'item';
const statusClass = job.status === 'error' ? 'error' : (job.status === 'done' ? 'done' : (job.status === 'running' ? 'running' : ''));
const title = job.kind.toUpperCase() + (job.node ? ' · ' + job.node : '');
const title = prettyJobKind(job.kind) + (job.node ? ' · ' + job.node : '');
const started = fmtTime(job.started_at) + (job.device ? ' · ' + job.device : '') + (job.host ? ' · ' + job.host : '');
const timingBits = [];
if(job.stage){ timingBits.push('stage: ' + job.stage); }
if(job.stage){ timingBits.push('stage: ' + prettyLabel(job.stage)); }
const stageDuration = fmtDuration(job.stage_started_at || job.started_at, job.finished_at);
if(stageDuration){
timingBits.push((job.status === 'running' ? 'stage elapsed ' : 'stage duration ') + stageDuration);
@ -538,7 +555,7 @@
}
const selectedHost = hostSelect.value || state.default_flash_host;
hostNoteEl.textContent = 'Metis will inspect media and run the flash writer on ' + selectedHost + ' through a short-lived in-cluster worker. ' + state.default_flash_host + ' remains the default flash host.';
hostNoteEl.textContent = 'Metis will inspect media and run the flash writer/verifier on ' + selectedHost + ' through a short-lived in-cluster worker. ' + state.default_flash_host + ' remains the default flash host.';
if(state.device_error){
deviceNoteEl.textContent = state.device_error;
@ -549,11 +566,13 @@
}
const artifact = (state.artifacts || {})[nodeSelect.value];
artifactNoteEl.textContent = artifact && artifact.ref
? 'Latest published image: ' + artifact.ref + ' (Metis keeps the newest 3 builds in Harbor).'
: 'Successful build-only runs publish <node>:latest into Harbor and keep the newest 3 builds per node.';
const hasArtifact = !!(artifact && artifact.ref);
artifactNoteEl.textContent = hasArtifact
? 'Latest published image: ' + artifact.ref + (artifact.build_tag ? ' · build ' + artifact.build_tag : '') + '. Flash latest writes this image directly; build and flash rebuilds first.'
: 'Build image only publishes <node>:latest into Harbor. Flash latest stays disabled until Metis has a recorded latest image.';
document.getElementById('build-only').disabled = busy || !nodeSelect.value;
document.getElementById('flash-only').disabled = busy || !nodeSelect.value || !deviceSelect.value || !!state.device_error || !hasArtifact;
document.getElementById('refresh-devices').disabled = busy;
document.getElementById('replace-run').disabled = busy || !nodeSelect.value || !deviceSelect.value || !!state.device_error;
document.getElementById('sentinel-watch').disabled = busy;
@ -654,6 +673,24 @@
});
});
document.getElementById('flash-only').addEventListener('click', async ()=>{
if(!requireValue(nodeSelect.value, 'Choose the target node whose latest published image should be flashed.')){
return;
}
if(!requireValue(deviceSelect.value, 'Choose removable media before starting a flash-only run.')){
return;
}
if(state.device_error){
banner('error', 'Flash host unavailable', state.device_error);
return;
}
await runAction('Starting flash latest', 'Queueing a flash-only run from the latest published Harbor image.', async ()=>{
await post('/api/jobs/flash', {node: nodeSelect.value, host: hostSelect.value, device: deviceSelect.value});
await refreshState({silent:true});
banner('success', 'Flash-only workflow queued', 'Metis is flashing the latest published image for ' + nodeSelect.value + ' onto ' + deviceSelect.value + ' and will verify it before finishing.');
});
});
document.getElementById('replace-run').addEventListener('click', async ()=>{
if(!requireValue(nodeSelect.value, 'Choose the target node whose SD card image should be built and flashed.')){
return;
@ -668,7 +705,7 @@
await runAction('Starting build and flash', 'Queueing the full replacement workflow now.', async ()=>{
await post('/api/jobs/replace', {node: nodeSelect.value, host: hostSelect.value, device: deviceSelect.value});
await refreshState({silent:true});
banner('success', 'Replacement workflow queued', 'Metis is building the image for ' + nodeSelect.value + ' and will flash ' + deviceSelect.value + '.');
banner('success', 'Replacement workflow queued', 'Metis is rebuilding the image for ' + nodeSelect.value + ', flashing ' + deviceSelect.value + ', and verifying the media before finishing.');
});
});

View File

@ -185,7 +185,7 @@ func fakeKubeServer(t *testing.T) *httptest.Server {
case strings.Contains(podName, "build"):
message = `{"local_path":"/workspace/build/titan-15.img.xz","compressed":true,"size_bytes":1234,"build_tag":"build-1"}`
case strings.Contains(podName, "flash"):
message = `{"dest_path":"/var/tmp/metis-flash-test/titan-15.img"}`
message = `{"dest_path":"/var/tmp/metis-flash-test/titan-15.img","verified":true,"verification_kind":"image-file","verification_summary":"Verified image layout at /var/tmp/metis-flash-test/titan-15.img; boot and writable partitions are present."}`
}
_ = json.NewEncoder(w).Encode(map[string]any{
"metadata": map[string]any{"name": podName},

View File

@ -57,4 +57,4 @@ if [ "${test_rc}" -ne 0 ]; then
fi
cd "${repo_root}/testing"
METIS_USE_EXISTING_COVERAGE=1 go test -v ./...
METIS_USE_EXISTING_COVERAGE=1 go test -count=1 -v ./...

View File

@ -8,6 +8,7 @@
"metis/cmd/metis/inject_cmd.go": 95,
"metis/cmd/metis/main.go": 95,
"metis/cmd/metis/remote_cmd.go": 95,
"metis/cmd/metis/remote_flash.go": 95,
"metis/cmd/metis/serve_cmd.go": 95,
"metis/pkg/config/config.go": 95,
"metis/pkg/facts/aggregate.go": 95,