From a148e77335058bf61e08315ea0f63b81105171f1 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 31 Mar 2026 20:42:35 -0300 Subject: [PATCH] service: run remote build and flash workflows --- Dockerfile | 6 +- Jenkinsfile | 2 +- cmd/metis/main.go | 8 +- cmd/metis/remote_cmd.go | 349 ++++++++++++++++++++++++++++++ go.mod | 29 ++- go.sum | 58 +++++ pkg/service/app.go | 399 +++------------------------------- pkg/service/artifacts.go | 52 +++++ pkg/service/cluster.go | 278 ++++++++++++++++++++++++ pkg/service/harbor.go | 131 +++++++++++ pkg/service/remote.go | 455 +++++++++++++++++++++++++++++++++++++++ pkg/service/server.go | 13 +- pkg/service/settings.go | 68 +++--- 13 files changed, 1443 insertions(+), 405 deletions(-) create mode 100644 cmd/metis/remote_cmd.go create mode 100644 pkg/service/artifacts.go create mode 100644 pkg/service/cluster.go create mode 100644 pkg/service/harbor.go create mode 100644 pkg/service/remote.go diff --git a/Dockerfile b/Dockerfile index c01796d..566308b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ ARG TARGETPLATFORM ARG TARGETOS ARG TARGETARCH -FROM --platform=$BUILDPLATFORM golang:1.22-bookworm AS build +FROM --platform=$BUILDPLATFORM golang:1.23-bookworm AS build ARG TARGETOS ARG TARGETARCH @@ -17,7 +17,8 @@ COPY . . RUN --mount=type=cache,target=/root/.cache/go-build \ --mount=type=cache,target=/go/pkg/mod \ CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -o /out/metis ./cmd/metis && \ - CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -o /out/metis-sentinel ./cmd/metis-sentinel + CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -o /out/metis-sentinel ./cmd/metis-sentinel && \ + CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -o /out/oras oras.land/oras/cmd/oras FROM debian:bookworm-slim AS runtime-base @@ -28,6 +29,7 @@ RUN apt-get update \ WORKDIR /app COPY --from=build /out/metis /usr/local/bin/metis COPY --from=build /out/metis-sentinel /usr/local/bin/metis-sentinel +COPY --from=build /out/oras /usr/local/bin/oras COPY inventory.example.yaml /app/inventory.example.yaml COPY inventory.titan-rpi4.yaml /app/inventory.titan-rpi4.yaml COPY overlays /app/overlays diff --git a/Jenkinsfile b/Jenkinsfile index 01eb234..dade2a9 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -49,7 +49,7 @@ spec: - name: harbor-config mountPath: /docker-config - name: tester - image: golang:1.22-bookworm + image: golang:1.23-bookworm command: ["cat"] tty: true volumeMounts: diff --git a/cmd/metis/main.go b/cmd/metis/main.go index 2a6ca71..c65a9c3 100644 --- a/cmd/metis/main.go +++ b/cmd/metis/main.go @@ -32,6 +32,12 @@ func main() { configCmd(os.Args[2:]) case "facts": factsCmd(os.Args[2:]) + case "remote-devices": + remoteDevicesCmd(os.Args[2:]) + case "remote-build": + remoteBuildCmd(os.Args[2:]) + case "remote-flash": + remoteFlashCmd(os.Args[2:]) default: usage() os.Exit(1) @@ -39,7 +45,7 @@ func main() { } func usage() { - fmt.Fprintf(os.Stderr, "Usage: metis [options]\n") + fmt.Fprintf(os.Stderr, "Usage: metis [options]\n") } func loadInventory(path string) *inventory.Inventory { diff --git a/cmd/metis/remote_cmd.go b/cmd/metis/remote_cmd.go new file mode 100644 index 0000000..f02a01e --- /dev/null +++ b/cmd/metis/remote_cmd.go @@ -0,0 +1,349 @@ +package main + +import ( + "context" + "encoding/json" + "flag" + "fmt" + "log" + "os" + "os/exec" + "path/filepath" + "sort" + "strconv" + "strings" + "time" + + "metis/pkg/plan" + "metis/pkg/service" + "metis/pkg/writer" +) + +func remoteDevicesCmd(args []string) { + fs := flag.NewFlagSet("remote-devices", flag.ExitOnError) + maxBytes := fs.Int64("max-device-bytes", 300000000000, "max real removable device size") + hostTmpDir := fs.String("host-tmp-dir", "/tmp/metis-flash-test", "host tmp dir for test writes") + fs.Parse(args) + + devices, err := localFlashDevices(*maxBytes, *hostTmpDir) + if err != nil { + log.Fatalf("remote devices: %v", err) + } + sort.Slice(devices, func(i, j int) bool { + left := localDeviceScore(devices[i]) + right := localDeviceScore(devices[j]) + if left != right { + return left > right + } + if devices[i].SizeBytes != devices[j].SizeBytes { + return devices[i].SizeBytes < devices[j].SizeBytes + } + return devices[i].Path < devices[j].Path + }) + + enc := json.NewEncoder(os.Stdout) + enc.SetIndent("", " ") + _ = enc.Encode(map[string]any{"devices": devices}) +} + +func remoteBuildCmd(args []string) { + fs := flag.NewFlagSet("remote-build", flag.ExitOnError) + invPath := fs.String("inventory", "inventory.yaml", "inventory file") + node := fs.String("node", "", "target node") + cacheDir := fs.String("cache", filepath.Join(os.TempDir(), "metis-cache"), "image cache dir") + workDir := fs.String("work-dir", filepath.Join(os.TempDir(), "metis-work"), "working directory") + artifactRef := fs.String("artifact-ref", "", "harbor artifact ref without tag") + buildTag := fs.String("build-tag", "", "artifact build tag") + harborRegistry := fs.String("harbor-registry", getenvOr("METIS_HARBOR_REGISTRY", "registry.bstein.dev"), "harbor registry host") + harborUsername := fs.String("harbor-username", getenvOr("METIS_HARBOR_USERNAME", ""), "harbor username") + harborPassword := fs.String("harbor-password", getenvOr("METIS_HARBOR_PASSWORD", ""), "harbor password") + fs.Parse(args) + if *node == "" || *artifactRef == "" || *buildTag == "" { + log.Fatalf("--node, --artifact-ref, and --build-tag are required") + } + + if err := os.MkdirAll(*workDir, 0o755); err != nil { + log.Fatalf("mkdir workdir: %v", err) + } + output := filepath.Join(*workDir, fmt.Sprintf("%s.img", *node)) + inv := loadInventory(*invPath) + if err := plan.BuildImageFile(context.Background(), inv, *node, *cacheDir, output); err != nil { + log.Fatalf("build image: %v", err) + } + if err := exec.Command("xz", "-T0", "-z", "-f", output).Run(); err != nil { + log.Fatalf("xz compress: %v", err) + } + compressedPath := output + ".xz" + info, err := os.Stat(compressedPath) + if err != nil { + log.Fatalf("stat compressed image: %v", err) + } + + metadataPath := filepath.Join(*workDir, "metadata.json") + builtAt := time.Now().UTC() + meta := map[string]any{ + "node": *node, + "artifact_ref": *artifactRef, + "build_tag": *buildTag, + "built_at": builtAt.Format(time.RFC3339), + "size_bytes": info.Size(), + "compressed": true, + } + metaBytes, err := json.MarshalIndent(meta, "", " ") + if err != nil { + log.Fatalf("encode metadata: %v", err) + } + if err := os.WriteFile(metadataPath, metaBytes, 0o644); err != nil { + log.Fatalf("write metadata: %v", err) + } + if err := orasLogin(*harborRegistry, *harborUsername, *harborPassword); err != nil { + log.Fatalf("oras login: %v", err) + } + taggedRef := fmt.Sprintf("%s:%s", *artifactRef, *buildTag) + if err := orasPush(taggedRef, compressedPath, metadataPath); err != nil { + log.Fatalf("oras push: %v", err) + } + if err := orasTag(taggedRef, "latest"); err != nil { + log.Fatalf("oras tag latest: %v", err) + } + + summary := service.ArtifactSummary{ + Node: *node, + Ref: fmt.Sprintf("%s:latest", *artifactRef), + BuildTag: *buildTag, + LocalPath: compressedPath, + Compressed: true, + UpdatedAt: builtAt, + SizeBytes: info.Size(), + } + enc := json.NewEncoder(os.Stdout) + enc.SetIndent("", " ") + _ = enc.Encode(summary) +} + +func remoteFlashCmd(args []string) { + fs := flag.NewFlagSet("remote-flash", flag.ExitOnError) + node := fs.String("node", "", "target node") + device := fs.String("device", "", "target device path or test sink") + artifactRef := fs.String("artifact-ref", "", "harbor artifact ref without tag") + workDir := fs.String("work-dir", filepath.Join(os.TempDir(), "metis-flash"), "working directory") + harborRegistry := fs.String("harbor-registry", getenvOr("METIS_HARBOR_REGISTRY", "registry.bstein.dev"), "harbor registry host") + harborUsername := fs.String("harbor-username", getenvOr("METIS_HARBOR_USERNAME", ""), "harbor username") + harborPassword := fs.String("harbor-password", getenvOr("METIS_HARBOR_PASSWORD", ""), "harbor password") + hostTmpDir := fs.String("host-tmp-dir", "/host-tmp/metis-flash-test", "mounted host tmp dir for test writes") + fs.Parse(args) + if *node == "" || *device == "" || *artifactRef == "" { + log.Fatalf("--node, --device, and --artifact-ref are required") + } + + if err := os.MkdirAll(*workDir, 0o755); err != nil { + log.Fatalf("mkdir workdir: %v", err) + } + if err := orasLogin(*harborRegistry, *harborUsername, *harborPassword); err != nil { + log.Fatalf("oras login: %v", err) + } + if err := orasPull(fmt.Sprintf("%s:latest", *artifactRef), *workDir); err != nil { + log.Fatalf("oras pull: %v", err) + } + imagePath, compressed, err := resolvePulledArtifact(*workDir) + if err != nil { + log.Fatalf("resolve artifact: %v", err) + } + rawImage := imagePath + if compressed { + rawImage = filepath.Join(*workDir, fmt.Sprintf("%s.img", *node)) + cmd := exec.Command("sh", "-lc", fmt.Sprintf("xz -dc '%s' > '%s'", imagePath, rawImage)) + if out, err := cmd.CombinedOutput(); err != nil { + log.Fatalf("xz stream decompress: %v: %s", err, strings.TrimSpace(string(out))) + } + } + + destPath := *device + if strings.HasPrefix(destPath, "hosttmp://") { + if err := os.MkdirAll(*hostTmpDir, 0o755); err != nil { + log.Fatalf("mkdir host tmp dir: %v", err) + } + destPath = filepath.Join(*hostTmpDir, fmt.Sprintf("%s.img", *node)) + } + if err := writer.WriteImage(context.Background(), rawImage, destPath); err != nil { + log.Fatalf("write image: %v", err) + } + _ = exec.Command("sync").Run() + if strings.HasPrefix(destPath, "/dev/") { + _ = exec.Command("blockdev", "--flushbufs", destPath).Run() + } + + info, err := os.Stat(destPath) + if err != nil { + log.Fatalf("stat destination: %v", err) + } + enc := json.NewEncoder(os.Stdout) + enc.SetIndent("", " ") + _ = enc.Encode(map[string]any{ + "node": *node, + "device": *device, + "dest_path": destPath, + "size_bytes": info.Size(), + }) +} + +func localFlashDevices(maxBytes int64, hostTmpDir string) ([]service.Device, error) { + cmd := exec.Command("lsblk", "-J", "-b", "-o", "NAME,PATH,RM,HOTPLUG,SIZE,MODEL,TRAN,TYPE") + out, err := cmd.Output() + if err != nil { + return nil, err + } + var payload struct { + Blockdevices []struct { + Name string `json:"name"` + Path string `json:"path"` + RM bool `json:"rm"` + Hotplug bool `json:"hotplug"` + Size any `json:"size"` + Model string `json:"model"` + Tran string `json:"tran"` + Type string `json:"type"` + } `json:"blockdevices"` + } + if err := json.Unmarshal(out, &payload); err != nil { + return nil, err + } + devices := make([]service.Device, 0, len(payload.Blockdevices)+1) + for _, dev := range payload.Blockdevices { + if dev.Type != "disk" { + continue + } + size := int64(0) + switch value := dev.Size.(type) { + case string: + size, _ = strconv.ParseInt(value, 10, 64) + case float64: + size = int64(value) + } + if size <= 0 || size > maxBytes { + continue + } + if dev.Tran != "usb" && !dev.RM && !dev.Hotplug { + continue + } + devices = append(devices, service.Device{ + Name: dev.Name, + Path: dev.Path, + Model: strings.TrimSpace(dev.Model), + Transport: dev.Tran, + Type: dev.Type, + Removable: dev.RM, + Hotplug: dev.Hotplug, + SizeBytes: size, + }) + } + devices = append(devices, service.Device{ + Name: "host-tmp", + Path: "hosttmp:///tmp", + Model: "Host /tmp", + Transport: "test", + Type: "file", + Note: fmt.Sprintf("Test-only host write target under %s", hostTmpDir), + Removable: false, + Hotplug: false, + SizeBytes: 1, + }) + return devices, nil +} + +func localDeviceScore(device service.Device) int { + score := 0 + if strings.HasPrefix(device.Path, "hosttmp://") { + return -100 + } + if device.Transport == "usb" { + score += 50 + } + if device.Removable { + score += 30 + } + if device.Hotplug { + score += 20 + } + if strings.Contains(strings.ToLower(device.Model), "sd") { + score += 10 + } + return score +} + +func orasLogin(registry, username, password string) error { + if strings.TrimSpace(username) == "" || strings.TrimSpace(password) == "" { + return fmt.Errorf("harbor credentials missing") + } + cmd := exec.Command("oras", "login", registry, "-u", username, "-p", password) + if out, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("%w: %s", err, strings.TrimSpace(string(out))) + } + return nil +} + +func orasPush(ref, imagePath, metadataPath string) error { + cmd := exec.Command("oras", "push", ref, + fmt.Sprintf("%s:application/x-raw-disk-image", imagePath), + fmt.Sprintf("%s:application/json", metadataPath), + ) + if out, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("%w: %s", err, strings.TrimSpace(string(out))) + } + return nil +} + +func orasTag(ref string, tags ...string) error { + args := append([]string{"tag", ref}, tags...) + cmd := exec.Command("oras", args...) + if out, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("%w: %s", err, strings.TrimSpace(string(out))) + } + return nil +} + +func orasPull(ref, outDir string) error { + cmd := exec.Command("oras", "pull", ref, "-o", outDir) + if out, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("%w: %s", err, strings.TrimSpace(string(out))) + } + return nil +} + +func resolvePulledArtifact(dir string) (string, bool, error) { + var rawPath string + var compressedPath string + err := filepath.WalkDir(dir, func(path string, d os.DirEntry, walkErr error) error { + if walkErr != nil { + return walkErr + } + if d.IsDir() { + return nil + } + switch { + case strings.HasSuffix(path, ".img.xz"): + compressedPath = path + case strings.HasSuffix(path, ".img"): + rawPath = path + } + return nil + }) + if err != nil { + return "", false, err + } + if compressedPath != "" { + return compressedPath, true, nil + } + if rawPath != "" { + return rawPath, false, nil + } + return "", false, fmt.Errorf("no .img or .img.xz artifact found in %s", dir) +} + +func getenvOr(key, fallback string) string { + value := strings.TrimSpace(os.Getenv(key)) + if value == "" { + return fallback + } + return value +} diff --git a/go.mod b/go.mod index 12a75ea..b3985f7 100644 --- a/go.mod +++ b/go.mod @@ -1,5 +1,32 @@ module metis -go 1.22.0 +go 1.23.1 require gopkg.in/yaml.v3 v3.0.1 + +require ( + dario.cat/mergo v1.0.1 // indirect + github.com/Masterminds/goutils v1.1.1 // indirect + github.com/Masterminds/semver/v3 v3.3.0 // indirect + github.com/Masterminds/sprig/v3 v3.3.0 // indirect + github.com/containerd/console v1.0.4 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/huandu/xstrings v1.5.0 // indirect + github.com/inconshreveable/mousetrap v1.1.0 // indirect + github.com/mitchellh/copystructure v1.2.0 // indirect + github.com/mitchellh/reflectwalk v1.0.2 // indirect + github.com/morikuni/aec v1.0.0 // indirect + github.com/opencontainers/go-digest v1.0.0 // indirect + github.com/opencontainers/image-spec v1.1.0 // indirect + github.com/shopspring/decimal v1.4.0 // indirect + github.com/sirupsen/logrus v1.9.3 // indirect + github.com/spf13/cast v1.7.0 // indirect + github.com/spf13/cobra v1.8.1 // indirect + github.com/spf13/pflag v1.0.5 // indirect + golang.org/x/crypto v0.31.0 // indirect + golang.org/x/sync v0.10.0 // indirect + golang.org/x/sys v0.28.0 // indirect + golang.org/x/term v0.27.0 // indirect + oras.land/oras v1.2.2 // indirect + oras.land/oras-go/v2 v2.5.0 // indirect +) diff --git a/go.sum b/go.sum index a62c313..e31dcf6 100644 --- a/go.sum +++ b/go.sum @@ -1,4 +1,62 @@ +dario.cat/mergo v1.0.1 h1:Ra4+bf83h2ztPIQYNP99R6m+Y7KfnARDfID+a+vLl4s= +dario.cat/mergo v1.0.1/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk= +github.com/Masterminds/goutils v1.1.1 h1:5nUrii3FMTL5diU80unEVvNevw1nH4+ZV4DSLVJLSYI= +github.com/Masterminds/goutils v1.1.1/go.mod h1:8cTjp+g8YejhMuvIA5y2vz3BpJxksy863GQaJW2MFNU= +github.com/Masterminds/semver/v3 v3.3.0 h1:B8LGeaivUe71a5qox1ICM/JLl0NqZSW5CHyL+hmvYS0= +github.com/Masterminds/semver/v3 v3.3.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= +github.com/Masterminds/sprig/v3 v3.3.0 h1:mQh0Yrg1XPo6vjYXgtf5OtijNAKJRNcTdOOGZe3tPhs= +github.com/Masterminds/sprig/v3 v3.3.0/go.mod h1:Zy1iXRYNqNLUolqCpL4uhk6SHUMAOSCzdgBfDb35Lz0= +github.com/containerd/console v1.0.4 h1:F2g4+oChYvBTsASRTz8NP6iIAi97J3TtSAsLbIFn4ro= +github.com/containerd/console v1.0.4/go.mod h1:YynlIjWYF8myEu6sdkwKIvGQq+cOckRm6So2avqoYAk= +github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/huandu/xstrings v1.5.0 h1:2ag3IFq9ZDANvthTwTiqSSZLjDc+BedvHPAp5tJy2TI= +github.com/huandu/xstrings v1.5.0/go.mod h1:y5/lhBue+AyNmUVz9RLU9xbLR0o4KIIExikq4ovT0aE= +github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= +github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= +github.com/mitchellh/copystructure v1.2.0 h1:vpKXTN4ewci03Vljg/q9QvCGUDttBOGBIa15WveJJGw= +github.com/mitchellh/copystructure v1.2.0/go.mod h1:qLl+cE2AmVv+CoeAwDPye/v+N2HKCj9FbZEVFJRxO9s= +github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ= +github.com/mitchellh/reflectwalk v1.0.2/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw= +github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= +github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc= +github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= +github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= +github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug= +github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/shopspring/decimal v1.4.0 h1:bxl37RwXBklmTi0C79JfXCEBD1cqqHt0bbgBAGFp81k= +github.com/shopspring/decimal v1.4.0/go.mod h1:gawqmDU56v4yIKSwfBSFip1HdCCXN8/+DMd9qYNcwME= +github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= +github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/spf13/cast v1.7.0 h1:ntdiHjuueXFgm5nzDRdOS4yfT43P5Fnud6DH50rz/7w= +github.com/spf13/cast v1.7.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= +github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= +github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= +github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U= +golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= +golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= +golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA= +golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.27.0 h1:WP60Sv1nlK1T6SupCHbXzSaN0b9wUmsPoRS9b61A23Q= +golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +oras.land/oras v1.2.2 h1:TyeLkSI1D4RltfA0alTkBNa5ttMwmpUJMGc97QIdmuw= +oras.land/oras v1.2.2/go.mod h1:qtLROGNZulPzlI/pAr9s6j41IeVYQF1VAm+KRU+vkB4= +oras.land/oras-go/v2 v2.5.0 h1:o8Me9kLY74Vp5uw07QXPiitjsw7qNXi8Twd+19Zf02c= +oras.land/oras-go/v2 v2.5.0/go.mod h1:z4eisnLP530vwIOUOJeBIj0aGI0L1C3d53atvCBqZHg= diff --git a/pkg/service/app.go b/pkg/service/app.go index cec0b8d..d252f8f 100644 --- a/pkg/service/app.go +++ b/pkg/service/app.go @@ -2,29 +2,20 @@ package service import ( "bufio" - "context" - "crypto/tls" - "crypto/x509" "encoding/json" "errors" "fmt" - "io" - "net/http" "os" "os/exec" "path/filepath" "sort" - "strconv" "strings" "sync" "time" "metis/pkg/facts" - "metis/pkg/image" "metis/pkg/inventory" - "metis/pkg/plan" "metis/pkg/sentinel" - "metis/pkg/writer" ) type JobStatus string @@ -43,6 +34,7 @@ type Device struct { Model string `json:"model,omitempty"` Transport string `json:"transport,omitempty"` Type string `json:"type,omitempty"` + Note string `json:"note,omitempty"` Removable bool `json:"removable"` Hotplug bool `json:"hotplug"` SizeBytes int64 `json:"size_bytes"` @@ -54,6 +46,7 @@ type Job struct { Kind string `json:"kind"` Node string `json:"node,omitempty"` Host string `json:"host,omitempty"` + Builder string `json:"builder,omitempty"` Device string `json:"device,omitempty"` Status JobStatus `json:"status"` Stage string `json:"stage,omitempty"` @@ -101,9 +94,15 @@ type PageState struct { // ArtifactSummary describes the latest built image for a node. type ArtifactSummary struct { - Path string `json:"path"` - UpdatedAt time.Time `json:"updated_at"` - SizeBytes int64 `json:"size_bytes"` + Node string `json:"node,omitempty"` + Ref string `json:"ref,omitempty"` + BuildTag string `json:"build_tag,omitempty"` + LocalPath string `json:"local_path,omitempty"` + HostPath string `json:"host_path,omitempty"` + BuilderHost string `json:"builder_host,omitempty"` + Compressed bool `json:"compressed,omitempty"` + UpdatedAt time.Time `json:"updated_at"` + SizeBytes int64 `json:"size_bytes"` } // App coordinates builds, flashes, sentinel snapshots, and the web UI state. @@ -112,10 +111,11 @@ type App struct { inventory *inventory.Inventory metrics *Metrics - mu sync.RWMutex - jobs map[string]*Job - snapshots map[string]SnapshotRecord - targets map[string]facts.Targets + mu sync.RWMutex + jobs map[string]*Job + snapshots map[string]SnapshotRecord + targets map[string]facts.Targets + artifactStore map[string]ArtifactSummary } // NewApp creates a Metis service app instance. @@ -134,15 +134,17 @@ func NewApp(settings Settings) (*App, error) { return nil, err } app := &App{ - settings: settings, - inventory: inv, - metrics: NewMetrics(), - jobs: map[string]*Job{}, - snapshots: map[string]SnapshotRecord{}, - targets: map[string]facts.Targets{}, + settings: settings, + inventory: inv, + metrics: NewMetrics(), + jobs: map[string]*Job{}, + snapshots: map[string]SnapshotRecord{}, + targets: map[string]facts.Targets{}, + artifactStore: map[string]ArtifactSummary{}, } _ = app.loadSnapshots() _ = app.loadTargets() + _ = app.loadArtifacts() return app, nil } @@ -302,231 +304,6 @@ func (a *App) WatchSentinel() (*Event, error) { return event, nil } -// ListDevices returns locally attached removable media that are safe candidates for flashing. -func (a *App) ListDevices(host string) ([]Device, error) { - if host == "" { - host = a.settings.DefaultFlashHost - } - if !a.supportsLocalMedia(host) { - return nil, fmt.Errorf("flash host %s is listed for planning, but this Metis instance only has direct removable-media access on %s", host, a.settings.LocalHost) - } - cmd := exec.Command("lsblk", "-J", "-b", "-o", "NAME,PATH,RM,HOTPLUG,SIZE,MODEL,TRAN,TYPE") - out, err := cmd.Output() - if err != nil { - return nil, err - } - var payload struct { - Blockdevices []struct { - Name string `json:"name"` - Path string `json:"path"` - RM bool `json:"rm"` - Hotplug bool `json:"hotplug"` - Size any `json:"size"` - Model string `json:"model"` - Tran string `json:"tran"` - Type string `json:"type"` - } `json:"blockdevices"` - } - if err := json.Unmarshal(out, &payload); err != nil { - return nil, err - } - devices := make([]Device, 0) - for _, dev := range payload.Blockdevices { - if dev.Type != "disk" { - continue - } - size := int64(0) - switch value := dev.Size.(type) { - case string: - size, _ = strconv.ParseInt(value, 10, 64) - case float64: - size = int64(value) - } - if size <= 0 || size > a.settings.MaxDeviceBytes { - continue - } - if dev.Tran != "usb" && !dev.RM && !dev.Hotplug { - continue - } - devices = append(devices, Device{ - Name: dev.Name, - Path: dev.Path, - Model: strings.TrimSpace(dev.Model), - Transport: dev.Tran, - Type: dev.Type, - Removable: dev.RM, - Hotplug: dev.Hotplug, - SizeBytes: size, - }) - } - sort.Slice(devices, func(i, j int) bool { - left := deviceScore(devices[i]) - right := deviceScore(devices[j]) - if left != right { - return left > right - } - if devices[i].SizeBytes != devices[j].SizeBytes { - return devices[i].SizeBytes < devices[j].SizeBytes - } - return devices[i].Path < devices[j].Path - }) - return devices, nil -} - -func (a *App) runBuild(job *Job, flash bool) { - a.setJob(job.ID, func(j *Job) { - j.Status = JobRunning - j.Stage = "download" - j.Message = "Fetching and verifying base image" - j.ProgressPct = 5 - }) - output := a.artifactPath(job.Node) - cacheDir := a.settings.CacheDir - - planData, err := plan.Build(a.inventory, job.Node, output, cacheDir) - if err != nil { - a.failJob(job.ID, err) - a.metrics.RecordBuild(job.Node, "error") - return - } - _, class, err := a.inventory.FindNode(job.Node) - if err != nil { - a.failJob(job.ID, err) - a.metrics.RecordBuild(job.Node, "error") - return - } - cacheImage := filepath.Join(cacheDir, cachedImageName(planData.Image)) - cacheImage, err = image.DownloadAndVerify(planData.Image, cacheImage, class.Checksum) - if err != nil { - a.failJob(job.ID, err) - a.metrics.RecordBuild(job.Node, "error") - return - } - a.setJob(job.ID, func(j *Job) { - j.Stage = "copy" - j.Message = "Copying base image into artifact" - j.ProgressPct = 24 - }) - if err := writer.WriteImage(context.Background(), cacheImage, output); err != nil { - a.failJob(job.ID, err) - a.metrics.RecordBuild(job.Node, "error") - return - } - files, err := plan.Files(a.inventory, job.Node) - if err != nil { - a.failJob(job.ID, err) - a.metrics.RecordBuild(job.Node, "error") - return - } - a.setJob(job.ID, func(j *Job) { - j.Stage = "inject" - j.Message = "Injecting node-specific rootfs config" - j.ProgressPct = 70 - }) - if err := image.InjectRootFS(output, files); err != nil { - a.failJob(job.ID, err) - a.metrics.RecordBuild(job.Node, "error") - return - } - a.metrics.RecordBuild(job.Node, "ok") - a.appendEvent(Event{ - Time: time.Now().UTC(), - Kind: "image.build", - Summary: fmt.Sprintf("Built replacement image for %s", job.Node), - Details: map[string]any{"node": job.Node, "artifact": output}, - }) - - if !flash { - a.completeJob(job.ID, func(j *Job) { - j.Stage = "complete" - j.Message = "Image build complete" - j.ProgressPct = 100 - j.Artifact = output - }) - return - } - - a.setJob(job.ID, func(j *Job) { - j.Stage = "preflight" - j.Message = "Validating device and deleting stale node object" - j.ProgressPct = 78 - j.Artifact = output - }) - if _, err := a.ensureDevice(job.Host, job.Device); err != nil { - a.failJob(job.ID, err) - a.metrics.RecordFlash(job.Node, job.Host, "error") - return - } - if err := deleteNodeObject(job.Node); err != nil { - a.appendEvent(Event{ - Time: time.Now().UTC(), - Kind: "node.delete.warning", - Summary: fmt.Sprintf("Could not delete stale Kubernetes node object for %s", job.Node), - Details: map[string]any{"node": job.Node, "error": err.Error()}, - }) - } - if err := a.flashArtifact(job.ID, output); err != nil { - a.failJob(job.ID, err) - a.metrics.RecordFlash(job.Node, job.Host, "error") - return - } - a.metrics.RecordFlash(job.Node, job.Host, "ok") - a.appendEvent(Event{ - Time: time.Now().UTC(), - Kind: "image.flash", - Summary: fmt.Sprintf("Flashed %s image to %s on %s", job.Node, job.Device, job.Host), - Details: map[string]any{"node": job.Node, "device": job.Device, "host": job.Host}, - }) - a.completeJob(job.ID, func(j *Job) { - j.Stage = "complete" - j.Message = fmt.Sprintf("Flash complete. Move the card into %s and power-cycle it.", j.Node) - j.ProgressPct = 100 - j.Artifact = output - }) -} - -func (a *App) flashArtifact(jobID, artifact string) error { - info, err := os.Stat(artifact) - if err != nil { - return err - } - a.setJob(jobID, func(j *Job) { - j.Stage = "flash" - j.Message = "Writing image to removable media" - j.ProgressPct = 82 - j.Total = info.Size() - }) - err = writer.WriteImageWithProgress(context.Background(), artifact, a.job(jobID).Device, func(written, total int64) { - pct := 82.0 - if total > 0 { - pct = 82.0 + (float64(written)/float64(total))*17.0 - } - a.setJob(jobID, func(j *Job) { - j.Written = written - j.Total = total - j.ProgressPct = pct - j.Message = fmt.Sprintf("Flashing %s of %s", humanBytes(written), humanBytes(total)) - }) - }) - return err -} - -func (a *App) ensureDevice(host, path string) (*Device, error) { - if strings.TrimSpace(path) == "" { - return nil, fmt.Errorf("select removable media before starting a flash run") - } - devices, err := a.ListDevices(host) - if err != nil { - return nil, err - } - for _, device := range devices { - if device.Path == path { - return &device, nil - } - } - return nil, fmt.Errorf("device %s is not a current removable flash candidate", path) -} - func (a *App) newJob(kind, node, host, device string) *Job { job := &Job{ ID: fmt.Sprintf("%d", time.Now().UTC().UnixNano()), @@ -619,27 +396,6 @@ func (a *App) recentEvents(limit int) []Event { return events } -func (a *App) artifacts() map[string]ArtifactSummary { - result := map[string]ArtifactSummary{} - for _, node := range a.inventory.Nodes { - path := a.artifactPath(node.Name) - info, err := os.Stat(path) - if err != nil { - continue - } - result[node.Name] = ArtifactSummary{ - Path: path, - UpdatedAt: info.ModTime().UTC(), - SizeBytes: info.Size(), - } - } - return result -} - -func (a *App) artifactPath(node string) string { - return filepath.Join(a.settings.ArtifactDir, fmt.Sprintf("%s.img", node)) -} - func cachedImageName(source string) string { return strings.TrimSuffix(filepath.Base(source), ".xz") } @@ -656,8 +412,10 @@ func (a *App) flashHosts() []string { hosts[value] = struct{}{} } } - for _, host := range clusterNodeNames() { - hosts[host] = struct{}{} + for _, node := range clusterNodes() { + if value := strings.TrimSpace(node.Name); value != "" { + hosts[value] = struct{}{} + } } out := make([]string, 0, len(hosts)) for host := range hosts { @@ -798,11 +556,6 @@ func errorString(err error) string { return err.Error() } -func (a *App) supportsLocalMedia(host string) bool { - host = strings.TrimSpace(host) - return host == "" || host == a.settings.LocalHost || host == a.settings.DefaultFlashHost -} - func deviceScore(device Device) int { score := 0 model := strings.ToLower(strings.TrimSpace(device.Model)) @@ -857,99 +610,9 @@ func deleteNodeObject(node string) error { } func deleteNodeObjectInCluster(node string) error { - host := strings.TrimSpace(os.Getenv("KUBERNETES_SERVICE_HOST")) - port := strings.TrimSpace(os.Getenv("KUBERNETES_SERVICE_PORT")) - if host == "" || port == "" { + kube, err := inClusterKubeClient() + if err != nil { return errors.New("not running in cluster") } - token, err := os.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/token") - if err != nil { - return err - } - caPEM, err := os.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/ca.crt") - if err != nil { - return err - } - pool := x509.NewCertPool() - if !pool.AppendCertsFromPEM(caPEM) { - return errors.New("append kubernetes CA") - } - client := &http.Client{ - Timeout: 15 * time.Second, - Transport: &http.Transport{ - TLSClientConfig: &tls.Config{RootCAs: pool}, - }, - } - req, err := http.NewRequest(http.MethodDelete, fmt.Sprintf("https://%s:%s/api/v1/nodes/%s", host, port, node), nil) - if err != nil { - return err - } - req.Header.Set("Authorization", "Bearer "+strings.TrimSpace(string(token))) - resp, err := client.Do(req) - if err != nil { - return err - } - defer resp.Body.Close() - if resp.StatusCode == http.StatusNotFound || resp.StatusCode == http.StatusOK || resp.StatusCode == http.StatusAccepted { - return nil - } - body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) - return fmt.Errorf("delete node %s failed: %s: %s", node, resp.Status, strings.TrimSpace(string(body))) -} - -func clusterNodeNames() []string { - host := strings.TrimSpace(os.Getenv("KUBERNETES_SERVICE_HOST")) - port := strings.TrimSpace(os.Getenv("KUBERNETES_SERVICE_PORT")) - if host == "" || port == "" { - return nil - } - token, err := os.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/token") - if err != nil { - return nil - } - caPEM, err := os.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/ca.crt") - if err != nil { - return nil - } - pool := x509.NewCertPool() - if !pool.AppendCertsFromPEM(caPEM) { - return nil - } - client := &http.Client{ - Timeout: 10 * time.Second, - Transport: &http.Transport{ - TLSClientConfig: &tls.Config{RootCAs: pool}, - }, - } - req, err := http.NewRequest(http.MethodGet, fmt.Sprintf("https://%s:%s/api/v1/nodes", host, port), nil) - if err != nil { - return nil - } - req.Header.Set("Authorization", "Bearer "+strings.TrimSpace(string(token))) - resp, err := client.Do(req) - if err != nil { - return nil - } - defer resp.Body.Close() - if resp.StatusCode != http.StatusOK { - return nil - } - var payload struct { - Items []struct { - Metadata struct { - Name string `json:"name"` - } `json:"metadata"` - } `json:"items"` - } - if err := json.NewDecoder(io.LimitReader(resp.Body, 1<<20)).Decode(&payload); err != nil { - return nil - } - names := make([]string, 0, len(payload.Items)) - for _, item := range payload.Items { - if name := strings.TrimSpace(item.Metadata.Name); name != "" { - names = append(names, name) - } - } - sort.Strings(names) - return names + return kube.deleteRequest(fmt.Sprintf("/api/v1/nodes/%s", node)) } diff --git a/pkg/service/artifacts.go b/pkg/service/artifacts.go new file mode 100644 index 0000000..e18c335 --- /dev/null +++ b/pkg/service/artifacts.go @@ -0,0 +1,52 @@ +package service + +import ( + "encoding/json" + "os" + "path/filepath" +) + +func (a *App) artifacts() map[string]ArtifactSummary { + a.mu.RLock() + defer a.mu.RUnlock() + result := make(map[string]ArtifactSummary, len(a.artifactStore)) + for key, value := range a.artifactStore { + result[key] = value + } + return result +} + +func (a *App) loadArtifacts() error { + data, err := os.ReadFile(a.settings.ArtifactStatePath) + if err != nil { + return err + } + var artifacts map[string]ArtifactSummary + if err := json.Unmarshal(data, &artifacts); err != nil { + return err + } + a.mu.Lock() + a.artifactStore = artifacts + a.mu.Unlock() + return nil +} + +func (a *App) persistArtifacts() error { + a.mu.RLock() + data, err := json.MarshalIndent(a.artifactStore, "", " ") + a.mu.RUnlock() + if err != nil { + return err + } + if err := os.MkdirAll(filepath.Dir(a.settings.ArtifactStatePath), 0o755); err != nil { + return err + } + return os.WriteFile(a.settings.ArtifactStatePath, data, 0o644) +} + +func (a *App) recordArtifact(summary ArtifactSummary) error { + a.mu.Lock() + a.artifactStore[summary.Node] = summary + a.mu.Unlock() + return a.persistArtifacts() +} diff --git a/pkg/service/cluster.go b/pkg/service/cluster.go new file mode 100644 index 0000000..2bc1677 --- /dev/null +++ b/pkg/service/cluster.go @@ -0,0 +1,278 @@ +package service + +import ( + "bytes" + "crypto/tls" + "crypto/x509" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "os" + "sort" + "strings" + "time" +) + +type clusterNode struct { + Name string + Arch string + Hardware string + Worker bool + ControlPlane bool + Unschedulable bool +} + +type podState struct { + Name string + Phase string + Reason string + Message string +} + +type kubeClient struct { + baseURL string + token string + client *http.Client +} + +func inClusterKubeClient() (*kubeClient, error) { + host := strings.TrimSpace(os.Getenv("KUBERNETES_SERVICE_HOST")) + port := strings.TrimSpace(os.Getenv("KUBERNETES_SERVICE_PORT")) + if host == "" || port == "" { + return nil, fmt.Errorf("not running in cluster") + } + token, err := os.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/token") + if err != nil { + return nil, err + } + caPEM, err := os.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/ca.crt") + if err != nil { + return nil, err + } + pool := x509.NewCertPool() + if !pool.AppendCertsFromPEM(caPEM) { + return nil, fmt.Errorf("append kubernetes CA") + } + return &kubeClient{ + baseURL: fmt.Sprintf("https://%s:%s", host, port), + token: strings.TrimSpace(string(token)), + client: &http.Client{ + Timeout: 30 * time.Second, + Transport: &http.Transport{ + TLSClientConfig: &tls.Config{RootCAs: pool}, + }, + }, + }, nil +} + +func (k *kubeClient) jsonRequest(method, path string, body any, out any) error { + var reader io.Reader + if body != nil { + data, err := json.Marshal(body) + if err != nil { + return err + } + reader = bytes.NewReader(data) + } + req, err := http.NewRequest(method, k.baseURL+path, reader) + if err != nil { + return err + } + req.Header.Set("Authorization", "Bearer "+k.token) + if body != nil { + req.Header.Set("Content-Type", "application/json") + } + resp, err := k.client.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + if resp.StatusCode >= 300 { + payload, _ := io.ReadAll(io.LimitReader(resp.Body, 8192)) + return fmt.Errorf("%s %s failed: %s: %s", method, path, resp.Status, strings.TrimSpace(string(payload))) + } + if out == nil { + return nil + } + return json.NewDecoder(io.LimitReader(resp.Body, 1<<20)).Decode(out) +} + +func (k *kubeClient) deleteRequest(path string) error { + req, err := http.NewRequest(http.MethodDelete, k.baseURL+path, nil) + if err != nil { + return err + } + req.Header.Set("Authorization", "Bearer "+k.token) + resp, err := k.client.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + if resp.StatusCode == http.StatusNotFound || resp.StatusCode == http.StatusOK || resp.StatusCode == http.StatusAccepted { + return nil + } + payload, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + return fmt.Errorf("delete %s failed: %s: %s", path, resp.Status, strings.TrimSpace(string(payload))) +} + +func clusterNodes() []clusterNode { + kube, err := inClusterKubeClient() + if err != nil { + return nil + } + var payload struct { + Items []struct { + Metadata struct { + Name string `json:"name"` + Labels map[string]string `json:"labels"` + } `json:"metadata"` + Spec struct { + Unschedulable bool `json:"unschedulable"` + } `json:"spec"` + } `json:"items"` + } + if err := kube.jsonRequest(http.MethodGet, "/api/v1/nodes", nil, &payload); err != nil { + return nil + } + nodes := make([]clusterNode, 0, len(payload.Items)) + for _, item := range payload.Items { + labels := item.Metadata.Labels + nodes = append(nodes, clusterNode{ + Name: strings.TrimSpace(item.Metadata.Name), + Arch: strings.TrimSpace(labels["kubernetes.io/arch"]), + Hardware: strings.TrimSpace(labels["hardware"]), + Worker: labels["node-role.kubernetes.io/worker"] == "true", + ControlPlane: labels["node-role.kubernetes.io/control-plane"] != "" || labels["node-role.kubernetes.io/master"] != "", + Unschedulable: item.Spec.Unschedulable, + }) + } + sort.Slice(nodes, func(i, j int) bool { return nodes[i].Name < nodes[j].Name }) + return nodes +} + +func (a *App) podImageForArch(arch string) string { + switch strings.TrimSpace(arch) { + case "arm64": + return strings.TrimSpace(a.settings.RunnerImageARM64) + case "amd64": + return strings.TrimSpace(a.settings.RunnerImageAMD64) + default: + return "" + } +} + +func (a *App) runRemotePod(jobID, podName string, podSpec map[string]any) (string, error) { + kube, err := inClusterKubeClient() + if err != nil { + return "", err + } + ns := url.PathEscape(a.settings.Namespace) + _ = kube.deleteRequest(fmt.Sprintf("/api/v1/namespaces/%s/pods/%s", ns, url.PathEscape(podName))) + defer func() { + _ = kube.deleteRequest(fmt.Sprintf("/api/v1/namespaces/%s/pods/%s", ns, url.PathEscape(podName))) + }() + if err := kube.jsonRequest(http.MethodPost, fmt.Sprintf("/api/v1/namespaces/%s/pods", ns), podSpec, nil); err != nil { + return "", err + } + + deadline := time.Now().Add(12 * time.Minute) + for time.Now().Before(deadline) { + state, err := a.remotePodState(kube, podName) + if err != nil { + return "", err + } + switch state.Phase { + case "Succeeded": + return a.remotePodLogs(kube, podName) + case "Failed": + logs, _ := a.remotePodLogs(kube, podName) + if strings.TrimSpace(logs) != "" { + return "", fmt.Errorf("remote pod %s failed: %s", podName, strings.TrimSpace(logs)) + } + return "", fmt.Errorf("remote pod %s failed: %s %s", podName, state.Reason, state.Message) + } + time.Sleep(2 * time.Second) + } + return "", fmt.Errorf("remote pod %s timed out", podName) +} + +func (a *App) remotePodState(kube *kubeClient, podName string) (podState, error) { + var payload struct { + Metadata struct { + Name string `json:"name"` + } `json:"metadata"` + Status struct { + Phase string `json:"phase"` + Reason string `json:"reason"` + Message string `json:"message"` + Conditions []struct { + Type string `json:"type"` + Status string `json:"status"` + Reason string `json:"reason"` + Message string `json:"message"` + } `json:"conditions"` + ContainerStatuses []struct { + State struct { + Waiting struct { + Reason string `json:"reason"` + Message string `json:"message"` + } `json:"waiting"` + Terminated struct { + Reason string `json:"reason"` + Message string `json:"message"` + } `json:"terminated"` + } `json:"state"` + } `json:"containerStatuses"` + } `json:"status"` + } + ns := url.PathEscape(a.settings.Namespace) + if err := kube.jsonRequest(http.MethodGet, fmt.Sprintf("/api/v1/namespaces/%s/pods/%s", ns, url.PathEscape(podName)), nil, &payload); err != nil { + return podState{}, err + } + out := podState{ + Name: payload.Metadata.Name, + Phase: payload.Status.Phase, + Reason: payload.Status.Reason, + Message: payload.Status.Message, + } + if len(payload.Status.ContainerStatuses) > 0 { + waiting := payload.Status.ContainerStatuses[0].State.Waiting + terminated := payload.Status.ContainerStatuses[0].State.Terminated + if strings.TrimSpace(waiting.Reason) != "" { + out.Reason = waiting.Reason + out.Message = waiting.Message + } + if strings.TrimSpace(terminated.Reason) != "" { + out.Reason = terminated.Reason + if strings.TrimSpace(terminated.Message) != "" { + out.Message = terminated.Message + } + } + } + return out, nil +} + +func (a *App) remotePodLogs(kube *kubeClient, podName string) (string, error) { + ns := url.PathEscape(a.settings.Namespace) + req, err := http.NewRequest(http.MethodGet, fmt.Sprintf("%s/api/v1/namespaces/%s/pods/%s/log", kube.baseURL, ns, url.PathEscape(podName)), nil) + if err != nil { + return "", err + } + req.Header.Set("Authorization", "Bearer "+kube.token) + resp, err := kube.client.Do(req) + if err != nil { + return "", err + } + defer resp.Body.Close() + if resp.StatusCode >= 300 { + body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + return "", fmt.Errorf("pod logs %s failed: %s: %s", podName, resp.Status, strings.TrimSpace(string(body))) + } + body, err := io.ReadAll(io.LimitReader(resp.Body, 1<<20)) + if err != nil { + return "", err + } + return string(body), nil +} diff --git a/pkg/service/harbor.go b/pkg/service/harbor.go new file mode 100644 index 0000000..cf5d61a --- /dev/null +++ b/pkg/service/harbor.go @@ -0,0 +1,131 @@ +package service + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "sort" + "strings" + "time" +) + +func (a *App) artifactRepo(node string) string { + return fmt.Sprintf("%s/%s/%s", strings.TrimRight(a.settings.HarborRegistry, "/"), strings.Trim(a.settings.HarborProject, "/"), node) +} + +func (a *App) ensureHarborProject() error { + if strings.TrimSpace(a.settings.HarborAPIBase) == "" || strings.TrimSpace(a.settings.HarborPassword) == "" { + return fmt.Errorf("harbor admin credentials are not configured") + } + client := &http.Client{Timeout: 30 * time.Second} + project := strings.TrimSpace(a.settings.HarborProject) + req, err := http.NewRequest(http.MethodGet, fmt.Sprintf("%s/projects?name=%s", strings.TrimRight(a.settings.HarborAPIBase, "/"), url.QueryEscape(project)), nil) + if err != nil { + return err + } + req.SetBasicAuth(strings.TrimSpace(a.settings.HarborUsername), strings.TrimSpace(a.settings.HarborPassword)) + resp, err := client.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + if resp.StatusCode >= 300 { + body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + return fmt.Errorf("harbor project lookup failed: %s: %s", resp.Status, strings.TrimSpace(string(body))) + } + var projects []struct { + Name string `json:"name"` + } + if err := json.NewDecoder(io.LimitReader(resp.Body, 1<<20)).Decode(&projects); err != nil { + return err + } + for _, item := range projects { + if strings.EqualFold(strings.TrimSpace(item.Name), project) { + return nil + } + } + payload := map[string]any{ + "project_name": project, + "metadata": map[string]string{"public": "false"}, + } + data, err := json.Marshal(payload) + if err != nil { + return err + } + req, err = http.NewRequest(http.MethodPost, fmt.Sprintf("%s/projects", strings.TrimRight(a.settings.HarborAPIBase, "/")), bytes.NewReader(data)) + if err != nil { + return err + } + req.SetBasicAuth(strings.TrimSpace(a.settings.HarborUsername), strings.TrimSpace(a.settings.HarborPassword)) + req.Header.Set("Content-Type", "application/json") + resp, err = client.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + if resp.StatusCode == http.StatusCreated || resp.StatusCode == http.StatusConflict { + return nil + } + body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + return fmt.Errorf("harbor project create failed: %s: %s", resp.Status, strings.TrimSpace(string(body))) +} + +func (a *App) pruneHarborArtifacts(node string, keep int) error { + client := &http.Client{Timeout: 30 * time.Second} + repo := url.PathEscape(node) + apiBase := strings.TrimRight(a.settings.HarborAPIBase, "/") + project := url.PathEscape(strings.TrimSpace(a.settings.HarborProject)) + req, err := http.NewRequest(http.MethodGet, fmt.Sprintf("%s/projects/%s/repositories/%s/artifacts?page_size=100&with_tag=true", apiBase, project, repo), nil) + if err != nil { + return err + } + req.SetBasicAuth(strings.TrimSpace(a.settings.HarborUsername), strings.TrimSpace(a.settings.HarborPassword)) + resp, err := client.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + if resp.StatusCode == http.StatusNotFound { + return nil + } + if resp.StatusCode >= 300 { + body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + return fmt.Errorf("harbor artifact list failed: %s: %s", resp.Status, strings.TrimSpace(string(body))) + } + var artifacts []struct { + Digest string `json:"digest"` + PushTime string `json:"push_time"` + Tags []struct { + Name string `json:"name"` + } `json:"tags"` + } + if err := json.NewDecoder(io.LimitReader(resp.Body, 2<<20)).Decode(&artifacts); err != nil { + return err + } + sort.Slice(artifacts, func(i, j int) bool { + return artifacts[i].PushTime > artifacts[j].PushTime + }) + for idx, artifact := range artifacts { + if idx < keep { + continue + } + ref := url.PathEscape(artifact.Digest) + req, err := http.NewRequest(http.MethodDelete, fmt.Sprintf("%s/projects/%s/repositories/%s/artifacts/%s", apiBase, project, repo, ref), nil) + if err != nil { + return err + } + req.SetBasicAuth(strings.TrimSpace(a.settings.HarborUsername), strings.TrimSpace(a.settings.HarborPassword)) + resp, err := client.Do(req) + if err != nil { + return err + } + resp.Body.Close() + if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusAccepted && resp.StatusCode != http.StatusNotFound { + return fmt.Errorf("harbor artifact delete failed for %s: %s", artifact.Digest, resp.Status) + } + } + return nil +} diff --git a/pkg/service/remote.go b/pkg/service/remote.go new file mode 100644 index 0000000..9da356b --- /dev/null +++ b/pkg/service/remote.go @@ -0,0 +1,455 @@ +package service + +import ( + "encoding/json" + "fmt" + "path/filepath" + "sort" + "strings" + "time" + + "metis/pkg/inventory" +) + +const hostTmpDevicePath = "hosttmp:///tmp" + +func (a *App) ListDevices(host string) ([]Device, error) { + if host == "" { + host = a.settings.DefaultFlashHost + } + nodeMap := map[string]clusterNode{} + for _, node := range clusterNodes() { + nodeMap[node.Name] = node + } + target, ok := nodeMap[host] + if !ok { + return nil, fmt.Errorf("flash host %s is not a current cluster node", host) + } + image := a.podImageForArch(target.Arch) + if image == "" { + return nil, fmt.Errorf("no runner image configured for arch %s", target.Arch) + } + podName := fmt.Sprintf("metis-devices-%d", time.Now().UTC().UnixNano()) + logs, err := a.runRemotePod("", podName, a.remoteDevicePodSpec(podName, host, image)) + if err != nil { + return nil, err + } + var payload struct { + Devices []Device `json:"devices"` + } + if err := json.Unmarshal([]byte(strings.TrimSpace(logs)), &payload); err != nil { + return nil, fmt.Errorf("decode remote devices: %w: %s", err, strings.TrimSpace(logs)) + } + sort.Slice(payload.Devices, func(i, j int) bool { + left := deviceScore(payload.Devices[i]) + right := deviceScore(payload.Devices[j]) + if left != right { + return left > right + } + if payload.Devices[i].SizeBytes != payload.Devices[j].SizeBytes { + return payload.Devices[i].SizeBytes < payload.Devices[j].SizeBytes + } + return payload.Devices[i].Path < payload.Devices[j].Path + }) + return payload.Devices, nil +} + +func (a *App) runBuild(job *Job, flash bool) { + nodeSpec, class, err := a.inventory.FindNode(job.Node) + if err != nil { + a.failJob(job.ID, err) + a.metrics.RecordBuild(job.Node, "error") + return + } + if err := a.ensureHarborProject(); err != nil { + a.failJob(job.ID, err) + a.metrics.RecordBuild(job.Node, "error") + return + } + + builder, err := a.selectBuilderHost(class.Arch, job.Host) + if err != nil { + a.failJob(job.ID, err) + a.metrics.RecordBuild(job.Node, "error") + return + } + job.Builder = builder.Name + buildTag := time.Now().UTC().Format("20060102t150405z") + artifactRef := a.artifactRepo(job.Node) + a.setJob(job.ID, func(j *Job) { + j.Status = JobRunning + j.Stage = "build" + j.Message = fmt.Sprintf("Building on %s (%s) and publishing to Harbor", builder.Name, builder.Arch) + j.ProgressPct = 8 + j.Artifact = artifactRef + ":latest" + j.Builder = builder.Name + }) + + buildImage := a.podImageForArch(builder.Arch) + if buildImage == "" { + a.failJob(job.ID, fmt.Errorf("no runner image configured for arch %s", builder.Arch)) + a.metrics.RecordBuild(job.Node, "error") + return + } + buildPod := fmt.Sprintf("metis-build-%d", time.Now().UTC().UnixNano()) + logs, err := a.runRemotePod(job.ID, buildPod, a.remoteBuildPodSpec(buildPod, builder.Name, buildImage, job.Node, artifactRef, buildTag)) + if err != nil { + a.failJob(job.ID, err) + a.metrics.RecordBuild(job.Node, "error") + return + } + var summary ArtifactSummary + if err := json.Unmarshal([]byte(strings.TrimSpace(logs)), &summary); err != nil { + a.failJob(job.ID, fmt.Errorf("decode remote build output: %w: %s", err, strings.TrimSpace(logs))) + a.metrics.RecordBuild(job.Node, "error") + return + } + summary.Node = job.Node + summary.Ref = artifactRef + ":latest" + summary.BuilderHost = builder.Name + if err := a.recordArtifact(summary); err != nil { + a.failJob(job.ID, err) + a.metrics.RecordBuild(job.Node, "error") + return + } + if err := a.pruneHarborArtifacts(job.Node, 3); err != nil { + a.appendEvent(Event{ + Time: time.Now().UTC(), + Kind: "artifact.prune.warning", + Summary: fmt.Sprintf("Harbor cleanup warning for %s", job.Node), + Details: map[string]any{"node": job.Node, "error": err.Error()}, + }) + } + a.metrics.RecordBuild(job.Node, "ok") + a.appendEvent(Event{ + Time: time.Now().UTC(), + Kind: "image.build", + Summary: fmt.Sprintf("Built replacement image for %s on %s", job.Node, builder.Name), + Details: map[string]any{"node": job.Node, "artifact": artifactRef + ":latest", "builder": builder.Name}, + }) + + if !flash { + a.completeJob(job.ID, func(j *Job) { + j.Stage = "complete" + j.Message = "Image build complete" + j.ProgressPct = 100 + j.Artifact = artifactRef + ":latest" + }) + return + } + + a.setJob(job.ID, func(j *Job) { + j.Stage = "preflight" + j.Message = fmt.Sprintf("Preparing to flash from Harbor on %s", j.Host) + j.ProgressPct = 78 + j.Artifact = artifactRef + ":latest" + }) + if _, err := a.ensureDevice(job.Host, job.Device); err != nil { + a.failJob(job.ID, err) + a.metrics.RecordFlash(job.Node, job.Host, "error") + return + } + if !strings.HasPrefix(job.Device, "hosttmp://") { + if err := deleteNodeObject(job.Node); err != nil { + a.appendEvent(Event{ + Time: time.Now().UTC(), + Kind: "node.delete.warning", + Summary: fmt.Sprintf("Could not delete stale Kubernetes node object for %s", job.Node), + Details: map[string]any{"node": job.Node, "error": err.Error()}, + }) + } + } + if err := a.flashArtifact(job.ID, artifactRef); err != nil { + a.failJob(job.ID, err) + a.metrics.RecordFlash(job.Node, job.Host, "error") + return + } + a.metrics.RecordFlash(job.Node, job.Host, "ok") + a.appendEvent(Event{ + Time: time.Now().UTC(), + Kind: "image.flash", + Summary: fmt.Sprintf("Flashed %s latest image on %s", job.Node, job.Host), + Details: map[string]any{"node": job.Node, "device": job.Device, "host": job.Host, "artifact": artifactRef + ":latest"}, + }) + a.completeJob(job.ID, func(j *Job) { + j.Stage = "complete" + if strings.HasPrefix(j.Device, "hosttmp://") { + j.Message = fmt.Sprintf("Test flash complete on %s host /tmp.", j.Host) + } else { + j.Message = fmt.Sprintf("Flash complete on %s. Move the card into %s and power-cycle it.", j.Host, j.Node) + } + j.ProgressPct = 100 + j.Artifact = artifactRef + ":latest" + }) + + _ = nodeSpec +} + +func (a *App) flashArtifact(jobID, artifactRef string) error { + nodes := clusterNodes() + nodeMap := map[string]clusterNode{} + for _, node := range nodes { + nodeMap[node.Name] = node + } + target, ok := nodeMap[a.job(jobID).Host] + if !ok { + return fmt.Errorf("flash host %s is not a current cluster node", a.job(jobID).Host) + } + image := a.podImageForArch(target.Arch) + if image == "" { + return fmt.Errorf("no runner image configured for arch %s", target.Arch) + } + a.setJob(jobID, func(j *Job) { + j.Stage = "flash" + j.Message = fmt.Sprintf("Pulling %s and writing it on %s", artifactRef+":latest", j.Host) + j.ProgressPct = 84 + }) + podName := fmt.Sprintf("metis-flash-%d", time.Now().UTC().UnixNano()) + logs, err := a.runRemotePod(jobID, podName, a.remoteFlashPodSpec(podName, target.Name, image, a.job(jobID).Node, a.job(jobID).Device, artifactRef)) + if err != nil { + return err + } + var payload map[string]any + if err := json.Unmarshal([]byte(strings.TrimSpace(logs)), &payload); err == nil { + a.setJob(jobID, func(j *Job) { + if dest, ok := payload["dest_path"].(string); ok && dest != "" { + j.Message = fmt.Sprintf("Wrote latest artifact to %s", dest) + } + }) + } + return nil +} + +func (a *App) ensureDevice(host, path string) (*Device, error) { + if strings.TrimSpace(path) == "" { + return nil, fmt.Errorf("select removable media before starting a flash run") + } + devices, err := a.ListDevices(host) + if err != nil { + return nil, err + } + for _, device := range devices { + if device.Path == path { + return &device, nil + } + } + return nil, fmt.Errorf("device %s is not a current flash candidate on %s", path, host) +} + +func (a *App) selectBuilderHost(arch, flashHost string) (clusterNode, error) { + nodes := clusterNodes() + storageNodes := map[string]struct{}{} + for _, node := range a.inventory.Nodes { + if len(node.LonghornDisks) > 0 { + storageNodes[node.Name] = struct{}{} + } + } + type scored struct { + node clusterNode + score int + } + candidates := make([]scored, 0) + for _, node := range nodes { + if node.Arch != arch || node.Unschedulable || node.ControlPlane { + continue + } + score := 0 + if node.Worker { + score += 40 + } + switch arch { + case "arm64": + if node.Hardware == "rpi5" { + score += 30 + } + if _, storage := storageNodes[node.Name]; storage { + score -= 50 + } + case "amd64": + if node.Name == a.settings.DefaultFlashHost { + score += 30 + } + if node.Name == "titan-24" { + score -= 10 + } + } + if flashHost != "" && node.Name == flashHost { + score += 5 + } + candidates = append(candidates, scored{node: node, score: score}) + } + sort.Slice(candidates, func(i, j int) bool { + if candidates[i].score != candidates[j].score { + return candidates[i].score > candidates[j].score + } + return candidates[i].node.Name < candidates[j].node.Name + }) + if len(candidates) == 0 { + return clusterNode{}, fmt.Errorf("no build host available for arch %s", arch) + } + return candidates[0].node, nil +} + +func (a *App) remoteDevicePodSpec(name, host, image string) map[string]any { + return map[string]any{ + "apiVersion": "v1", + "kind": "Pod", + "metadata": map[string]any{ + "name": name, + "namespace": a.settings.Namespace, + "labels": map[string]string{"app": "metis-remote", "metis-run": "devices"}, + }, + "spec": map[string]any{ + "restartPolicy": "Never", + "serviceAccountName": "metis", + "nodeSelector": map[string]string{ + "kubernetes.io/hostname": host, + }, + "containers": []map[string]any{ + { + "name": "remote-devices", + "image": image, + "imagePullPolicy": "Always", + "command": []string{ + "metis", "remote-devices", + "--max-device-bytes", fmt.Sprintf("%d", a.settings.MaxDeviceBytes), + "--host-tmp-dir", filepath.Join("/host-tmp", strings.TrimPrefix(a.settings.HostTmpDir, "/")), + }, + "securityContext": map[string]any{"privileged": true, "runAsUser": 0}, + "volumeMounts": []map[string]any{ + {"name": "host-dev", "mountPath": "/dev"}, + {"name": "host-sys", "mountPath": "/sys", "readOnly": true}, + {"name": "host-udev", "mountPath": "/run/udev", "readOnly": true}, + {"name": "host-tmp", "mountPath": "/host-tmp"}, + }, + }, + }, + "imagePullSecrets": []map[string]string{{"name": "harbor-regcred"}}, + "volumes": []map[string]any{ + {"name": "host-dev", "hostPath": map[string]any{"path": "/dev"}}, + {"name": "host-sys", "hostPath": map[string]any{"path": "/sys"}}, + {"name": "host-udev", "hostPath": map[string]any{"path": "/run/udev"}}, + {"name": "host-tmp", "hostPath": map[string]any{"path": "/tmp"}}, + }, + }, + } +} + +func (a *App) remoteBuildPodSpec(name, host, image, node, artifactRef, buildTag string) map[string]any { + return map[string]any{ + "apiVersion": "v1", + "kind": "Pod", + "metadata": map[string]any{ + "name": name, + "namespace": a.settings.Namespace, + "labels": map[string]string{"app": "metis-remote", "metis-run": "build"}, + }, + "spec": map[string]any{ + "restartPolicy": "Never", + "serviceAccountName": "metis", + "nodeSelector": map[string]string{ + "kubernetes.io/hostname": host, + }, + "containers": []map[string]any{ + { + "name": "remote-build", + "image": image, + "imagePullPolicy": "Always", + "command": []string{ + "metis", "remote-build", + "--inventory", a.settings.InventoryPath, + "--node", node, + "--cache", "/workspace/cache", + "--work-dir", "/workspace/build", + "--artifact-ref", artifactRef, + "--build-tag", buildTag, + "--harbor-registry", a.settings.HarborRegistry, + }, + "envFrom": []map[string]any{ + {"configMapRef": map[string]any{"name": "metis"}}, + {"secretRef": map[string]any{"name": "metis-harbor"}}, + }, + "env": []map[string]any{ + {"name": "METIS_K3S_TOKEN", "valueFrom": map[string]any{"secretKeyRef": map[string]any{"name": "metis-runtime", "key": "k3s_token", "optional": true}}}, + }, + "volumeMounts": []map[string]any{ + {"name": "workspace", "mountPath": "/workspace"}, + }, + }, + }, + "imagePullSecrets": []map[string]string{{"name": "harbor-regcred"}}, + "volumes": []map[string]any{ + {"name": "workspace", "emptyDir": map[string]any{}}, + }, + }, + } +} + +func (a *App) remoteFlashPodSpec(name, host, image, node, device, artifactRef string) map[string]any { + return map[string]any{ + "apiVersion": "v1", + "kind": "Pod", + "metadata": map[string]any{ + "name": name, + "namespace": a.settings.Namespace, + "labels": map[string]string{"app": "metis-remote", "metis-run": "flash"}, + }, + "spec": map[string]any{ + "restartPolicy": "Never", + "serviceAccountName": "metis", + "nodeSelector": map[string]string{ + "kubernetes.io/hostname": host, + }, + "containers": []map[string]any{ + { + "name": "remote-flash", + "image": image, + "imagePullPolicy": "Always", + "command": []string{ + "metis", "remote-flash", + "--node", node, + "--device", device, + "--artifact-ref", artifactRef, + "--work-dir", "/workspace/flash", + "--harbor-registry", a.settings.HarborRegistry, + "--host-tmp-dir", filepath.Join("/host-tmp", strings.TrimPrefix(a.settings.HostTmpDir, "/")), + }, + "securityContext": map[string]any{"privileged": true, "runAsUser": 0}, + "envFrom": []map[string]any{ + {"configMapRef": map[string]any{"name": "metis"}}, + {"secretRef": map[string]any{"name": "metis-harbor"}}, + }, + "volumeMounts": []map[string]any{ + {"name": "workspace", "mountPath": "/workspace"}, + {"name": "host-dev", "mountPath": "/dev"}, + {"name": "host-sys", "mountPath": "/sys", "readOnly": true}, + {"name": "host-udev", "mountPath": "/run/udev", "readOnly": true}, + {"name": "host-tmp", "mountPath": "/host-tmp"}, + }, + }, + }, + "imagePullSecrets": []map[string]string{{"name": "harbor-regcred"}}, + "volumes": []map[string]any{ + {"name": "workspace", "emptyDir": map[string]any{}}, + {"name": "host-dev", "hostPath": map[string]any{"path": "/dev"}}, + {"name": "host-sys", "hostPath": map[string]any{"path": "/sys"}}, + {"name": "host-udev", "hostPath": map[string]any{"path": "/run/udev"}}, + {"name": "host-tmp", "hostPath": map[string]any{"path": "/tmp"}}, + }, + }, + } +} + +func (a *App) remoteArtifactNote(node string) string { + if summary, ok := a.artifacts()[node]; ok && strings.TrimSpace(summary.Ref) != "" { + return summary.Ref + } + return a.artifactRepo(node) + ":latest" +} + +func inventoryNodeArch(spec *inventory.NodeSpec, class *inventory.NodeClass) string { + if class != nil && strings.TrimSpace(class.Arch) != "" { + return strings.TrimSpace(class.Arch) + } + return "arm64" +} diff --git a/pkg/service/server.go b/pkg/service/server.go index 455e638..e1e2481 100644 --- a/pkg/service/server.go +++ b/pkg/service/server.go @@ -725,10 +725,7 @@ var metisPage = template.Must(template.New("metis").Parse(` } const selectedHost = hostSelect.value || state.default_flash_host; - const hostIsLocal = selectedHost === state.local_host || selectedHost === state.default_flash_host; - hostNoteEl.textContent = hostIsLocal - ? 'Metis is running on ' + state.local_host + ', so media detection and flashing are live for this host.' - : 'The selected host is listed from cluster inventory, but this Metis instance only has direct media access on ' + state.local_host + '.'; + hostNoteEl.textContent = 'Metis will inspect media and run the flash writer on ' + selectedHost + ' through a short-lived in-cluster worker. ' + state.default_flash_host + ' remains the default flash host.'; if(state.device_error){ deviceNoteEl.textContent = state.device_error; @@ -739,9 +736,9 @@ var metisPage = template.Must(template.New("metis").Parse(` } const artifact = (state.artifacts || {})[nodeSelect.value]; - artifactNoteEl.textContent = artifact && artifact.path - ? 'Latest built image: ' + artifact.path - : 'Successful build-only runs are stored on ' + state.local_host + ' under /var/lib/metis/artifacts/.img.'; + artifactNoteEl.textContent = artifact && artifact.ref + ? 'Latest published image: ' + artifact.ref + ' (Metis keeps the newest 3 builds in Harbor).' + : 'Successful build-only runs publish :latest into Harbor and keep the newest 3 builds per node.'; document.getElementById('build-only').disabled = busy || !nodeSelect.value; document.getElementById('refresh-devices').disabled = busy; @@ -825,7 +822,7 @@ var metisPage = template.Must(template.New("metis").Parse(` await runAction('Starting image build', 'Queueing the node image build now.', async ()=>{ await post('/api/jobs/build', {node: nodeSelect.value}); await refreshState({silent:true}); - banner('success', 'Image build queued', 'Metis started building the replacement image for ' + nodeSelect.value + '. Successful build-only runs land on ' + state.local_host + ' at /var/lib/metis/artifacts/' + nodeSelect.value + '.img.'); + banner('success', 'Image build queued', 'Metis started building the replacement image for ' + nodeSelect.value + '. Successful build-only runs publish ' + nodeSelect.value + ':latest to Harbor and keep the newest 3 builds.'); }); }); diff --git a/pkg/service/settings.go b/pkg/service/settings.go index 78e58f6..828440e 100644 --- a/pkg/service/settings.go +++ b/pkg/service/settings.go @@ -9,18 +9,28 @@ import ( // Settings configures the Metis service runtime. type Settings struct { - BindAddr string - InventoryPath string - CacheDir string - ArtifactDir string - HistoryPath string - SnapshotsPath string - TargetsPath string - DefaultFlashHost string - FlashHosts []string - LocalHost string - AllowedGroups []string - MaxDeviceBytes int64 + BindAddr string + InventoryPath string + CacheDir string + ArtifactDir string + ArtifactStatePath string + HistoryPath string + SnapshotsPath string + TargetsPath string + DefaultFlashHost string + FlashHosts []string + LocalHost string + AllowedGroups []string + MaxDeviceBytes int64 + Namespace string + RunnerImageAMD64 string + RunnerImageARM64 string + HarborRegistry string + HarborProject string + HarborAPIBase string + HarborUsername string + HarborPassword string + HostTmpDir string } // FromEnv builds service settings with sensible defaults for local dev and in-cluster use. @@ -30,18 +40,28 @@ func FromEnv() Settings { defaultFlashHost := getenvDefault("METIS_DEFAULT_FLASH_HOST", localHost) flashHosts := splitList(getenvDefault("METIS_FLASH_HOSTS", defaultFlashHost)) return Settings{ - BindAddr: getenvDefault("METIS_BIND_ADDR", ":8080"), - InventoryPath: getenvDefault("METIS_INVENTORY_PATH", "inventory.titan-rpi4.yaml"), - CacheDir: getenvDefault("METIS_CACHE_DIR", filepath.Join(dataDir, "cache")), - ArtifactDir: getenvDefault("METIS_ARTIFACT_DIR", filepath.Join(dataDir, "artifacts")), - HistoryPath: getenvDefault("METIS_HISTORY_PATH", filepath.Join(dataDir, "history.jsonl")), - SnapshotsPath: getenvDefault("METIS_SNAPSHOTS_PATH", filepath.Join(dataDir, "snapshots.json")), - TargetsPath: getenvDefault("METIS_TARGETS_PATH", filepath.Join(dataDir, "targets.json")), - DefaultFlashHost: defaultFlashHost, - FlashHosts: flashHosts, - LocalHost: localHost, - AllowedGroups: splitList(getenvDefault("METIS_ALLOWED_GROUPS", "admin,maintainer")), - MaxDeviceBytes: getenvInt64("METIS_MAX_DEVICE_BYTES", 300000000000), + BindAddr: getenvDefault("METIS_BIND_ADDR", ":8080"), + InventoryPath: getenvDefault("METIS_INVENTORY_PATH", "inventory.titan-rpi4.yaml"), + CacheDir: getenvDefault("METIS_CACHE_DIR", filepath.Join(dataDir, "cache")), + ArtifactDir: getenvDefault("METIS_ARTIFACT_DIR", filepath.Join(dataDir, "artifacts")), + ArtifactStatePath: getenvDefault("METIS_ARTIFACT_STATE_PATH", filepath.Join(dataDir, "artifacts.json")), + HistoryPath: getenvDefault("METIS_HISTORY_PATH", filepath.Join(dataDir, "history.jsonl")), + SnapshotsPath: getenvDefault("METIS_SNAPSHOTS_PATH", filepath.Join(dataDir, "snapshots.json")), + TargetsPath: getenvDefault("METIS_TARGETS_PATH", filepath.Join(dataDir, "targets.json")), + DefaultFlashHost: defaultFlashHost, + FlashHosts: flashHosts, + LocalHost: localHost, + AllowedGroups: splitList(getenvDefault("METIS_ALLOWED_GROUPS", "admin,maintainer")), + MaxDeviceBytes: getenvInt64("METIS_MAX_DEVICE_BYTES", 300000000000), + Namespace: getenvDefault("METIS_NAMESPACE", "maintenance"), + RunnerImageAMD64: getenvDefault("METIS_RUNNER_IMAGE_AMD64", ""), + RunnerImageARM64: getenvDefault("METIS_RUNNER_IMAGE_ARM64", ""), + HarborRegistry: getenvDefault("METIS_HARBOR_REGISTRY", "registry.bstein.dev"), + HarborProject: getenvDefault("METIS_HARBOR_PROJECT", "metis"), + HarborAPIBase: getenvDefault("METIS_HARBOR_API_BASE", "https://registry.bstein.dev/api/v2.0"), + HarborUsername: getenvDefault("METIS_HARBOR_USERNAME", ""), + HarborPassword: getenvDefault("METIS_HARBOR_PASSWORD", ""), + HostTmpDir: getenvDefault("METIS_HOST_TMP_DIR", "/tmp/metis-flash-test"), } }