From c972f226da200f52e4296db3e8383afe3865a296 Mon Sep 17 00:00:00 2001 From: codex Date: Wed, 22 Apr 2026 03:57:55 -0300 Subject: [PATCH] fix(metis): publish scratch health from sentinels --- pkg/sentinel/collector.go | 77 +++++++++++++++++++++-- pkg/sentinel/collector_test.go | 55 +++++++++++++++++ pkg/service/app.go | 4 ++ pkg/service/cluster.go | 40 ++++++++++++ pkg/service/node_annotations.go | 85 +++++++++++++++++++++++++ pkg/service/node_annotations_test.go | 92 ++++++++++++++++++++++++++++ 6 files changed, 347 insertions(+), 6 deletions(-) create mode 100644 pkg/service/node_annotations.go create mode 100644 pkg/service/node_annotations_test.go diff --git a/pkg/sentinel/collector.go b/pkg/sentinel/collector.go index 4875aee..7e686e7 100644 --- a/pkg/sentinel/collector.go +++ b/pkg/sentinel/collector.go @@ -49,12 +49,8 @@ func Collect() *Snapshot { } func collectUSBScratch() *facts.USBScratch { - raw, err := commandOutput("cat", "/etc/metis/node.json") - if err != nil || len(strings.TrimSpace(string(raw))) == 0 { - return nil - } - var cfg nodeConfig - if err := json.Unmarshal(raw, &cfg); err != nil || cfg.USBScratch == nil { + cfg, ok := loadNodeConfig() + if !ok || cfg.USBScratch == nil { return nil } desired := cfg.USBScratch @@ -114,6 +110,75 @@ func collectUSBScratch() *facts.USBScratch { return scratch } +func loadNodeConfig() (nodeConfig, bool) { + raw, err := commandOutput("cat", "/etc/metis/node.json") + if err == nil && len(strings.TrimSpace(string(raw))) > 0 { + var cfg nodeConfig + if err := json.Unmarshal(raw, &cfg); err == nil && cfg.USBScratch != nil { + return cfg, true + } + } + raw, err = commandOutput("cat", "/etc/fstab") + if err != nil { + return nodeConfig{}, false + } + if scratch, ok := parseUSBScratchFstab(string(raw)); ok { + return nodeConfig{USBScratch: scratch}, true + } + return nodeConfig{}, false +} + +func parseUSBScratchFstab(raw string) (*usbScratchConfig, bool) { + cfg := &usbScratchConfig{} + inBlock := false + for _, line := range strings.Split(raw, "\n") { + line = strings.TrimSpace(line) + switch { + case strings.HasPrefix(line, "# BEGIN maintenance.bstein.dev usb-scratch"): + inBlock = true + continue + case strings.HasPrefix(line, "# END maintenance.bstein.dev usb-scratch"): + inBlock = false + continue + case !inBlock || line == "" || strings.HasPrefix(line, "#"): + continue + } + fields := strings.Fields(line) + if len(fields) < 4 { + continue + } + source, target, fsType, options := fields[0], fields[1], fields[2], fields[3] + if fsType == "none" && hasFstabOption(options, "bind") { + cfg.BindTargets = append(cfg.BindTargets, target) + continue + } + if fsType == "tmpfs" || strings.EqualFold(source, "tmpfs") { + continue + } + cfg.Mountpoint = target + cfg.FS = fsType + if value, ok := strings.CutPrefix(source, "UUID="); ok { + cfg.UUID = value + } + if value, ok := strings.CutPrefix(source, "LABEL="); ok { + cfg.Label = value + } + } + if cfg.Mountpoint == "" { + return nil, false + } + return cfg, true +} + +func hasFstabOption(options, want string) bool { + for _, option := range strings.Split(options, ",") { + if option == want { + return true + } + } + return false +} + func runAndTrim(cmd string, args ...string) string { out, err := commandOutput(cmd, args...) if err != nil { diff --git a/pkg/sentinel/collector_test.go b/pkg/sentinel/collector_test.go index 12cba08..699d13e 100644 --- a/pkg/sentinel/collector_test.go +++ b/pkg/sentinel/collector_test.go @@ -120,6 +120,61 @@ esac`, } } +func TestCollectUSBScratchFallsBackToManagedFstabBlock(t *testing.T) { + dir := fakeCollectorCommands(t, map[string]string{ + "cat": `case "${1:-}" in + /etc/metis/node.json) exit 1 ;; + /etc/fstab) printf '%s\n' '# BEGIN maintenance.bstein.dev usb-scratch' \ + 'UUID=usb-1 /mnt/astraios ext4 defaults,noatime 0 2' \ + 'tmpfs /tmp tmpfs defaults,nosuid,nodev,mode=1777 0 0' \ + '/mnt/astraios/var/log/pods /var/log/pods none bind,x-systemd.requires-mounts-for=/mnt/astraios 0 0' \ + '/mnt/astraios/var/tmp /var/tmp none bind,x-systemd.requires-mounts-for=/mnt/astraios 0 0' \ + '# END maintenance.bstein.dev usb-scratch' + ;; + *) exit 1 ;; +esac`, + "findmnt": `target="" +for ((i=1; i<=$#; i++)); do + if [[ "${!i}" == "-T" ]]; then + j=$((i + 1)) + target="${!j}" + break + fi +done +case "${target}" in + /mnt/astraios) printf 'SOURCE="/dev/sda1" TARGET="/mnt/astraios" FSTYPE="ext4"\n' ;; + /var/log/pods) printf 'SOURCE="/dev/sda1[/var/log/pods]" TARGET="/var/log/pods" FSTYPE="none"\n' ;; + /var/tmp) printf 'SOURCE="/dev/sda1[/var/tmp]" TARGET="/var/tmp" FSTYPE="none"\n' ;; + *) exit 1 ;; +esac`, + "readlink": `case "${2:-}" in + /mnt/astraios) printf '/mnt/astraios\n' ;; + /var/log/pods) printf '/mnt/astraios/var/log/pods\n' ;; + /var/tmp) printf '/mnt/astraios/var/tmp\n' ;; + *) exit 1 ;; +esac`, + "blkid": `case "${1:-}" in + -U) printf '/dev/sda1\n' ;; + -o) printf 'UUID=usb-1\nTYPE=ext4\n' ;; +esac`, + }) + t.Setenv("PATH", dir+string(os.PathListSeparator)+os.Getenv("PATH")) + + scratch := collectUSBScratch() + if scratch == nil { + t.Fatal("expected USB scratch data from fstab") + } + if scratch.Mountpoint != "/mnt/astraios" || scratch.UUID != "usb-1" || scratch.FS != "ext4" { + t.Fatalf("unexpected scratch identity: %#v", scratch) + } + if !scratch.MountHealthy || !scratch.UUIDHealthy || !scratch.BindHealthy { + t.Fatalf("expected fstab-derived scratch to be healthy: %#v", scratch) + } + if len(scratch.BindTargets) != 2 || scratch.BindTargets[0].Path != "/var/log/pods" || scratch.BindTargets[1].Path != "/var/tmp" { + t.Fatalf("unexpected bind targets: %#v", scratch.BindTargets) + } +} + func TestBindHealthyAcceptsScratchSymlinksAndDeviceSubpaths(t *testing.T) { dir := fakeCollectorCommands(t, map[string]string{ "readlink": `case "${2:-}" in diff --git a/pkg/service/app.go b/pkg/service/app.go index 6717e1f..a7bed80 100644 --- a/pkg/service/app.go +++ b/pkg/service/app.go @@ -261,6 +261,9 @@ func (a *App) StoreSnapshot(record SnapshotRecord) error { return err } a.metrics.RecordSnapshot(record.Node, "ok", record.CollectedAt) + if err := a.syncScratchAnnotations(record); err != nil { + a.appendEvent(annotationSyncEvent(record.Node, err)) + } a.appendEvent(Event{ Time: record.CollectedAt, Kind: "sentinel.snapshot", @@ -287,6 +290,7 @@ func (a *App) WatchSentinel() (*Event, error) { Containerd: firstLine(snap.Snapshot.Containerd), PackageSample: snap.Snapshot.PackageSample, DropInsSample: snap.Snapshot.DropInsSample, + USBScratch: snap.Snapshot.USBScratch, }) } prevTargets := map[string]facts.Targets{} diff --git a/pkg/service/cluster.go b/pkg/service/cluster.go index 0e58e6f..ac5e420 100644 --- a/pkg/service/cluster.go +++ b/pkg/service/cluster.go @@ -105,6 +105,46 @@ func (k *kubeClient) jsonRequest(method, path string, body any, out any) error { return json.NewDecoder(io.LimitReader(resp.Body, 1<<20)).Decode(out) } +func (k *kubeClient) mergePatch(path string, body any) error { + data, err := json.Marshal(body) + if err != nil { + return err + } + req, err := http.NewRequest(http.MethodPatch, k.baseURL+path, bytes.NewReader(data)) + if err != nil { + return err + } + req.Header.Set("Authorization", "Bearer "+k.token) + req.Header.Set("Content-Type", "application/merge-patch+json") + resp, err := k.client.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + if resp.StatusCode >= 300 { + payload, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + return fmt.Errorf("patch %s failed: %s: %s", path, resp.Status, strings.TrimSpace(string(payload))) + } + return nil +} + +func patchNodeAnnotations(node string, annotations map[string]string) error { + node = strings.TrimSpace(node) + if node == "" || len(annotations) == 0 { + return nil + } + kube, err := kubeClientFactory() + if err != nil { + return err + } + body := map[string]any{ + "metadata": map[string]any{ + "annotations": annotations, + }, + } + return kube.mergePatch("/api/v1/nodes/"+url.PathEscape(node), body) +} + func (k *kubeClient) deleteRequest(path string) error { req, err := http.NewRequest(http.MethodDelete, k.baseURL+path, nil) if err != nil { diff --git a/pkg/service/node_annotations.go b/pkg/service/node_annotations.go new file mode 100644 index 0000000..b0d717d --- /dev/null +++ b/pkg/service/node_annotations.go @@ -0,0 +1,85 @@ +package service + +import ( + "fmt" + "strings" + "time" + + "metis/pkg/facts" +) + +func (a *App) syncScratchAnnotations(record SnapshotRecord) error { + scratch := record.Snapshot.USBScratch + if scratch == nil { + return nil + } + annotations := scratchHealthAnnotations(scratch, record.CollectedAt) + if len(annotations) == 0 { + return nil + } + return patchNodeAnnotations(record.Node, annotations) +} + +func scratchHealthAnnotations(scratch *facts.USBScratch, observedAt time.Time) map[string]string { + status, detail := scratchStatusDetail(scratch) + selector := "" + if scratch.UUID != "" { + selector = "UUID=" + scratch.UUID + } else if scratch.Label != "" { + selector = "LABEL=" + scratch.Label + } + managedPaths := make([]string, 0, len(scratch.BindTargets)) + for _, target := range scratch.BindTargets { + if strings.TrimSpace(target.Path) != "" { + managedPaths = append(managedPaths, target.Path) + } + } + annotations := map[string]string{} + for _, family := range []string{"usb-scratch", "astraios"} { + prefix := "maintenance.bstein.dev/" + family + annotations[prefix+"-status"] = status + annotations[prefix+"-detail"] = detail + annotations[prefix+"-mountpoint"] = scratch.Mountpoint + annotations[prefix+"-managed-paths"] = strings.Join(managedPaths, "_") + annotations[prefix+"-last-observed"] = observedAt.UTC().Format(time.RFC3339) + if selector != "" { + annotations[prefix+"-selector"] = selector + } + } + return annotations +} + +func scratchStatusDetail(scratch *facts.USBScratch) (string, string) { + if scratch == nil { + return "missing", "no-scratch-snapshot" + } + failures := []string{} + if !scratch.MountHealthy { + failures = append(failures, "mount-unhealthy") + } + if scratch.UUID != "" && !scratch.UUIDHealthy { + failures = append(failures, "uuid-mismatch") + } + if scratch.Label != "" && !scratch.LabelHealthy { + failures = append(failures, "label-mismatch") + } + if !scratch.BindHealthy { + failures = append(failures, "bind-mount-incomplete") + } + if len(failures) == 0 { + return "ok", "healthy" + } + return "error", strings.Join(failures, ",") +} + +func annotationSyncEvent(node string, err error) Event { + return Event{ + Time: time.Now().UTC(), + Kind: "sentinel.annotation", + Summary: fmt.Sprintf("Could not sync scratch annotations for %s", node), + Details: map[string]any{ + "node": node, + "error": err.Error(), + }, + } +} diff --git a/pkg/service/node_annotations_test.go b/pkg/service/node_annotations_test.go new file mode 100644 index 0000000..74e7370 --- /dev/null +++ b/pkg/service/node_annotations_test.go @@ -0,0 +1,92 @@ +package service + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "metis/pkg/facts" + "metis/pkg/sentinel" +) + +func TestScratchHealthAnnotations(t *testing.T) { + observed := time.Date(2026, 4, 22, 6, 45, 0, 0, time.UTC) + annotations := scratchHealthAnnotations(&facts.USBScratch{ + Mountpoint: "/mnt/astraios", + UUID: "usb-1", + FS: "ext4", + MountHealthy: true, + UUIDHealthy: true, + BindHealthy: true, + BindTargets: []facts.USBBindTarget{{Path: "/var/log/pods", Healthy: true}, {Path: "/var/tmp", Healthy: true}}, + }, observed) + if annotations["maintenance.bstein.dev/usb-scratch-status"] != "ok" || annotations["maintenance.bstein.dev/astraios-detail"] != "healthy" { + t.Fatalf("unexpected healthy annotations: %#v", annotations) + } + if annotations["maintenance.bstein.dev/usb-scratch-selector"] != "UUID=usb-1" { + t.Fatalf("selector annotation missing: %#v", annotations) + } + if annotations["maintenance.bstein.dev/astraios-managed-paths"] != "/var/log/pods_/var/tmp" { + t.Fatalf("managed paths annotation mismatch: %#v", annotations) + } + + status, detail := scratchStatusDetail(&facts.USBScratch{MountHealthy: false, BindHealthy: false}) + if status != "error" || !strings.Contains(detail, "mount-unhealthy") || !strings.Contains(detail, "bind-mount-incomplete") { + t.Fatalf("unexpected unhealthy detail: %s %s", status, detail) + } +} + +func TestStoreSnapshotPatchesNodeAnnotations(t *testing.T) { + var patchPath string + var patchContentType string + var patchBody map[string]any + kube := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPatch || r.URL.Path != "/api/v1/nodes/titan-04" { + http.NotFound(w, r) + return + } + patchPath = r.URL.Path + patchContentType = r.Header.Get("Content-Type") + if err := json.NewDecoder(r.Body).Decode(&patchBody); err != nil { + t.Fatalf("decode patch body: %v", err) + } + _ = json.NewEncoder(w).Encode(map[string]any{"status": "ok"}) + })) + defer kube.Close() + + origFactory := kubeClientFactory + kubeClientFactory = func() (*kubeClient, error) { + return kubeClientFactoryForURL(kube.URL, kube.Client()), nil + } + t.Cleanup(func() { kubeClientFactory = origFactory }) + + app := newTestApp(t) + if err := app.StoreSnapshot(SnapshotRecord{ + Node: "titan-04", + CollectedAt: time.Date(2026, 4, 22, 6, 50, 0, 0, time.UTC), + Snapshot: sentinel.Snapshot{ + Hostname: "titan-04", + USBScratch: &facts.USBScratch{ + Mountpoint: "/mnt/astraios", + UUID: "usb-1", + MountHealthy: true, + UUIDHealthy: true, + BindHealthy: true, + BindTargets: []facts.USBBindTarget{{Path: "/var/log/pods", Healthy: true}}, + }, + }, + }); err != nil { + t.Fatalf("StoreSnapshot: %v", err) + } + if patchPath != "/api/v1/nodes/titan-04" || patchContentType != "application/merge-patch+json" { + t.Fatalf("patch request mismatch: path=%q content-type=%q", patchPath, patchContentType) + } + metadata := patchBody["metadata"].(map[string]any) + annotations := metadata["annotations"].(map[string]any) + if annotations["maintenance.bstein.dev/usb-scratch-status"] != "ok" || annotations["maintenance.bstein.dev/astraios-selector"] != "UUID=usb-1" { + t.Fatalf("annotation patch mismatch: %#v", annotations) + } +}