From 6d0351f4b3b4ab5245c9fb723bd6f76476512cd6 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sat, 11 Apr 2026 01:08:08 -0300 Subject: [PATCH] metis: add USB scratch inventory support --- README.md | 6 +- docs/node-classes.md | 4 +- inventory.example.yaml | 26 ++++++ inventory.titan-rpi4.yaml | 7 ++ pkg/config/config.go | 94 ++++++++++++++------ pkg/config/config_test.go | 18 ++++ pkg/config/coverage_more_test.go | 6 +- pkg/facts/aggregate.go | 69 +++++++++++--- pkg/facts/aggregate_test.go | 7 +- pkg/facts/load.go | 1 + pkg/facts/load_test.go | 5 +- pkg/facts/types.go | 48 +++++++--- pkg/inventory/types.go | 25 ++++++ pkg/inventory/types_test.go | 15 ++++ pkg/plan/inject.go | 15 +++- pkg/plan/inject_test.go | 14 ++- pkg/sentinel/collector.go | 148 +++++++++++++++++++++++++++++++ pkg/sentinel/collector_test.go | 28 +++++- 18 files changed, 470 insertions(+), 66 deletions(-) diff --git a/README.md b/README.md index c986eff..b7e3df4 100644 --- a/README.md +++ b/README.md @@ -6,13 +6,13 @@ Metis produces fully configured recovery SD cards for any node in the lab (RPi 4 - Cross-platform (Linux + Windows) CLI/GUI with dead-simple UX. - Pull class-specific golden images from Harbor (or other artifact store), inject per-node config, and write/verify SD cards. - Minimal image set via node classes; inject per-node deltas at burn time. -- Idempotent bootstraps: hostname/IP, k3s server/agent setup, labels/taints, journald/log GC drop-ins, Longhorn mount validation, SSH keys/users. +- Idempotent bootstraps: hostname/IP, k3s server/agent setup, labels/taints, journald/log GC drop-ins, Longhorn and USB scratch mount validation, SSH keys/users. - Works offline once artifacts are cached; verifies hashes/signatures before writing. ## Planned high-level workflow 1) Select target node (from inventory) + target disk. 2) Tool downloads/caches the right golden image for that node class. -3) Injects per-node config (net, k3s tokens/roles/labels/taints, SSH keys, runtime drop-ins, Longhorn mount metadata) and writes SD. +3) Injects per-node config (net, k3s tokens/roles/labels/taints, SSH keys, runtime drop-ins, Longhorn mount metadata, USB scratch bind layout) and writes SD. 4) Verifies write; prints next-step: "insert and power on." No manual follow-up. ## Early design notes @@ -45,7 +45,7 @@ Metis produces fully configured recovery SD cards for any node in the lab (RPi 4 - Vault: Metis can read per-node secrets from `secret/data/nodes/` using VAULT_ADDR plus either VAULT_TOKEN or AppRole (VAULT_ROLE_ID/VAULT_SECRET_ID). Expected fields: ssh_password, k3s_token, cloud_init, extra map. - Sentinel: `metis-sentinel` collects host facts and can either print them, write local history, or push them into the Metis service. The intended deployment shape is a DaemonSet on cluster nodes plus an Ariadne-triggered Metis watch that recomputes recommended class targets and drift history. - Facts aggregation: `metis facts --inventory inv.yaml --snapshots ./snapshots` reads sentinel snapshot JSON files and prints per-class drift summary (kernels, containerd, k3s, package samples). Use exported ConfigMaps or `METIS_SENTINEL_OUT` history as input. -- `metis config --inventory inv.yaml --node titan-13` prints the merged node config (hostname/IP/k3s labels/taints/Longhorn UUIDs). +- `metis config --inventory inv.yaml --node titan-13` prints the merged node config (hostname/IP/k3s labels/taints/Longhorn UUIDs and optional USB scratch metadata). ## Service direction - Deployed UI protected by Atlas SSO headers (`admin` / `maintenance`) diff --git a/docs/node-classes.md b/docs/node-classes.md index 9b79cef..1dd3e9b 100644 --- a/docs/node-classes.md +++ b/docs/node-classes.md @@ -5,8 +5,8 @@ Initial classes to minimize golden images while covering hardware/OS deltas: - `rpi5-ubuntu-worker`: Ubuntu 24.04, k3s agent, hardware=rpi5 (titan-04..11, 0a/0c minus control-plane bits) - `rpi5-ubuntu-control`: Ubuntu 24.04, k3s server (titan-0a/0b/0c specifics), control-plane taints, etcd snapshot hooks - `rpi4-armbian-longhorn`: Armbian 6.6.x, k3s agent, hardware=rpi4 with Longhorn disks (titan-13/15/17/19; astreae/asteria mounts) -- `rpi4-armbian-worker`: Armbian 6.6.x, k3s agent, hardware=rpi4 without Longhorn disks (titan-12/14/18) +- `rpi4-armbian-worker`: Armbian 6.6.x, k3s agent, hardware=rpi4 without Longhorn disks; `titan-16` uses the USB scratch recovery card standard - `amd64-agent`: Debian 13 k3s agent with GPU/node labels (titan-22/24, avoid by preference) - `external-hosts`: non-cluster (tethys, titan-db, titan-jh, oceanus/titan-23, future titan-20/21) – per-host config over base image template -Per-node overlays capture hostname/IP, labels/taints, Longhorn UUID mounts, and drop-ins for logging/GC. +Per-node overlays capture hostname/IP, labels/taints, Longhorn UUID mounts, USB scratch bind targets, and drop-ins for logging/GC. diff --git a/inventory.example.yaml b/inventory.example.yaml index 7845f01..5c78edb 100644 --- a/inventory.example.yaml +++ b/inventory.example.yaml @@ -23,6 +23,17 @@ classes: longhorn: "true" node-role.kubernetes.io/worker: "true" default_taints: [] + - name: rpi4-armbian-worker + arch: arm64 + os: armbian-6.6 + image: https://harbor.bstein.dev/library/rpi4-armbian-worker.img + checksum: sha256:REPLACE_ME + boot_overlay: overlays/rpi4-boot + root_overlay: overlays/rpi4-root + default_labels: + hardware: rpi4 + node-role.kubernetes.io/worker: "true" + default_taints: [] - name: control-plane arch: arm64 os: ubuntu-24.04 @@ -72,6 +83,21 @@ nodes: uuid: cbd4989d-62b5-4741-8b2a-28fdae259cae fs: ext4 ssh_user: root + - name: titan-16 + class: rpi4-armbian-worker + hostname: titan-16 + ip: 192.168.22.44 + k3s_role: agent + labels: + hardware: rpi4 + usb_scratch: + mountpoint: /mnt/scratch + label: titan-16-scratch + fs: ext4 + bind_targets: + - /var/lib/rancher + - /var/log + ssh_user: ubuntu - name: titan-20 class: jetson-accelerator hostname: titan-20 diff --git a/inventory.titan-rpi4.yaml b/inventory.titan-rpi4.yaml index f6d7c5b..499bc17 100644 --- a/inventory.titan-rpi4.yaml +++ b/inventory.titan-rpi4.yaml @@ -165,6 +165,13 @@ nodes: ssh_user: atlas ssh_authorized_keys: - ${METIS_SSH_KEY_BASTION} + usb_scratch: + mountpoint: /mnt/scratch + label: titan-16-scratch + fs: ext4 + bind_targets: + - /var/lib/rancher + - /var/log - name: titan-13 class: rpi4-armbian-longhorn hostname: titan-13 diff --git a/pkg/config/config.go b/pkg/config/config.go index 6c70a80..24bea10 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -8,15 +8,16 @@ import ( // NodeConfig represents boot-time configuration to inject. type NodeConfig struct { - Hostname string `json:"hostname"` - IP string `json:"ip"` - K3s K3sConfig `json:"k3s"` - SSHUser string `json:"ssh_user,omitempty"` - SSHKeys []string `json:"ssh_keys,omitempty"` - Labels map[string]string `json:"labels,omitempty"` - Taints []string `json:"taints,omitempty"` - Fstab []FstabEntry `json:"fstab,omitempty"` - Secrets map[string]string `json:"secrets,omitempty"` // optional key/values for local agent use + Hostname string `json:"hostname"` + IP string `json:"ip"` + K3s K3sConfig `json:"k3s"` + SSHUser string `json:"ssh_user,omitempty"` + SSHKeys []string `json:"ssh_keys,omitempty"` + Labels map[string]string `json:"labels,omitempty"` + Taints []string `json:"taints,omitempty"` + Fstab []FstabEntry `json:"fstab,omitempty"` + USBScratch *USBScratchConfig `json:"usb_scratch,omitempty"` + Secrets map[string]string `json:"secrets,omitempty"` // optional key/values for local agent use } // K3sConfig includes role and token/url. @@ -32,12 +33,23 @@ type K3sConfig struct { // FstabEntry for Longhorn or other mounts. type FstabEntry struct { - UUID string `json:"uuid"` + Source string `json:"source,omitempty"` + UUID string `json:"uuid,omitempty"` + Label string `json:"label,omitempty"` Mountpoint string `json:"mountpoint"` FS string `json:"fs"` Options string `json:"options"` } +// USBScratchConfig describes a recovery USB disk and its bind mounts. +type USBScratchConfig struct { + Mountpoint string `json:"mountpoint"` + UUID string `json:"uuid,omitempty"` + Label string `json:"label,omitempty"` + FS string `json:"fs,omitempty"` + BindTargets []string `json:"bind_targets,omitempty"` +} + // Build creates a NodeConfig from inventory. func Build(inv *inventory.Inventory, nodeName string) (*NodeConfig, error) { n, cls, err := inv.FindNode(nodeName) @@ -58,6 +70,23 @@ func Build(inv *inventory.Inventory, nodeName string) (*NodeConfig, error) { k3sVersion = n.K3sVersion } + cfg := &NodeConfig{ + Hostname: n.Hostname, + IP: n.IP, + SSHUser: n.SSHUser, + SSHKeys: n.SSHAuthorized, + Labels: labels, + Taints: taints, + K3s: K3sConfig{ + Role: n.K3sRole, + Version: k3sVersion, + URL: n.K3sURL, + Token: n.K3sToken, + Labels: labels, + Taints: taints, + }, + } + fstab := []FstabEntry{} for _, d := range n.LonghornDisks { fs := d.FS @@ -71,24 +100,35 @@ func Build(inv *inventory.Inventory, nodeName string) (*NodeConfig, error) { Options: "defaults,nofail", }) } - - cfg := &NodeConfig{ - Hostname: n.Hostname, - IP: n.IP, - SSHUser: n.SSHUser, - SSHKeys: n.SSHAuthorized, - Labels: labels, - Taints: taints, - Fstab: fstab, - K3s: K3sConfig{ - Role: n.K3sRole, - Version: k3sVersion, - URL: n.K3sURL, - Token: n.K3sToken, - Labels: labels, - Taints: taints, - }, + if n.USBScratch != nil { + scratch := USBScratchConfig{ + Mountpoint: n.USBScratch.Mountpoint, + UUID: n.USBScratch.UUID, + Label: n.USBScratch.Label, + FS: n.USBScratch.FS, + BindTargets: append([]string{}, n.USBScratch.BindTargets...), + } + if scratch.FS == "" { + scratch.FS = "ext4" + } + cfg.USBScratch = &scratch + fstab = append(fstab, FstabEntry{ + UUID: scratch.UUID, + Label: scratch.Label, + Mountpoint: scratch.Mountpoint, + FS: scratch.FS, + Options: "defaults,nofail", + }) + for _, target := range scratch.BindTargets { + fstab = append(fstab, FstabEntry{ + Source: scratch.Mountpoint, + Mountpoint: target, + FS: "none", + Options: "bind,nofail", + }) + } } + cfg.Fstab = fstab if cfg.Hostname == "" || cfg.IP == "" { return nil, fmt.Errorf("hostname/ip required for node %s", nodeName) } diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index 216918e..03421cf 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -26,6 +26,12 @@ func TestBuildUsesNodeOverridesAndDefaultFilesystem(t *testing.T) { SSHUser: "atlas", SSHAuthorized: []string{"key"}, LonghornDisks: []inventory.LonghornDisk{{Mountpoint: "/mnt/data", UUID: "uuid-1"}}, + USBScratch: &inventory.USBScratchDisk{ + Mountpoint: "/mnt/scratch", + Label: "titan-13-scratch", + FS: "ext4", + BindTargets: []string{"/var/lib/rancher", "/var/log"}, + }, }}, } cfg, err := Build(&inv, "n1") @@ -38,6 +44,18 @@ func TestBuildUsesNodeOverridesAndDefaultFilesystem(t *testing.T) { if got := cfg.Fstab[0].FS; got != "ext4" { t.Fatalf("expected default filesystem ext4, got %q", got) } + if cfg.USBScratch == nil || cfg.USBScratch.Label != "titan-13-scratch" { + t.Fatalf("usb scratch missing: %#v", cfg.USBScratch) + } + if got := len(cfg.Fstab); got != 4 { + t.Fatalf("expected longhorn plus scratch fstab entries, got %d", got) + } + if got := cfg.Fstab[1].Label; got != "titan-13-scratch" { + t.Fatalf("usb scratch label = %q", got) + } + if got := cfg.Fstab[2].Source; got != "/mnt/scratch" || cfg.Fstab[2].Mountpoint != "/var/lib/rancher" { + t.Fatalf("usb bind mount = %#v", cfg.Fstab[2]) + } if got := cfg.Labels["role"]; got != "worker" { t.Fatalf("label merge lost default label: %q", got) } diff --git a/pkg/config/coverage_more_test.go b/pkg/config/coverage_more_test.go index 95b8599..3301577 100644 --- a/pkg/config/coverage_more_test.go +++ b/pkg/config/coverage_more_test.go @@ -31,15 +31,19 @@ func TestBuildBranches(t *testing.T) { SSHUser: "atlas", SSHAuthorized: []string{"ssh-ed25519 AAA"}, LonghornDisks: []inventory.LonghornDisk{{UUID: "u1", Mountpoint: "/var/lib/longhorn"}}, + USBScratch: &inventory.USBScratchDisk{Mountpoint: "/mnt/scratch", UUID: "usb-1", BindTargets: []string{"/var/lib/rancher"}}, }}, } cfg, err := Build(inv, "titan-15") if err != nil { t.Fatalf("Build: %v", err) } - if cfg.K3s.Version != "v1.31.5+k3s2" || len(cfg.Fstab) != 1 || cfg.Fstab[0].FS != "ext4" { + if cfg.K3s.Version != "v1.31.5+k3s2" || len(cfg.Fstab) != 3 || cfg.Fstab[0].FS != "ext4" { t.Fatalf("unexpected config: %#v", cfg) } + if cfg.USBScratch == nil || cfg.USBScratch.Mountpoint != "/mnt/scratch" { + t.Fatalf("expected usb scratch config: %#v", cfg.USBScratch) + } if _, err := Build(&inventory.Inventory{}, "missing"); err == nil { t.Fatal("expected Build to fail for missing node") } diff --git a/pkg/facts/aggregate.go b/pkg/facts/aggregate.go index 976c769..ea7036d 100644 --- a/pkg/facts/aggregate.go +++ b/pkg/facts/aggregate.go @@ -6,13 +6,17 @@ import ( // ClassSummary captures aggregated sentinel facts per class. type ClassSummary struct { - Class string `json:"class"` - Nodes []string `json:"nodes"` - Kernels map[string]int `json:"kernels,omitempty"` - OSImages map[string]int `json:"os_images,omitempty"` - Containerd map[string]int `json:"containerd,omitempty"` - K3sVersions map[string]int `json:"k3s_versions,omitempty"` - PackageStats map[string]map[string]int `json:"package_stats,omitempty"` // pkg -> version -> count + Class string `json:"class"` + Nodes []string `json:"nodes"` + Kernels map[string]int `json:"kernels,omitempty"` + OSImages map[string]int `json:"os_images,omitempty"` + Containerd map[string]int `json:"containerd,omitempty"` + K3sVersions map[string]int `json:"k3s_versions,omitempty"` + PackageStats map[string]map[string]int `json:"package_stats,omitempty"` // pkg -> version -> count + USBMountHealth map[string]int `json:"usb_mount_health,omitempty"` + USBUUIDHealth map[string]int `json:"usb_uuid_health,omitempty"` + USBLabelHealth map[string]int `json:"usb_label_health,omitempty"` + USBBindHealth map[string]int `json:"usb_bind_health,omitempty"` } // Aggregate groups snapshots by inventory class and tallies version drift. @@ -20,20 +24,28 @@ func Aggregate(inv *inventory.Inventory, snaps []Snapshot) map[string]*ClassSumm result := map[string]*ClassSummary{} for _, s := range snaps { class := "unknown" + var scratch *inventory.USBScratchDisk if inv != nil { - if node, cls, err := inv.FindNode(s.Hostname); err == nil && cls != nil && node != nil { + if node, cls, err := inv.FindNode(s.Hostname); node != nil && cls != nil && err == nil { class = cls.Name + scratch = node.USBScratch + } else if node != nil { + scratch = node.USBScratch } } sum, ok := result[class] if !ok { sum = &ClassSummary{ - Class: class, - Kernels: map[string]int{}, - OSImages: map[string]int{}, - Containerd: map[string]int{}, - K3sVersions: map[string]int{}, - PackageStats: map[string]map[string]int{}, + Class: class, + Kernels: map[string]int{}, + OSImages: map[string]int{}, + Containerd: map[string]int{}, + K3sVersions: map[string]int{}, + PackageStats: map[string]map[string]int{}, + USBMountHealth: map[string]int{}, + USBUUIDHealth: map[string]int{}, + USBLabelHealth: map[string]int{}, + USBBindHealth: map[string]int{}, } result[class] = sum } @@ -58,6 +70,35 @@ func Aggregate(inv *inventory.Inventory, snaps []Snapshot) map[string]*ClassSumm sum.PackageStats[pkg][ver]++ } } + addUSBHealth(sum, scratch, s.USBScratch) } return result } + +func addUSBHealth(sum *ClassSummary, desired *inventory.USBScratchDisk, observed *USBScratch) { + if desired == nil || sum == nil { + return + } + if desired.Mountpoint != "" { + sum.USBMountHealth[usbStatus(observed, observed != nil && observed.MountHealthy)]++ + } + if desired.UUID != "" { + sum.USBUUIDHealth[usbStatus(observed, observed != nil && observed.UUIDHealthy)]++ + } + if desired.Label != "" { + sum.USBLabelHealth[usbStatus(observed, observed != nil && observed.LabelHealthy)]++ + } + if len(desired.BindTargets) > 0 { + sum.USBBindHealth[usbStatus(observed, observed != nil && observed.BindHealthy)]++ + } +} + +func usbStatus(observed *USBScratch, ok bool) string { + if observed == nil { + return "missing" + } + if ok { + return "ok" + } + return "bad" +} diff --git a/pkg/facts/aggregate_test.go b/pkg/facts/aggregate_test.go index 316ad10..325dab4 100644 --- a/pkg/facts/aggregate_test.go +++ b/pkg/facts/aggregate_test.go @@ -10,12 +10,12 @@ func TestAggregateGroupsByClass(t *testing.T) { inv := &inventory.Inventory{ Classes: []inventory.NodeClass{{Name: "c1"}, {Name: "c2"}}, Nodes: []inventory.NodeSpec{ - {Name: "n1", Class: "c1"}, + {Name: "n1", Class: "c1", USBScratch: &inventory.USBScratchDisk{Mountpoint: "/mnt/scratch", Label: "scratch-1", BindTargets: []string{"/var/lib/rancher"}}}, {Name: "n2", Class: "c2"}, }, } snaps := []Snapshot{ - {Hostname: "n1", Kernel: "k1", PackageSample: map[string]string{"containerd": "2.0"}}, + {Hostname: "n1", Kernel: "k1", PackageSample: map[string]string{"containerd": "2.0"}, USBScratch: &USBScratch{Mountpoint: "/mnt/scratch", Label: "scratch-1", MountHealthy: true, LabelHealthy: true, BindHealthy: true, BindTargets: []USBBindTarget{{Path: "/var/lib/rancher", Healthy: true}}}}, {Hostname: "n2", Kernel: "k2", PackageSample: map[string]string{"containerd": "1.7"}}, {Hostname: "n1", Kernel: "k1"}, } @@ -30,6 +30,9 @@ func TestAggregateGroupsByClass(t *testing.T) { if c1.PackageStats["containerd"]["2.0"] != 1 { t.Fatalf("package stats not tallied: %#v", c1.PackageStats) } + if c1.USBMountHealth["ok"] != 1 || c1.USBLabelHealth["ok"] != 1 || c1.USBBindHealth["ok"] != 1 { + t.Fatalf("usb health not tallied: %#v", c1) + } } func TestAggregateKeepsUnknownHostnames(t *testing.T) { diff --git a/pkg/facts/load.go b/pkg/facts/load.go index d01873f..dcd4d34 100644 --- a/pkg/facts/load.go +++ b/pkg/facts/load.go @@ -16,6 +16,7 @@ type Snapshot struct { Containerd string `json:"containerd,omitempty"` PackageSample map[string]string `json:"package_sample,omitempty"` DropInsSample map[string]string `json:"dropins_sample,omitempty"` + USBScratch *USBScratch `json:"usb_scratch,omitempty"` } // LoadDir reads all *.json under a directory and returns snapshots. diff --git a/pkg/facts/load_test.go b/pkg/facts/load_test.go index 2771db5..fbb5cdc 100644 --- a/pkg/facts/load_test.go +++ b/pkg/facts/load_test.go @@ -8,7 +8,7 @@ import ( func TestLoadDirReadsSnapshots(t *testing.T) { dir := t.TempDir() - snap := `{"hostname":"n1","kernel":"k","containerd":"c","package_sample":{"a":"1"}}` + snap := `{"hostname":"n1","kernel":"k","containerd":"c","package_sample":{"a":"1"},"usb_scratch":{"mountpoint":"/mnt/scratch","label":"titan-16-scratch","mount_healthy":true,"bind_targets":[{"path":"/var/lib/rancher","healthy":true}],"bind_healthy":true}}` if err := os.WriteFile(filepath.Join(dir, "snap.json"), []byte(snap), 0o644); err != nil { t.Fatal(err) } @@ -19,6 +19,9 @@ func TestLoadDirReadsSnapshots(t *testing.T) { if len(got) != 1 || got[0].Hostname != "n1" || got[0].PackageSample["a"] != "1" { t.Fatalf("unexpected snapshot: %+v", got) } + if got[0].USBScratch == nil || got[0].USBScratch.Label != "titan-16-scratch" || len(got[0].USBScratch.BindTargets) != 1 { + t.Fatalf("unexpected usb scratch snapshot: %+v", got[0].USBScratch) + } } func TestLoadDirRejectsInvalidJSON(t *testing.T) { diff --git a/pkg/facts/types.go b/pkg/facts/types.go index a5dbb8d..0760f00 100644 --- a/pkg/facts/types.go +++ b/pkg/facts/types.go @@ -1,21 +1,45 @@ package facts +// USBBindTarget captures a bind mount and whether it looked healthy. +type USBBindTarget struct { + Path string `json:"path,omitempty"` + Healthy bool `json:"healthy,omitempty"` +} + +// USBScratch captures the desired scratch-disk configuration plus health. +type USBScratch struct { + Mountpoint string `json:"mountpoint,omitempty"` + UUID string `json:"uuid,omitempty"` + Label string `json:"label,omitempty"` + FS string `json:"fs,omitempty"` + MountHealthy bool `json:"mount_healthy,omitempty"` + UUIDHealthy bool `json:"uuid_healthy,omitempty"` + LabelHealthy bool `json:"label_healthy,omitempty"` + BindTargets []USBBindTarget `json:"bind_targets,omitempty"` + BindHealthy bool `json:"bind_healthy,omitempty"` +} + // ClassFacts captures driftable state collected by metis-sentinel. type ClassFacts struct { - ClassName string `json:"class_name"` - Kernel string `json:"kernel,omitempty"` - K3sVersion string `json:"k3s_version,omitempty"` - Containerd string `json:"containerd,omitempty"` - Packages map[string]string `json:"packages,omitempty"` // name -> version - DropIns map[string]string `json:"dropins,omitempty"` // path -> content - Sysctl map[string]string `json:"sysctl,omitempty"` // key -> value - CGroupConfig map[string]string `json:"cgroup_config,omitempty"` // key -> value - Notes string `json:"notes,omitempty"` + ClassName string `json:"class_name"` + Kernel string `json:"kernel,omitempty"` + K3sVersion string `json:"k3s_version,omitempty"` + Containerd string `json:"containerd,omitempty"` + Packages map[string]string `json:"packages,omitempty"` // name -> version + DropIns map[string]string `json:"dropins,omitempty"` // path -> content + Sysctl map[string]string `json:"sysctl,omitempty"` // key -> value + CGroupConfig map[string]string `json:"cgroup_config,omitempty"` // key -> value + USBMountHealth map[string]int `json:"usb_mount_health,omitempty"` + USBUUIDHealth map[string]int `json:"usb_uuid_health,omitempty"` + USBLabelHealth map[string]int `json:"usb_label_health,omitempty"` + USBBindHealth map[string]int `json:"usb_bind_health,omitempty"` + Notes string `json:"notes,omitempty"` } // NodeFacts captures per-node data (e.g., disk UUIDs) to verify drift. type NodeFacts struct { - Hostname string `json:"hostname"` - Disks map[string]string `json:"disks,omitempty"` // mount -> UUID - Notes string `json:"notes,omitempty"` + Hostname string `json:"hostname"` + Disks map[string]string `json:"disks,omitempty"` // mount -> UUID + USBScratch *USBScratch `json:"usb_scratch,omitempty"` + Notes string `json:"notes,omitempty"` } diff --git a/pkg/inventory/types.go b/pkg/inventory/types.go index b3ffae1..3ced19b 100644 --- a/pkg/inventory/types.go +++ b/pkg/inventory/types.go @@ -44,6 +44,7 @@ type NodeSpec struct { Labels map[string]string `yaml:"labels,omitempty" json:"labels,omitempty"` Taints []string `yaml:"taints,omitempty" json:"taints,omitempty"` LonghornDisks []LonghornDisk `yaml:"longhorn_disks,omitempty" json:"longhorn_disks,omitempty"` + USBScratch *USBScratchDisk `yaml:"usb_scratch,omitempty" json:"usb_scratch,omitempty"` SSHUser string `yaml:"ssh_user,omitempty" json:"ssh_user,omitempty"` SSHAuthorized []string `yaml:"ssh_authorized_keys,omitempty" json:"ssh_authorized_keys,omitempty"` Notes string `yaml:"notes,omitempty" json:"notes,omitempty"` @@ -56,6 +57,15 @@ type LonghornDisk struct { FS string `yaml:"fs,omitempty" json:"fs,omitempty"` } +// USBScratchDisk describes the recovery USB disk and its bind targets. +type USBScratchDisk struct { + Mountpoint string `yaml:"mountpoint" json:"mountpoint"` + UUID string `yaml:"uuid,omitempty" json:"uuid,omitempty"` + Label string `yaml:"label,omitempty" json:"label,omitempty"` + FS string `yaml:"fs,omitempty" json:"fs,omitempty"` + BindTargets []string `yaml:"bind_targets,omitempty" json:"bind_targets,omitempty"` +} + // Load reads and parses an inventory file. func Load(path string) (*Inventory, error) { data, err := os.ReadFile(path) @@ -121,6 +131,21 @@ func expandInventory(inv *Inventory) { inv.Nodes[idx].LonghornDisks[diskIdx].UUID = os.ExpandEnv(inv.Nodes[idx].LonghornDisks[diskIdx].UUID) inv.Nodes[idx].LonghornDisks[diskIdx].FS = os.ExpandEnv(inv.Nodes[idx].LonghornDisks[diskIdx].FS) } + if inv.Nodes[idx].USBScratch != nil { + inv.Nodes[idx].USBScratch.Mountpoint = os.ExpandEnv(inv.Nodes[idx].USBScratch.Mountpoint) + inv.Nodes[idx].USBScratch.UUID = os.ExpandEnv(inv.Nodes[idx].USBScratch.UUID) + inv.Nodes[idx].USBScratch.Label = os.ExpandEnv(inv.Nodes[idx].USBScratch.Label) + inv.Nodes[idx].USBScratch.FS = os.ExpandEnv(inv.Nodes[idx].USBScratch.FS) + bindTargets := make([]string, 0, len(inv.Nodes[idx].USBScratch.BindTargets)) + for _, value := range inv.Nodes[idx].USBScratch.BindTargets { + expanded := strings.TrimSpace(os.ExpandEnv(value)) + if expanded == "" { + continue + } + bindTargets = append(bindTargets, expanded) + } + inv.Nodes[idx].USBScratch.BindTargets = bindTargets + } } } diff --git a/pkg/inventory/types_test.go b/pkg/inventory/types_test.go index 18831d3..d5aaca4 100644 --- a/pkg/inventory/types_test.go +++ b/pkg/inventory/types_test.go @@ -9,6 +9,9 @@ import ( func TestLoadExpandsEnvironmentVariables(t *testing.T) { t.Setenv("METIS_IMAGE_PATH", "file:///tmp/rpi4.img") t.Setenv("METIS_K3S_TOKEN", "secret-token") + t.Setenv("METIS_USB_MOUNT", "/mnt/usb") + t.Setenv("METIS_USB_LABEL", "titan-13-scratch") + t.Setenv("METIS_USB_BIND", "/var/lib/rancher") invPath := filepath.Join(t.TempDir(), "inventory.yaml") if err := os.WriteFile(invPath, []byte(` classes: @@ -22,6 +25,12 @@ nodes: ip: 192.168.22.41 k3s_role: agent k3s_token: ${METIS_K3S_TOKEN} + usb_scratch: + mountpoint: ${METIS_USB_MOUNT} + label: ${METIS_USB_LABEL} + fs: ext4 + bind_targets: + - ${METIS_USB_BIND} `), 0o644); err != nil { t.Fatal(err) } @@ -40,6 +49,12 @@ nodes: if node.K3sToken != "secret-token" { t.Fatalf("token not expanded: %q", node.K3sToken) } + if node.USBScratch == nil || node.USBScratch.Mountpoint != "/mnt/usb" || node.USBScratch.Label != "titan-13-scratch" { + t.Fatalf("usb scratch not expanded: %#v", node.USBScratch) + } + if len(node.USBScratch.BindTargets) != 1 || node.USBScratch.BindTargets[0] != "/var/lib/rancher" { + t.Fatalf("usb bind target not expanded: %#v", node.USBScratch.BindTargets) + } } func TestFindNodeReturnsClassMissingError(t *testing.T) { diff --git a/pkg/plan/inject.go b/pkg/plan/inject.go index 4de2d9b..c5728e4 100644 --- a/pkg/plan/inject.go +++ b/pkg/plan/inject.go @@ -344,9 +344,20 @@ LinkLocalAddressing=no func fstabAppendContent(cfg *config.NodeConfig) string { var lines []string for _, entry := range cfg.Fstab { + source := entry.Source + switch { + case source != "": + // Use the explicit source path for bind mounts. + case entry.UUID != "": + source = "UUID=" + entry.UUID + case entry.Label != "": + source = "LABEL=" + entry.Label + default: + source = "none" + } lines = append(lines, fmt.Sprintf( - "UUID=%s %s %s %s 0 0", - entry.UUID, + "%s %s %s %s 0 0", + source, entry.Mountpoint, entry.FS, entry.Options, diff --git a/pkg/plan/inject_test.go b/pkg/plan/inject_test.go index df67551..e42ba88 100644 --- a/pkg/plan/inject_test.go +++ b/pkg/plan/inject_test.go @@ -30,6 +30,18 @@ func TestBuildFilesProducesK3sConfig(t *testing.T) { FS: "ext4", Options: "defaults,nofail", }, + { + UUID: "usb-uuid", + Mountpoint: "/mnt/scratch", + FS: "ext4", + Options: "defaults,nofail", + }, + { + Source: "/mnt/scratch", + Mountpoint: "/var/lib/rancher", + FS: "none", + Options: "bind,nofail", + }, }, Labels: map[string]string{"role": "worker", "zone": "a", "node-role.kubernetes.io/worker": "true"}, Taints: []string{"gpu=true:NoSchedule"}, @@ -80,7 +92,7 @@ func TestBuildFilesProducesK3sConfig(t *testing.T) { t.Fatalf("systemd-networkd config missing/incorrect: %s", networkd) } fstab, ok := pathMap["etc/metis/fstab.append"] - if !ok || !strings.Contains(fstab, "UUID=disk-uuid /mnt/astreae ext4 defaults,nofail 0 0") { + if !ok || !strings.Contains(fstab, "UUID=disk-uuid /mnt/astreae ext4 defaults,nofail 0 0") || !strings.Contains(fstab, "UUID=usb-uuid /mnt/scratch ext4 defaults,nofail 0 0") || !strings.Contains(fstab, "/mnt/scratch /var/lib/rancher none bind,nofail 0 0") { t.Fatalf("fstab append missing/incorrect: %s", fstab) } } diff --git a/pkg/sentinel/collector.go b/pkg/sentinel/collector.go index 7c9da8e..42be66f 100644 --- a/pkg/sentinel/collector.go +++ b/pkg/sentinel/collector.go @@ -1,11 +1,26 @@ package sentinel import ( + "encoding/json" "os" "os/exec" "strings" + + "metis/pkg/facts" ) +type nodeConfig struct { + USBScratch *usbScratchConfig `json:"usb_scratch,omitempty"` +} + +type usbScratchConfig struct { + Mountpoint string `json:"mountpoint,omitempty"` + UUID string `json:"uuid,omitempty"` + Label string `json:"label,omitempty"` + FS string `json:"fs,omitempty"` + BindTargets []string `json:"bind_targets,omitempty"` +} + // Snapshot captures host-level facts. type Snapshot struct { Hostname string `json:"hostname,omitempty"` @@ -15,6 +30,7 @@ type Snapshot struct { Containerd string `json:"containerd,omitempty"` PackageSample map[string]string `json:"package_sample,omitempty"` // small subset to detect drift DropInsSample map[string]string `json:"dropins_sample,omitempty"` // path->content hash/sample + USBScratch *facts.USBScratch `json:"usb_scratch,omitempty"` Notes string `json:"notes,omitempty"` } @@ -27,9 +43,76 @@ func Collect() *Snapshot { K3sVersion: runAndTrim("k3s", "version"), Containerd: runAndTrim("containerd", "--version"), PackageSample: pkgSample(), + USBScratch: collectUSBScratch(), } } +func collectUSBScratch() *facts.USBScratch { + raw, err := commandOutput("cat", "/etc/metis/node.json") + if err != nil || len(strings.TrimSpace(string(raw))) == 0 { + return nil + } + var cfg nodeConfig + if err := json.Unmarshal(raw, &cfg); err != nil || cfg.USBScratch == nil { + return nil + } + desired := cfg.USBScratch + scratch := &facts.USBScratch{ + Mountpoint: desired.Mountpoint, + UUID: desired.UUID, + Label: desired.Label, + FS: desired.FS, + } + + source, fsType, mounted := mountInfo(desired.Mountpoint) + scratch.MountHealthy = mounted && strings.TrimSpace(source) != "" + if scratch.MountHealthy && desired.FS != "" && fsType != "" { + scratch.MountHealthy = strings.EqualFold(fsType, desired.FS) + } + if scratch.FS == "" && fsType != "" { + scratch.FS = fsType + } + + device := source + if device == "" && desired.UUID != "" { + device = resolveDeviceByUUID(desired.UUID) + } + if device == "" && desired.Label != "" { + device = resolveDeviceByLabel(desired.Label) + } + if device != "" { + export := blkidExport(device) + if desired.UUID != "" { + scratch.UUIDHealthy = export["UUID"] == desired.UUID + } + if desired.Label != "" { + scratch.LabelHealthy = export["LABEL"] == desired.Label + } + if scratch.FS == "" { + scratch.FS = export["TYPE"] + } + } + + healthy := true + if len(desired.BindTargets) > 0 { + scratch.BindTargets = make([]facts.USBBindTarget, 0, len(desired.BindTargets)) + for _, target := range desired.BindTargets { + ok := bindHealthy(target, desired.Mountpoint) + if !ok { + healthy = false + } + scratch.BindTargets = append(scratch.BindTargets, facts.USBBindTarget{ + Path: target, + Healthy: ok, + }) + } + scratch.BindHealthy = healthy + } else { + scratch.BindHealthy = true + } + return scratch +} + func runAndTrim(cmd string, args ...string) string { out, err := commandOutput(cmd, args...) if err != nil { @@ -78,6 +161,71 @@ func pkgVersion(name string) string { return "" } +func mountInfo(target string) (string, string, bool) { + target = strings.TrimSpace(target) + if target == "" { + return "", "", false + } + out, err := commandOutput("findmnt", "-P", "-n", "-T", target, "-o", "SOURCE,TARGET,FSTYPE") + if err != nil { + return "", "", false + } + fields := parseKeyValues(string(out)) + source := fields["SOURCE"] + fsType := fields["FSTYPE"] + return source, fsType, strings.TrimSpace(fields["TARGET"]) == target +} + +func bindHealthy(target, source string) bool { + target = strings.TrimSpace(target) + source = strings.TrimSpace(source) + if target == "" || source == "" { + return false + } + mountSource, _, mounted := mountInfo(target) + return mounted && strings.TrimSpace(mountSource) == source +} + +func resolveDeviceByUUID(uuid string) string { + uuid = strings.TrimSpace(uuid) + if uuid == "" { + return "" + } + return runAndTrim("blkid", "-U", uuid) +} + +func resolveDeviceByLabel(label string) string { + label = strings.TrimSpace(label) + if label == "" { + return "" + } + return runAndTrim("blkid", "-L", label) +} + +func blkidExport(device string) map[string]string { + device = strings.TrimSpace(device) + if device == "" { + return map[string]string{} + } + out, err := commandOutput("blkid", "-o", "export", device) + if err != nil { + return map[string]string{} + } + return parseKeyValues(string(out)) +} + +func parseKeyValues(out string) map[string]string { + result := map[string]string{} + for _, field := range strings.Fields(strings.TrimSpace(out)) { + key, value, ok := strings.Cut(field, "=") + if !ok { + continue + } + result[key] = strings.Trim(value, `"`) + } + return result +} + func commandOutput(cmd string, args ...string) ([]byte, error) { if os.Getenv("METIS_SENTINEL_NSENTER") == "1" { nsenterArgs := []string{"-t", "1", "-m", "-u", "-n", "-i", "-p", "--", cmd} diff --git a/pkg/sentinel/collector_test.go b/pkg/sentinel/collector_test.go index 4f67b41..c3129d6 100644 --- a/pkg/sentinel/collector_test.go +++ b/pkg/sentinel/collector_test.go @@ -21,6 +21,9 @@ func TestCollectUsesCommandOutputAndPkgSample(t *testing.T) { if len(snap.PackageSample) != 4 || snap.PackageSample["k3s"] != "v1.31.5+k3s1" { t.Fatalf("unexpected package sample: %+v", snap.PackageSample) } + if snap.USBScratch == nil || snap.USBScratch.Label != "titan-16-scratch" || !snap.USBScratch.MountHealthy || !snap.USBScratch.LabelHealthy || !snap.USBScratch.BindHealthy { + t.Fatalf("unexpected usb scratch sample: %+v", snap.USBScratch) + } } func TestCommandOutputUsesNsenterWhenRequested(t *testing.T) { @@ -73,7 +76,30 @@ func fakeSentinelCommands(t *testing.T) string { write("uname", `printf '6.6.63\n'`) write("k3s", `printf 'v1.31.5+k3s1\n'`) write("containerd", `printf '1.7.99\n'`) - write("cat", `printf 'PRETTY_NAME="Metis OS"\n'`) + write("cat", `case "${1:-}" in + /etc/os-release) printf 'PRETTY_NAME="Metis OS"\n' ;; + /etc/metis/node.json) printf '%s\n' '{"usb_scratch":{"mountpoint":"/mnt/scratch","label":"titan-16-scratch","fs":"ext4","bind_targets":["/var/lib/rancher","/var/log"]}}' ;; + *) printf 'PRETTY_NAME="Metis OS"\n' ;; +esac`) + write("findmnt", `target="" +for ((i=1; i<=$#; i++)); do + if [[ "${!i}" == "-T" ]]; then + j=$((i + 1)) + target="${!j}" + break + fi +done +case "${target}" in + /mnt/scratch) printf 'SOURCE="/dev/sdz1" TARGET="/mnt/scratch" FSTYPE="ext4"\n' ;; + /var/lib/rancher) printf 'SOURCE="/mnt/scratch" TARGET="/var/lib/rancher" FSTYPE="none"\n' ;; + /var/log) printf 'SOURCE="/mnt/scratch" TARGET="/var/log" FSTYPE="none"\n' ;; + *) exit 1 ;; +esac`) + write("blkid", `case "${1:-}" in + -U) printf '/dev/sdz1\n' ;; + -L) printf '/dev/sdz1\n' ;; + -o) printf 'UUID=titan-16-uuid\nLABEL=titan-16-scratch\nTYPE=ext4\n' ;; +esac`) write("dpkg-query", `case "${@: -1}" in containerd) printf '1.7.99\n' ;; k3s) printf 'v1.31.5+k3s1\n' ;;