fix(metis): publish scratch health from sentinels

This commit is contained in:
codex 2026-04-22 03:57:55 -03:00
parent 77d34e4f1d
commit c972f226da
6 changed files with 347 additions and 6 deletions

View File

@ -49,12 +49,8 @@ func Collect() *Snapshot {
}
func collectUSBScratch() *facts.USBScratch {
raw, err := commandOutput("cat", "/etc/metis/node.json")
if err != nil || len(strings.TrimSpace(string(raw))) == 0 {
return nil
}
var cfg nodeConfig
if err := json.Unmarshal(raw, &cfg); err != nil || cfg.USBScratch == nil {
cfg, ok := loadNodeConfig()
if !ok || cfg.USBScratch == nil {
return nil
}
desired := cfg.USBScratch
@ -114,6 +110,75 @@ func collectUSBScratch() *facts.USBScratch {
return scratch
}
// loadNodeConfig returns the node configuration used for USB scratch
// collection, and whether one was found.
//
// It first reads /etc/metis/node.json (via `cat`). That file is used only when
// it can be read, is not blank, parses as JSON and carries a USBScratch
// section. In every other case the managed usb-scratch block in /etc/fstab is
// parsed instead; a config produced that way has only USBScratch populated.
func loadNodeConfig() (nodeConfig, bool) {
	if data, err := commandOutput("cat", "/etc/metis/node.json"); err == nil && strings.TrimSpace(string(data)) != "" {
		var parsed nodeConfig
		if json.Unmarshal(data, &parsed) == nil && parsed.USBScratch != nil {
			return parsed, true
		}
	}

	// Fallback: derive the scratch definition from fstab.
	fstab, err := commandOutput("cat", "/etc/fstab")
	if err != nil {
		return nodeConfig{}, false
	}
	scratch, found := parseUSBScratchFstab(string(fstab))
	if !found {
		return nodeConfig{}, false
	}
	return nodeConfig{USBScratch: scratch}, true
}
// parseUSBScratchFstab extracts a USB scratch definition from fstab text.
//
// Only entries between the "# BEGIN maintenance.bstein.dev usb-scratch" and
// "# END maintenance.bstein.dev usb-scratch" marker lines are considered;
// blank lines, comments and entries with fewer than four fields are ignored.
// Within the block:
//   - an entry of type "none" with the "bind" option adds its mount target to
//     BindTargets;
//   - tmpfs entries (by type, or by a case-insensitive "tmpfs" source) are
//     skipped;
//   - any other entry sets Mountpoint and FS, plus UUID or Label when the
//     source is written as UUID=... or LABEL=.... If several such entries are
//     present, later ones overwrite the fields they set.
//
// It returns (nil, false) when no entry supplied a mountpoint.
func parseUSBScratchFstab(raw string) (*usbScratchConfig, bool) {
	const (
		beginMarker = "# BEGIN maintenance.bstein.dev usb-scratch"
		endMarker   = "# END maintenance.bstein.dev usb-scratch"
	)

	parsed := &usbScratchConfig{}
	managed := false
	for _, entry := range strings.Split(raw, "\n") {
		entry = strings.TrimSpace(entry)

		// Marker lines toggle whether we are inside the managed block.
		if strings.HasPrefix(entry, beginMarker) {
			managed = true
			continue
		}
		if strings.HasPrefix(entry, endMarker) {
			managed = false
			continue
		}
		if !managed || entry == "" || strings.HasPrefix(entry, "#") {
			continue
		}

		cols := strings.Fields(entry)
		if len(cols) < 4 {
			continue
		}
		device, mountDir, kind, opts := cols[0], cols[1], cols[2], cols[3]

		switch {
		case kind == "none" && hasFstabOption(opts, "bind"):
			parsed.BindTargets = append(parsed.BindTargets, mountDir)
		case kind == "tmpfs" || strings.EqualFold(device, "tmpfs"):
			// tmpfs entries inside the block are not part of the scratch device.
		default:
			parsed.Mountpoint = mountDir
			parsed.FS = kind
			if id, isUUID := strings.CutPrefix(device, "UUID="); isUUID {
				parsed.UUID = id
			}
			if name, isLabel := strings.CutPrefix(device, "LABEL="); isLabel {
				parsed.Label = name
			}
		}
	}

	if parsed.Mountpoint != "" {
		return parsed, true
	}
	return nil, false
}
// hasFstabOption reports whether the comma-separated fstab option list
// contains an option exactly equal to want. Matching is on whole options, so
// "rbind" or "bind=x" do not match "bind".
func hasFstabOption(options, want string) bool {
	rest := options
	for {
		head, tail, more := strings.Cut(rest, ",")
		if head == want {
			return true
		}
		if !more {
			return false
		}
		rest = tail
	}
}
func runAndTrim(cmd string, args ...string) string {
out, err := commandOutput(cmd, args...)
if err != nil {

View File

@ -120,6 +120,61 @@ esac`,
}
}
// TestCollectUSBScratchFallsBackToManagedFstabBlock checks that
// collectUSBScratch still reports a healthy scratch device when
// /etc/metis/node.json cannot be read and the only description of the device
// is the managed usb-scratch block in /etc/fstab.
func TestCollectUSBScratchFallsBackToManagedFstabBlock(t *testing.T) {
// Shell bodies for the external commands the collector runs.
// NOTE(review): fakeCollectorCommands is defined elsewhere; presumably it
// installs each script as an executable named after its key inside the
// returned directory — confirm against the helper.
dir := fakeCollectorCommands(t, map[string]string{
// cat: fail for node.json, and print a managed fstab block (one device
// entry, one tmpfs entry, two bind entries) for /etc/fstab.
"cat": `case "${1:-}" in
/etc/metis/node.json) exit 1 ;;
/etc/fstab) printf '%s\n' '# BEGIN maintenance.bstein.dev usb-scratch' \
'UUID=usb-1 /mnt/astraios ext4 defaults,noatime 0 2' \
'tmpfs /tmp tmpfs defaults,nosuid,nodev,mode=1777 0 0' \
'/mnt/astraios/var/log/pods /var/log/pods none bind,x-systemd.requires-mounts-for=/mnt/astraios 0 0' \
'/mnt/astraios/var/tmp /var/tmp none bind,x-systemd.requires-mounts-for=/mnt/astraios 0 0' \
'# END maintenance.bstein.dev usb-scratch'
;;
*) exit 1 ;;
esac`,
// findmnt: take the argument that follows -T as the target and print
// SOURCE/TARGET/FSTYPE pairs for the three known paths; fail otherwise.
"findmnt": `target=""
for ((i=1; i<=$#; i++)); do
if [[ "${!i}" == "-T" ]]; then
j=$((i + 1))
target="${!j}"
break
fi
done
case "${target}" in
/mnt/astraios) printf 'SOURCE="/dev/sda1" TARGET="/mnt/astraios" FSTYPE="ext4"\n' ;;
/var/log/pods) printf 'SOURCE="/dev/sda1[/var/log/pods]" TARGET="/var/log/pods" FSTYPE="none"\n' ;;
/var/tmp) printf 'SOURCE="/dev/sda1[/var/tmp]" TARGET="/var/tmp" FSTYPE="none"\n' ;;
*) exit 1 ;;
esac`,
// readlink: keyed on the second argument; the two bind targets resolve to
// paths underneath the scratch mountpoint.
"readlink": `case "${2:-}" in
/mnt/astraios) printf '/mnt/astraios\n' ;;
/var/log/pods) printf '/mnt/astraios/var/log/pods\n' ;;
/var/tmp) printf '/mnt/astraios/var/tmp\n' ;;
*) exit 1 ;;
esac`,
// blkid: -U prints the device node, -o prints UUID and TYPE lines that
// match the fstab entry above.
"blkid": `case "${1:-}" in
-U) printf '/dev/sda1\n' ;;
-o) printf 'UUID=usb-1\nTYPE=ext4\n' ;;
esac`,
})
// Put the fake-command directory ahead of the existing PATH entries.
t.Setenv("PATH", dir+string(os.PathListSeparator)+os.Getenv("PATH"))
scratch := collectUSBScratch()
if scratch == nil {
t.Fatal("expected USB scratch data from fstab")
}
// Identity must come from the non-tmpfs, non-bind fstab entry.
if scratch.Mountpoint != "/mnt/astraios" || scratch.UUID != "usb-1" || scratch.FS != "ext4" {
t.Fatalf("unexpected scratch identity: %#v", scratch)
}
if !scratch.MountHealthy || !scratch.UUIDHealthy || !scratch.BindHealthy {
t.Fatalf("expected fstab-derived scratch to be healthy: %#v", scratch)
}
// Exactly the two bind entries, in fstab order; the tmpfs entry must not
// show up as a bind target.
if len(scratch.BindTargets) != 2 || scratch.BindTargets[0].Path != "/var/log/pods" || scratch.BindTargets[1].Path != "/var/tmp" {
t.Fatalf("unexpected bind targets: %#v", scratch.BindTargets)
}
}
func TestBindHealthyAcceptsScratchSymlinksAndDeviceSubpaths(t *testing.T) {
dir := fakeCollectorCommands(t, map[string]string{
"readlink": `case "${2:-}" in

View File

@ -261,6 +261,9 @@ func (a *App) StoreSnapshot(record SnapshotRecord) error {
return err
}
a.metrics.RecordSnapshot(record.Node, "ok", record.CollectedAt)
if err := a.syncScratchAnnotations(record); err != nil {
a.appendEvent(annotationSyncEvent(record.Node, err))
}
a.appendEvent(Event{
Time: record.CollectedAt,
Kind: "sentinel.snapshot",
@ -287,6 +290,7 @@ func (a *App) WatchSentinel() (*Event, error) {
Containerd: firstLine(snap.Snapshot.Containerd),
PackageSample: snap.Snapshot.PackageSample,
DropInsSample: snap.Snapshot.DropInsSample,
USBScratch: snap.Snapshot.USBScratch,
})
}
prevTargets := map[string]facts.Targets{}

View File

@ -105,6 +105,46 @@ func (k *kubeClient) jsonRequest(method, path string, body any, out any) error {
return json.NewDecoder(io.LimitReader(resp.Body, 1<<20)).Decode(out)
}
// mergePatch JSON-encodes body and sends it as a PATCH request with
// Content-Type application/merge-patch+json to k.baseURL+path, authenticating
// with the client's bearer token.
//
// It returns an error when the body cannot be encoded, the request cannot be
// built or sent, or the server answers with a status of 300 or above; in the
// latter case up to 4 KiB of the response body is included in the error. The
// response payload of a successful patch is not decoded.
func (k *kubeClient) mergePatch(path string, body any) error {
	data, err := json.Marshal(body)
	if err != nil {
		return err
	}
	req, err := http.NewRequest(http.MethodPatch, k.baseURL+path, bytes.NewReader(data))
	if err != nil {
		return err
	}
	req.Header.Set("Authorization", "Bearer "+k.token)
	req.Header.Set("Content-Type", "application/merge-patch+json")

	resp, err := k.client.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 300 {
		// Best effort: a truncated or unreadable payload still yields a
		// useful error from the status line alone.
		payload, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
		return fmt.Errorf("patch %s failed: %s: %s", path, resp.Status, strings.TrimSpace(string(payload)))
	}

	// Drain the (unused) success payload so the transport can reuse the
	// keep-alive connection instead of dialing again for the next patch. The
	// read is bounded to the same 1 MiB that jsonRequest decodes, and a drain
	// failure is not an error for the caller: the patch already succeeded.
	_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, 1<<20))
	return nil
}
// patchNodeAnnotations merge-patches annotations into metadata.annotations of
// the node at /api/v1/nodes/<node>, using a client from kubeClientFactory.
//
// The node name is trimmed of surrounding whitespace and path-escaped. A blank
// node name or an empty annotation set is a no-op that returns nil.
func patchNodeAnnotations(node string, annotations map[string]string) error {
	name := strings.TrimSpace(node)
	if name == "" {
		return nil
	}
	if len(annotations) == 0 {
		return nil
	}

	client, err := kubeClientFactory()
	if err != nil {
		return err
	}

	metadata := map[string]any{"annotations": annotations}
	patch := map[string]any{"metadata": metadata}
	return client.mergePatch("/api/v1/nodes/"+url.PathEscape(name), patch)
}
func (k *kubeClient) deleteRequest(path string) error {
req, err := http.NewRequest(http.MethodDelete, k.baseURL+path, nil)
if err != nil {

View File

@ -0,0 +1,85 @@
package service
import (
"fmt"
"strings"
"time"
"metis/pkg/facts"
)
// syncScratchAnnotations publishes the USB scratch health carried by record as
// annotations on the record's node. Snapshots without USB scratch facts, or
// that produce no annotations, are skipped and return nil; otherwise the
// result of patchNodeAnnotations is returned.
func (a *App) syncScratchAnnotations(record SnapshotRecord) error {
	if record.Snapshot.USBScratch == nil {
		return nil
	}
	if annotations := scratchHealthAnnotations(record.Snapshot.USBScratch, record.CollectedAt); len(annotations) > 0 {
		return patchNodeAnnotations(record.Node, annotations)
	}
	return nil
}
// scratchHealthAnnotations renders scratch as node annotations.
//
// The same set of keys is written under two prefixes,
// "maintenance.bstein.dev/usb-scratch" and "maintenance.bstein.dev/astraios":
//
//	-status         status from scratchStatusDetail
//	-detail         detail from scratchStatusDetail
//	-mountpoint     scratch.Mountpoint
//	-managed-paths  non-blank bind target paths joined with "_"
//	-last-observed  observedAt in UTC, RFC 3339
//	-selector       "UUID=<uuid>", else "LABEL=<label>"; omitted when neither is set
//
// scratch must be non-nil: its fields are read unconditionally.
func scratchHealthAnnotations(scratch *facts.USBScratch, observedAt time.Time) map[string]string {
	status, detail := scratchStatusDetail(scratch)

	// UUID takes precedence over label when both are known.
	var selector string
	switch {
	case scratch.UUID != "":
		selector = "UUID=" + scratch.UUID
	case scratch.Label != "":
		selector = "LABEL=" + scratch.Label
	}

	paths := make([]string, 0, len(scratch.BindTargets))
	for i := range scratch.BindTargets {
		if p := scratch.BindTargets[i].Path; strings.TrimSpace(p) != "" {
			paths = append(paths, p)
		}
	}
	joinedPaths := strings.Join(paths, "_")
	stamp := observedAt.UTC().Format(time.RFC3339)

	out := map[string]string{}
	for _, family := range [...]string{"usb-scratch", "astraios"} {
		base := "maintenance.bstein.dev/" + family
		out[base+"-status"] = status
		out[base+"-detail"] = detail
		out[base+"-mountpoint"] = scratch.Mountpoint
		out[base+"-managed-paths"] = joinedPaths
		out[base+"-last-observed"] = stamp
		if selector == "" {
			continue
		}
		out[base+"-selector"] = selector
	}
	return out
}
// scratchStatusDetail summarises scratch as a (status, detail) pair:
//
//   - ("missing", "no-scratch-snapshot") when scratch is nil;
//   - ("ok", "healthy") when no check fails;
//   - ("error", reasons) otherwise, where reasons is a comma-separated list
//     drawn, in this order, from "mount-unhealthy", "uuid-mismatch",
//     "label-mismatch" and "bind-mount-incomplete".
//
// The UUID and label checks only apply when the corresponding identifier is
// set on scratch.
func scratchStatusDetail(scratch *facts.USBScratch) (string, string) {
	if scratch == nil {
		return "missing", "no-scratch-snapshot"
	}

	checks := []struct {
		failed bool
		reason string
	}{
		{!scratch.MountHealthy, "mount-unhealthy"},
		{scratch.UUID != "" && !scratch.UUIDHealthy, "uuid-mismatch"},
		{scratch.Label != "" && !scratch.LabelHealthy, "label-mismatch"},
		{!scratch.BindHealthy, "bind-mount-incomplete"},
	}

	var reasons []string
	for _, check := range checks {
		if check.failed {
			reasons = append(reasons, check.reason)
		}
	}
	if len(reasons) > 0 {
		return "error", strings.Join(reasons, ",")
	}
	return "ok", "healthy"
}
// annotationSyncEvent builds the "sentinel.annotation" event recorded when
// scratch annotations could not be synced for node. The event is stamped with
// the current UTC time and carries the node name and err's message in its
// details. err must be non-nil.
func annotationSyncEvent(node string, err error) Event {
	var ev Event
	ev.Time = time.Now().UTC()
	ev.Kind = "sentinel.annotation"
	ev.Summary = fmt.Sprintf("Could not sync scratch annotations for %s", node)
	ev.Details = map[string]any{
		"node":  node,
		"error": err.Error(),
	}
	return ev
}

View File

@ -0,0 +1,92 @@
package service
import (
"encoding/json"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"
"metis/pkg/facts"
"metis/pkg/sentinel"
)
func TestScratchHealthAnnotations(t *testing.T) {
observed := time.Date(2026, 4, 22, 6, 45, 0, 0, time.UTC)
annotations := scratchHealthAnnotations(&facts.USBScratch{
Mountpoint: "/mnt/astraios",
UUID: "usb-1",
FS: "ext4",
MountHealthy: true,
UUIDHealthy: true,
BindHealthy: true,
BindTargets: []facts.USBBindTarget{{Path: "/var/log/pods", Healthy: true}, {Path: "/var/tmp", Healthy: true}},
}, observed)
if annotations["maintenance.bstein.dev/usb-scratch-status"] != "ok" || annotations["maintenance.bstein.dev/astraios-detail"] != "healthy" {
t.Fatalf("unexpected healthy annotations: %#v", annotations)
}
if annotations["maintenance.bstein.dev/usb-scratch-selector"] != "UUID=usb-1" {
t.Fatalf("selector annotation missing: %#v", annotations)
}
if annotations["maintenance.bstein.dev/astraios-managed-paths"] != "/var/log/pods_/var/tmp" {
t.Fatalf("managed paths annotation mismatch: %#v", annotations)
}
status, detail := scratchStatusDetail(&facts.USBScratch{MountHealthy: false, BindHealthy: false})
if status != "error" || !strings.Contains(detail, "mount-unhealthy") || !strings.Contains(detail, "bind-mount-incomplete") {
t.Fatalf("unexpected unhealthy detail: %s %s", status, detail)
}
}
// TestStoreSnapshotPatchesNodeAnnotations verifies that StoreSnapshot, given a
// snapshot that carries USB scratch facts, sends a merge patch containing the
// scratch health annotations to the node's API path.
func TestStoreSnapshotPatchesNodeAnnotations(t *testing.T) {
	// Captured by the fake API server and inspected once StoreSnapshot returns.
	var (
		patchPath        string
		patchContentType string
		patchBody        map[string]any
	)
	kube := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.Method != http.MethodPatch || r.URL.Path != "/api/v1/nodes/titan-04" {
			http.NotFound(w, r)
			return
		}
		patchPath = r.URL.Path
		patchContentType = r.Header.Get("Content-Type")
		if err := json.NewDecoder(r.Body).Decode(&patchBody); err != nil {
			// This handler runs on a server goroutine, where FailNow (and so
			// Fatalf) must not be called. Record the failure and reject the
			// request; the assertions below then fail on the test goroutine.
			t.Errorf("decode patch body: %v", err)
			http.Error(w, "invalid patch body", http.StatusBadRequest)
			return
		}
		// The reply payload is irrelevant to the assertions below, so an
		// encode failure is deliberately ignored.
		_ = json.NewEncoder(w).Encode(map[string]any{"status": "ok"})
	}))
	defer kube.Close()

	// Route the package-level client factory to the fake server for the
	// duration of this test.
	origFactory := kubeClientFactory
	kubeClientFactory = func() (*kubeClient, error) {
		return kubeClientFactoryForURL(kube.URL, kube.Client()), nil
	}
	t.Cleanup(func() { kubeClientFactory = origFactory })

	app := newTestApp(t)
	if err := app.StoreSnapshot(SnapshotRecord{
		Node:        "titan-04",
		CollectedAt: time.Date(2026, 4, 22, 6, 50, 0, 0, time.UTC),
		Snapshot: sentinel.Snapshot{
			Hostname: "titan-04",
			USBScratch: &facts.USBScratch{
				Mountpoint:   "/mnt/astraios",
				UUID:         "usb-1",
				MountHealthy: true,
				UUIDHealthy:  true,
				BindHealthy:  true,
				BindTargets:  []facts.USBBindTarget{{Path: "/var/log/pods", Healthy: true}},
			},
		},
	}); err != nil {
		t.Fatalf("StoreSnapshot: %v", err)
	}

	if patchPath != "/api/v1/nodes/titan-04" || patchContentType != "application/merge-patch+json" {
		t.Fatalf("patch request mismatch: path=%q content-type=%q", patchPath, patchContentType)
	}

	// Checked assertions: a malformed patch should fail the test with a
	// readable message rather than panic.
	metadata, ok := patchBody["metadata"].(map[string]any)
	if !ok {
		t.Fatalf("patch body has no metadata object: %#v", patchBody)
	}
	annotations, ok := metadata["annotations"].(map[string]any)
	if !ok {
		t.Fatalf("patch metadata has no annotations object: %#v", metadata)
	}
	if annotations["maintenance.bstein.dev/usb-scratch-status"] != "ok" || annotations["maintenance.bstein.dev/astraios-selector"] != "UUID=usb-1" {
		t.Fatalf("annotation patch mismatch: %#v", annotations)
	}
}