fix(metis): publish scratch health from sentinels
This commit is contained in:
parent
77d34e4f1d
commit
c972f226da
@ -49,12 +49,8 @@ func Collect() *Snapshot {
|
||||
}
|
||||
|
||||
func collectUSBScratch() *facts.USBScratch {
|
||||
raw, err := commandOutput("cat", "/etc/metis/node.json")
|
||||
if err != nil || len(strings.TrimSpace(string(raw))) == 0 {
|
||||
return nil
|
||||
}
|
||||
var cfg nodeConfig
|
||||
if err := json.Unmarshal(raw, &cfg); err != nil || cfg.USBScratch == nil {
|
||||
cfg, ok := loadNodeConfig()
|
||||
if !ok || cfg.USBScratch == nil {
|
||||
return nil
|
||||
}
|
||||
desired := cfg.USBScratch
|
||||
@ -114,6 +110,75 @@ func collectUSBScratch() *facts.USBScratch {
|
||||
return scratch
|
||||
}
|
||||
|
||||
func loadNodeConfig() (nodeConfig, bool) {
|
||||
raw, err := commandOutput("cat", "/etc/metis/node.json")
|
||||
if err == nil && len(strings.TrimSpace(string(raw))) > 0 {
|
||||
var cfg nodeConfig
|
||||
if err := json.Unmarshal(raw, &cfg); err == nil && cfg.USBScratch != nil {
|
||||
return cfg, true
|
||||
}
|
||||
}
|
||||
raw, err = commandOutput("cat", "/etc/fstab")
|
||||
if err != nil {
|
||||
return nodeConfig{}, false
|
||||
}
|
||||
if scratch, ok := parseUSBScratchFstab(string(raw)); ok {
|
||||
return nodeConfig{USBScratch: scratch}, true
|
||||
}
|
||||
return nodeConfig{}, false
|
||||
}
|
||||
|
||||
func parseUSBScratchFstab(raw string) (*usbScratchConfig, bool) {
|
||||
cfg := &usbScratchConfig{}
|
||||
inBlock := false
|
||||
for _, line := range strings.Split(raw, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
switch {
|
||||
case strings.HasPrefix(line, "# BEGIN maintenance.bstein.dev usb-scratch"):
|
||||
inBlock = true
|
||||
continue
|
||||
case strings.HasPrefix(line, "# END maintenance.bstein.dev usb-scratch"):
|
||||
inBlock = false
|
||||
continue
|
||||
case !inBlock || line == "" || strings.HasPrefix(line, "#"):
|
||||
continue
|
||||
}
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) < 4 {
|
||||
continue
|
||||
}
|
||||
source, target, fsType, options := fields[0], fields[1], fields[2], fields[3]
|
||||
if fsType == "none" && hasFstabOption(options, "bind") {
|
||||
cfg.BindTargets = append(cfg.BindTargets, target)
|
||||
continue
|
||||
}
|
||||
if fsType == "tmpfs" || strings.EqualFold(source, "tmpfs") {
|
||||
continue
|
||||
}
|
||||
cfg.Mountpoint = target
|
||||
cfg.FS = fsType
|
||||
if value, ok := strings.CutPrefix(source, "UUID="); ok {
|
||||
cfg.UUID = value
|
||||
}
|
||||
if value, ok := strings.CutPrefix(source, "LABEL="); ok {
|
||||
cfg.Label = value
|
||||
}
|
||||
}
|
||||
if cfg.Mountpoint == "" {
|
||||
return nil, false
|
||||
}
|
||||
return cfg, true
|
||||
}
|
||||
|
||||
// hasFstabOption reports whether want appears as one complete entry of the
// comma-separated fstab options string. Matching is exact, so "rbind" does
// not satisfy a lookup for "bind".
func hasFstabOption(options, want string) bool {
	rest := options
	for {
		option, tail, more := strings.Cut(rest, ",")
		if option == want {
			return true
		}
		if !more {
			return false
		}
		rest = tail
	}
}
|
||||
|
||||
func runAndTrim(cmd string, args ...string) string {
|
||||
out, err := commandOutput(cmd, args...)
|
||||
if err != nil {
|
||||
|
||||
@ -120,6 +120,61 @@ esac`,
|
||||
}
|
||||
}
|
||||
|
||||
// TestCollectUSBScratchFallsBackToManagedFstabBlock verifies that
// collectUSBScratch still reports the scratch disk, and reports it healthy,
// when /etc/metis/node.json cannot be read and the only description of the
// disk is the managed usb-scratch block in /etc/fstab.
func TestCollectUSBScratchFallsBackToManagedFstabBlock(t *testing.T) {
	// Each entry maps a command name to the shell snippet that stands in for
	// it; the returned directory is put at the front of PATH below.
	// NOTE(review): presumably fakeCollectorCommands writes one executable
	// script per entry into dir — confirm against the helper.
	dir := fakeCollectorCommands(t, map[string]string{
		// cat: node.json is unreadable; /etc/fstab yields a managed block with
		// the scratch device, an unrelated tmpfs, and two bind mounts.
		"cat": `case "${1:-}" in
/etc/metis/node.json) exit 1 ;;
/etc/fstab) printf '%s\n' '# BEGIN maintenance.bstein.dev usb-scratch' \
'UUID=usb-1 /mnt/astraios ext4 defaults,noatime 0 2' \
'tmpfs /tmp tmpfs defaults,nosuid,nodev,mode=1777 0 0' \
'/mnt/astraios/var/log/pods /var/log/pods none bind,x-systemd.requires-mounts-for=/mnt/astraios 0 0' \
'/mnt/astraios/var/tmp /var/tmp none bind,x-systemd.requires-mounts-for=/mnt/astraios 0 0' \
'# END maintenance.bstein.dev usb-scratch'
;;
*) exit 1 ;;
esac`,
		// findmnt: picks the argument following -T and answers for the scratch
		// mount point and both bind targets; any other target fails.
		"findmnt": `target=""
for ((i=1; i<=$#; i++)); do
if [[ "${!i}" == "-T" ]]; then
j=$((i + 1))
target="${!j}"
break
fi
done
case "${target}" in
/mnt/astraios) printf 'SOURCE="/dev/sda1" TARGET="/mnt/astraios" FSTYPE="ext4"\n' ;;
/var/log/pods) printf 'SOURCE="/dev/sda1[/var/log/pods]" TARGET="/var/log/pods" FSTYPE="none"\n' ;;
/var/tmp) printf 'SOURCE="/dev/sda1[/var/tmp]" TARGET="/var/tmp" FSTYPE="none"\n' ;;
*) exit 1 ;;
esac`,
		// readlink: keyed on the second argument; the bind targets resolve to
		// paths underneath the scratch mount point.
		"readlink": `case "${2:-}" in
/mnt/astraios) printf '/mnt/astraios\n' ;;
/var/log/pods) printf '/mnt/astraios/var/log/pods\n' ;;
/var/tmp) printf '/mnt/astraios/var/tmp\n' ;;
*) exit 1 ;;
esac`,
		// blkid: -U answers with the device node, -o with its UUID and type,
		// both agreeing with the fstab entry above.
		"blkid": `case "${1:-}" in
-U) printf '/dev/sda1\n' ;;
-o) printf 'UUID=usb-1\nTYPE=ext4\n' ;;
esac`,
	})
	// Prepend the fake command directory so it shadows the real tools.
	t.Setenv("PATH", dir+string(os.PathListSeparator)+os.Getenv("PATH"))

	scratch := collectUSBScratch()
	if scratch == nil {
		t.Fatal("expected USB scratch data from fstab")
	}
	// Identity must come from the device line of the managed block.
	if scratch.Mountpoint != "/mnt/astraios" || scratch.UUID != "usb-1" || scratch.FS != "ext4" {
		t.Fatalf("unexpected scratch identity: %#v", scratch)
	}
	if !scratch.MountHealthy || !scratch.UUIDHealthy || !scratch.BindHealthy {
		t.Fatalf("expected fstab-derived scratch to be healthy: %#v", scratch)
	}
	// The tmpfs line must not be reported; only the two bind entries, in
	// fstab order.
	if len(scratch.BindTargets) != 2 || scratch.BindTargets[0].Path != "/var/log/pods" || scratch.BindTargets[1].Path != "/var/tmp" {
		t.Fatalf("unexpected bind targets: %#v", scratch.BindTargets)
	}
}
|
||||
|
||||
func TestBindHealthyAcceptsScratchSymlinksAndDeviceSubpaths(t *testing.T) {
|
||||
dir := fakeCollectorCommands(t, map[string]string{
|
||||
"readlink": `case "${2:-}" in
|
||||
|
||||
@ -261,6 +261,9 @@ func (a *App) StoreSnapshot(record SnapshotRecord) error {
|
||||
return err
|
||||
}
|
||||
a.metrics.RecordSnapshot(record.Node, "ok", record.CollectedAt)
|
||||
if err := a.syncScratchAnnotations(record); err != nil {
|
||||
a.appendEvent(annotationSyncEvent(record.Node, err))
|
||||
}
|
||||
a.appendEvent(Event{
|
||||
Time: record.CollectedAt,
|
||||
Kind: "sentinel.snapshot",
|
||||
@ -287,6 +290,7 @@ func (a *App) WatchSentinel() (*Event, error) {
|
||||
Containerd: firstLine(snap.Snapshot.Containerd),
|
||||
PackageSample: snap.Snapshot.PackageSample,
|
||||
DropInsSample: snap.Snapshot.DropInsSample,
|
||||
USBScratch: snap.Snapshot.USBScratch,
|
||||
})
|
||||
}
|
||||
prevTargets := map[string]facts.Targets{}
|
||||
|
||||
@ -105,6 +105,46 @@ func (k *kubeClient) jsonRequest(method, path string, body any, out any) error {
|
||||
return json.NewDecoder(io.LimitReader(resp.Body, 1<<20)).Decode(out)
|
||||
}
|
||||
|
||||
func (k *kubeClient) mergePatch(path string, body any) error {
|
||||
data, err := json.Marshal(body)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
req, err := http.NewRequest(http.MethodPatch, k.baseURL+path, bytes.NewReader(data))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
req.Header.Set("Authorization", "Bearer "+k.token)
|
||||
req.Header.Set("Content-Type", "application/merge-patch+json")
|
||||
resp, err := k.client.Do(req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode >= 300 {
|
||||
payload, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
|
||||
return fmt.Errorf("patch %s failed: %s: %s", path, resp.Status, strings.TrimSpace(string(payload)))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func patchNodeAnnotations(node string, annotations map[string]string) error {
|
||||
node = strings.TrimSpace(node)
|
||||
if node == "" || len(annotations) == 0 {
|
||||
return nil
|
||||
}
|
||||
kube, err := kubeClientFactory()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
body := map[string]any{
|
||||
"metadata": map[string]any{
|
||||
"annotations": annotations,
|
||||
},
|
||||
}
|
||||
return kube.mergePatch("/api/v1/nodes/"+url.PathEscape(node), body)
|
||||
}
|
||||
|
||||
func (k *kubeClient) deleteRequest(path string) error {
|
||||
req, err := http.NewRequest(http.MethodDelete, k.baseURL+path, nil)
|
||||
if err != nil {
|
||||
|
||||
85
pkg/service/node_annotations.go
Normal file
85
pkg/service/node_annotations.go
Normal file
@ -0,0 +1,85 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"metis/pkg/facts"
|
||||
)
|
||||
|
||||
func (a *App) syncScratchAnnotations(record SnapshotRecord) error {
|
||||
scratch := record.Snapshot.USBScratch
|
||||
if scratch == nil {
|
||||
return nil
|
||||
}
|
||||
annotations := scratchHealthAnnotations(scratch, record.CollectedAt)
|
||||
if len(annotations) == 0 {
|
||||
return nil
|
||||
}
|
||||
return patchNodeAnnotations(record.Node, annotations)
|
||||
}
|
||||
|
||||
func scratchHealthAnnotations(scratch *facts.USBScratch, observedAt time.Time) map[string]string {
|
||||
status, detail := scratchStatusDetail(scratch)
|
||||
selector := ""
|
||||
if scratch.UUID != "" {
|
||||
selector = "UUID=" + scratch.UUID
|
||||
} else if scratch.Label != "" {
|
||||
selector = "LABEL=" + scratch.Label
|
||||
}
|
||||
managedPaths := make([]string, 0, len(scratch.BindTargets))
|
||||
for _, target := range scratch.BindTargets {
|
||||
if strings.TrimSpace(target.Path) != "" {
|
||||
managedPaths = append(managedPaths, target.Path)
|
||||
}
|
||||
}
|
||||
annotations := map[string]string{}
|
||||
for _, family := range []string{"usb-scratch", "astraios"} {
|
||||
prefix := "maintenance.bstein.dev/" + family
|
||||
annotations[prefix+"-status"] = status
|
||||
annotations[prefix+"-detail"] = detail
|
||||
annotations[prefix+"-mountpoint"] = scratch.Mountpoint
|
||||
annotations[prefix+"-managed-paths"] = strings.Join(managedPaths, "_")
|
||||
annotations[prefix+"-last-observed"] = observedAt.UTC().Format(time.RFC3339)
|
||||
if selector != "" {
|
||||
annotations[prefix+"-selector"] = selector
|
||||
}
|
||||
}
|
||||
return annotations
|
||||
}
|
||||
|
||||
func scratchStatusDetail(scratch *facts.USBScratch) (string, string) {
|
||||
if scratch == nil {
|
||||
return "missing", "no-scratch-snapshot"
|
||||
}
|
||||
failures := []string{}
|
||||
if !scratch.MountHealthy {
|
||||
failures = append(failures, "mount-unhealthy")
|
||||
}
|
||||
if scratch.UUID != "" && !scratch.UUIDHealthy {
|
||||
failures = append(failures, "uuid-mismatch")
|
||||
}
|
||||
if scratch.Label != "" && !scratch.LabelHealthy {
|
||||
failures = append(failures, "label-mismatch")
|
||||
}
|
||||
if !scratch.BindHealthy {
|
||||
failures = append(failures, "bind-mount-incomplete")
|
||||
}
|
||||
if len(failures) == 0 {
|
||||
return "ok", "healthy"
|
||||
}
|
||||
return "error", strings.Join(failures, ",")
|
||||
}
|
||||
|
||||
func annotationSyncEvent(node string, err error) Event {
|
||||
return Event{
|
||||
Time: time.Now().UTC(),
|
||||
Kind: "sentinel.annotation",
|
||||
Summary: fmt.Sprintf("Could not sync scratch annotations for %s", node),
|
||||
Details: map[string]any{
|
||||
"node": node,
|
||||
"error": err.Error(),
|
||||
},
|
||||
}
|
||||
}
|
||||
92
pkg/service/node_annotations_test.go
Normal file
92
pkg/service/node_annotations_test.go
Normal file
@ -0,0 +1,92 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"metis/pkg/facts"
|
||||
"metis/pkg/sentinel"
|
||||
)
|
||||
|
||||
func TestScratchHealthAnnotations(t *testing.T) {
|
||||
observed := time.Date(2026, 4, 22, 6, 45, 0, 0, time.UTC)
|
||||
annotations := scratchHealthAnnotations(&facts.USBScratch{
|
||||
Mountpoint: "/mnt/astraios",
|
||||
UUID: "usb-1",
|
||||
FS: "ext4",
|
||||
MountHealthy: true,
|
||||
UUIDHealthy: true,
|
||||
BindHealthy: true,
|
||||
BindTargets: []facts.USBBindTarget{{Path: "/var/log/pods", Healthy: true}, {Path: "/var/tmp", Healthy: true}},
|
||||
}, observed)
|
||||
if annotations["maintenance.bstein.dev/usb-scratch-status"] != "ok" || annotations["maintenance.bstein.dev/astraios-detail"] != "healthy" {
|
||||
t.Fatalf("unexpected healthy annotations: %#v", annotations)
|
||||
}
|
||||
if annotations["maintenance.bstein.dev/usb-scratch-selector"] != "UUID=usb-1" {
|
||||
t.Fatalf("selector annotation missing: %#v", annotations)
|
||||
}
|
||||
if annotations["maintenance.bstein.dev/astraios-managed-paths"] != "/var/log/pods_/var/tmp" {
|
||||
t.Fatalf("managed paths annotation mismatch: %#v", annotations)
|
||||
}
|
||||
|
||||
status, detail := scratchStatusDetail(&facts.USBScratch{MountHealthy: false, BindHealthy: false})
|
||||
if status != "error" || !strings.Contains(detail, "mount-unhealthy") || !strings.Contains(detail, "bind-mount-incomplete") {
|
||||
t.Fatalf("unexpected unhealthy detail: %s %s", status, detail)
|
||||
}
|
||||
}
|
||||
|
||||
func TestStoreSnapshotPatchesNodeAnnotations(t *testing.T) {
|
||||
var patchPath string
|
||||
var patchContentType string
|
||||
var patchBody map[string]any
|
||||
kube := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodPatch || r.URL.Path != "/api/v1/nodes/titan-04" {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
patchPath = r.URL.Path
|
||||
patchContentType = r.Header.Get("Content-Type")
|
||||
if err := json.NewDecoder(r.Body).Decode(&patchBody); err != nil {
|
||||
t.Fatalf("decode patch body: %v", err)
|
||||
}
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{"status": "ok"})
|
||||
}))
|
||||
defer kube.Close()
|
||||
|
||||
origFactory := kubeClientFactory
|
||||
kubeClientFactory = func() (*kubeClient, error) {
|
||||
return kubeClientFactoryForURL(kube.URL, kube.Client()), nil
|
||||
}
|
||||
t.Cleanup(func() { kubeClientFactory = origFactory })
|
||||
|
||||
app := newTestApp(t)
|
||||
if err := app.StoreSnapshot(SnapshotRecord{
|
||||
Node: "titan-04",
|
||||
CollectedAt: time.Date(2026, 4, 22, 6, 50, 0, 0, time.UTC),
|
||||
Snapshot: sentinel.Snapshot{
|
||||
Hostname: "titan-04",
|
||||
USBScratch: &facts.USBScratch{
|
||||
Mountpoint: "/mnt/astraios",
|
||||
UUID: "usb-1",
|
||||
MountHealthy: true,
|
||||
UUIDHealthy: true,
|
||||
BindHealthy: true,
|
||||
BindTargets: []facts.USBBindTarget{{Path: "/var/log/pods", Healthy: true}},
|
||||
},
|
||||
},
|
||||
}); err != nil {
|
||||
t.Fatalf("StoreSnapshot: %v", err)
|
||||
}
|
||||
if patchPath != "/api/v1/nodes/titan-04" || patchContentType != "application/merge-patch+json" {
|
||||
t.Fatalf("patch request mismatch: path=%q content-type=%q", patchPath, patchContentType)
|
||||
}
|
||||
metadata := patchBody["metadata"].(map[string]any)
|
||||
annotations := metadata["annotations"].(map[string]any)
|
||||
if annotations["maintenance.bstein.dev/usb-scratch-status"] != "ok" || annotations["maintenance.bstein.dev/astraios-selector"] != "UUID=usb-1" {
|
||||
t.Fatalf("annotation patch mismatch: %#v", annotations)
|
||||
}
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user