fix(metis): publish scratch health from sentinels
This commit is contained in:
parent
77d34e4f1d
commit
c972f226da
@ -49,12 +49,8 @@ func Collect() *Snapshot {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func collectUSBScratch() *facts.USBScratch {
|
func collectUSBScratch() *facts.USBScratch {
|
||||||
raw, err := commandOutput("cat", "/etc/metis/node.json")
|
cfg, ok := loadNodeConfig()
|
||||||
if err != nil || len(strings.TrimSpace(string(raw))) == 0 {
|
if !ok || cfg.USBScratch == nil {
|
||||||
return nil
|
|
||||||
}
|
|
||||||
var cfg nodeConfig
|
|
||||||
if err := json.Unmarshal(raw, &cfg); err != nil || cfg.USBScratch == nil {
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
desired := cfg.USBScratch
|
desired := cfg.USBScratch
|
||||||
@ -114,6 +110,75 @@ func collectUSBScratch() *facts.USBScratch {
|
|||||||
return scratch
|
return scratch
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func loadNodeConfig() (nodeConfig, bool) {
|
||||||
|
raw, err := commandOutput("cat", "/etc/metis/node.json")
|
||||||
|
if err == nil && len(strings.TrimSpace(string(raw))) > 0 {
|
||||||
|
var cfg nodeConfig
|
||||||
|
if err := json.Unmarshal(raw, &cfg); err == nil && cfg.USBScratch != nil {
|
||||||
|
return cfg, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
raw, err = commandOutput("cat", "/etc/fstab")
|
||||||
|
if err != nil {
|
||||||
|
return nodeConfig{}, false
|
||||||
|
}
|
||||||
|
if scratch, ok := parseUSBScratchFstab(string(raw)); ok {
|
||||||
|
return nodeConfig{USBScratch: scratch}, true
|
||||||
|
}
|
||||||
|
return nodeConfig{}, false
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseUSBScratchFstab(raw string) (*usbScratchConfig, bool) {
|
||||||
|
cfg := &usbScratchConfig{}
|
||||||
|
inBlock := false
|
||||||
|
for _, line := range strings.Split(raw, "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
switch {
|
||||||
|
case strings.HasPrefix(line, "# BEGIN maintenance.bstein.dev usb-scratch"):
|
||||||
|
inBlock = true
|
||||||
|
continue
|
||||||
|
case strings.HasPrefix(line, "# END maintenance.bstein.dev usb-scratch"):
|
||||||
|
inBlock = false
|
||||||
|
continue
|
||||||
|
case !inBlock || line == "" || strings.HasPrefix(line, "#"):
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
fields := strings.Fields(line)
|
||||||
|
if len(fields) < 4 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
source, target, fsType, options := fields[0], fields[1], fields[2], fields[3]
|
||||||
|
if fsType == "none" && hasFstabOption(options, "bind") {
|
||||||
|
cfg.BindTargets = append(cfg.BindTargets, target)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if fsType == "tmpfs" || strings.EqualFold(source, "tmpfs") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
cfg.Mountpoint = target
|
||||||
|
cfg.FS = fsType
|
||||||
|
if value, ok := strings.CutPrefix(source, "UUID="); ok {
|
||||||
|
cfg.UUID = value
|
||||||
|
}
|
||||||
|
if value, ok := strings.CutPrefix(source, "LABEL="); ok {
|
||||||
|
cfg.Label = value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if cfg.Mountpoint == "" {
|
||||||
|
return nil, false
|
||||||
|
}
|
||||||
|
return cfg, true
|
||||||
|
}
|
||||||
|
|
||||||
|
// hasFstabOption reports whether the comma-separated fstab option list in
// options contains an entry exactly equal to want. Matching is whole-entry and
// case-sensitive, so "rbind" does not satisfy a search for "bind".
func hasFstabOption(options, want string) bool {
	remaining := options
	for {
		current, rest, more := strings.Cut(remaining, ",")
		if current == want {
			return true
		}
		if !more {
			return false
		}
		remaining = rest
	}
}
|
||||||
|
|
||||||
func runAndTrim(cmd string, args ...string) string {
|
func runAndTrim(cmd string, args ...string) string {
|
||||||
out, err := commandOutput(cmd, args...)
|
out, err := commandOutput(cmd, args...)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@ -120,6 +120,61 @@ esac`,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestCollectUSBScratchFallsBackToManagedFstabBlock(t *testing.T) {
|
||||||
|
dir := fakeCollectorCommands(t, map[string]string{
|
||||||
|
"cat": `case "${1:-}" in
|
||||||
|
/etc/metis/node.json) exit 1 ;;
|
||||||
|
/etc/fstab) printf '%s\n' '# BEGIN maintenance.bstein.dev usb-scratch' \
|
||||||
|
'UUID=usb-1 /mnt/astraios ext4 defaults,noatime 0 2' \
|
||||||
|
'tmpfs /tmp tmpfs defaults,nosuid,nodev,mode=1777 0 0' \
|
||||||
|
'/mnt/astraios/var/log/pods /var/log/pods none bind,x-systemd.requires-mounts-for=/mnt/astraios 0 0' \
|
||||||
|
'/mnt/astraios/var/tmp /var/tmp none bind,x-systemd.requires-mounts-for=/mnt/astraios 0 0' \
|
||||||
|
'# END maintenance.bstein.dev usb-scratch'
|
||||||
|
;;
|
||||||
|
*) exit 1 ;;
|
||||||
|
esac`,
|
||||||
|
"findmnt": `target=""
|
||||||
|
for ((i=1; i<=$#; i++)); do
|
||||||
|
if [[ "${!i}" == "-T" ]]; then
|
||||||
|
j=$((i + 1))
|
||||||
|
target="${!j}"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
case "${target}" in
|
||||||
|
/mnt/astraios) printf 'SOURCE="/dev/sda1" TARGET="/mnt/astraios" FSTYPE="ext4"\n' ;;
|
||||||
|
/var/log/pods) printf 'SOURCE="/dev/sda1[/var/log/pods]" TARGET="/var/log/pods" FSTYPE="none"\n' ;;
|
||||||
|
/var/tmp) printf 'SOURCE="/dev/sda1[/var/tmp]" TARGET="/var/tmp" FSTYPE="none"\n' ;;
|
||||||
|
*) exit 1 ;;
|
||||||
|
esac`,
|
||||||
|
"readlink": `case "${2:-}" in
|
||||||
|
/mnt/astraios) printf '/mnt/astraios\n' ;;
|
||||||
|
/var/log/pods) printf '/mnt/astraios/var/log/pods\n' ;;
|
||||||
|
/var/tmp) printf '/mnt/astraios/var/tmp\n' ;;
|
||||||
|
*) exit 1 ;;
|
||||||
|
esac`,
|
||||||
|
"blkid": `case "${1:-}" in
|
||||||
|
-U) printf '/dev/sda1\n' ;;
|
||||||
|
-o) printf 'UUID=usb-1\nTYPE=ext4\n' ;;
|
||||||
|
esac`,
|
||||||
|
})
|
||||||
|
t.Setenv("PATH", dir+string(os.PathListSeparator)+os.Getenv("PATH"))
|
||||||
|
|
||||||
|
scratch := collectUSBScratch()
|
||||||
|
if scratch == nil {
|
||||||
|
t.Fatal("expected USB scratch data from fstab")
|
||||||
|
}
|
||||||
|
if scratch.Mountpoint != "/mnt/astraios" || scratch.UUID != "usb-1" || scratch.FS != "ext4" {
|
||||||
|
t.Fatalf("unexpected scratch identity: %#v", scratch)
|
||||||
|
}
|
||||||
|
if !scratch.MountHealthy || !scratch.UUIDHealthy || !scratch.BindHealthy {
|
||||||
|
t.Fatalf("expected fstab-derived scratch to be healthy: %#v", scratch)
|
||||||
|
}
|
||||||
|
if len(scratch.BindTargets) != 2 || scratch.BindTargets[0].Path != "/var/log/pods" || scratch.BindTargets[1].Path != "/var/tmp" {
|
||||||
|
t.Fatalf("unexpected bind targets: %#v", scratch.BindTargets)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestBindHealthyAcceptsScratchSymlinksAndDeviceSubpaths(t *testing.T) {
|
func TestBindHealthyAcceptsScratchSymlinksAndDeviceSubpaths(t *testing.T) {
|
||||||
dir := fakeCollectorCommands(t, map[string]string{
|
dir := fakeCollectorCommands(t, map[string]string{
|
||||||
"readlink": `case "${2:-}" in
|
"readlink": `case "${2:-}" in
|
||||||
|
|||||||
@ -261,6 +261,9 @@ func (a *App) StoreSnapshot(record SnapshotRecord) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
a.metrics.RecordSnapshot(record.Node, "ok", record.CollectedAt)
|
a.metrics.RecordSnapshot(record.Node, "ok", record.CollectedAt)
|
||||||
|
if err := a.syncScratchAnnotations(record); err != nil {
|
||||||
|
a.appendEvent(annotationSyncEvent(record.Node, err))
|
||||||
|
}
|
||||||
a.appendEvent(Event{
|
a.appendEvent(Event{
|
||||||
Time: record.CollectedAt,
|
Time: record.CollectedAt,
|
||||||
Kind: "sentinel.snapshot",
|
Kind: "sentinel.snapshot",
|
||||||
@ -287,6 +290,7 @@ func (a *App) WatchSentinel() (*Event, error) {
|
|||||||
Containerd: firstLine(snap.Snapshot.Containerd),
|
Containerd: firstLine(snap.Snapshot.Containerd),
|
||||||
PackageSample: snap.Snapshot.PackageSample,
|
PackageSample: snap.Snapshot.PackageSample,
|
||||||
DropInsSample: snap.Snapshot.DropInsSample,
|
DropInsSample: snap.Snapshot.DropInsSample,
|
||||||
|
USBScratch: snap.Snapshot.USBScratch,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
prevTargets := map[string]facts.Targets{}
|
prevTargets := map[string]facts.Targets{}
|
||||||
|
|||||||
@ -105,6 +105,46 @@ func (k *kubeClient) jsonRequest(method, path string, body any, out any) error {
|
|||||||
return json.NewDecoder(io.LimitReader(resp.Body, 1<<20)).Decode(out)
|
return json.NewDecoder(io.LimitReader(resp.Body, 1<<20)).Decode(out)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (k *kubeClient) mergePatch(path string, body any) error {
|
||||||
|
data, err := json.Marshal(body)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
req, err := http.NewRequest(http.MethodPatch, k.baseURL+path, bytes.NewReader(data))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
req.Header.Set("Authorization", "Bearer "+k.token)
|
||||||
|
req.Header.Set("Content-Type", "application/merge-patch+json")
|
||||||
|
resp, err := k.client.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer resp.Body.Close()
|
||||||
|
if resp.StatusCode >= 300 {
|
||||||
|
payload, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
|
||||||
|
return fmt.Errorf("patch %s failed: %s: %s", path, resp.Status, strings.TrimSpace(string(payload)))
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func patchNodeAnnotations(node string, annotations map[string]string) error {
|
||||||
|
node = strings.TrimSpace(node)
|
||||||
|
if node == "" || len(annotations) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
kube, err := kubeClientFactory()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
body := map[string]any{
|
||||||
|
"metadata": map[string]any{
|
||||||
|
"annotations": annotations,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return kube.mergePatch("/api/v1/nodes/"+url.PathEscape(node), body)
|
||||||
|
}
|
||||||
|
|
||||||
func (k *kubeClient) deleteRequest(path string) error {
|
func (k *kubeClient) deleteRequest(path string) error {
|
||||||
req, err := http.NewRequest(http.MethodDelete, k.baseURL+path, nil)
|
req, err := http.NewRequest(http.MethodDelete, k.baseURL+path, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
85
pkg/service/node_annotations.go
Normal file
85
pkg/service/node_annotations.go
Normal file
@ -0,0 +1,85 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"metis/pkg/facts"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (a *App) syncScratchAnnotations(record SnapshotRecord) error {
|
||||||
|
scratch := record.Snapshot.USBScratch
|
||||||
|
if scratch == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
annotations := scratchHealthAnnotations(scratch, record.CollectedAt)
|
||||||
|
if len(annotations) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return patchNodeAnnotations(record.Node, annotations)
|
||||||
|
}
|
||||||
|
|
||||||
|
func scratchHealthAnnotations(scratch *facts.USBScratch, observedAt time.Time) map[string]string {
|
||||||
|
status, detail := scratchStatusDetail(scratch)
|
||||||
|
selector := ""
|
||||||
|
if scratch.UUID != "" {
|
||||||
|
selector = "UUID=" + scratch.UUID
|
||||||
|
} else if scratch.Label != "" {
|
||||||
|
selector = "LABEL=" + scratch.Label
|
||||||
|
}
|
||||||
|
managedPaths := make([]string, 0, len(scratch.BindTargets))
|
||||||
|
for _, target := range scratch.BindTargets {
|
||||||
|
if strings.TrimSpace(target.Path) != "" {
|
||||||
|
managedPaths = append(managedPaths, target.Path)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
annotations := map[string]string{}
|
||||||
|
for _, family := range []string{"usb-scratch", "astraios"} {
|
||||||
|
prefix := "maintenance.bstein.dev/" + family
|
||||||
|
annotations[prefix+"-status"] = status
|
||||||
|
annotations[prefix+"-detail"] = detail
|
||||||
|
annotations[prefix+"-mountpoint"] = scratch.Mountpoint
|
||||||
|
annotations[prefix+"-managed-paths"] = strings.Join(managedPaths, "_")
|
||||||
|
annotations[prefix+"-last-observed"] = observedAt.UTC().Format(time.RFC3339)
|
||||||
|
if selector != "" {
|
||||||
|
annotations[prefix+"-selector"] = selector
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return annotations
|
||||||
|
}
|
||||||
|
|
||||||
|
func scratchStatusDetail(scratch *facts.USBScratch) (string, string) {
|
||||||
|
if scratch == nil {
|
||||||
|
return "missing", "no-scratch-snapshot"
|
||||||
|
}
|
||||||
|
failures := []string{}
|
||||||
|
if !scratch.MountHealthy {
|
||||||
|
failures = append(failures, "mount-unhealthy")
|
||||||
|
}
|
||||||
|
if scratch.UUID != "" && !scratch.UUIDHealthy {
|
||||||
|
failures = append(failures, "uuid-mismatch")
|
||||||
|
}
|
||||||
|
if scratch.Label != "" && !scratch.LabelHealthy {
|
||||||
|
failures = append(failures, "label-mismatch")
|
||||||
|
}
|
||||||
|
if !scratch.BindHealthy {
|
||||||
|
failures = append(failures, "bind-mount-incomplete")
|
||||||
|
}
|
||||||
|
if len(failures) == 0 {
|
||||||
|
return "ok", "healthy"
|
||||||
|
}
|
||||||
|
return "error", strings.Join(failures, ",")
|
||||||
|
}
|
||||||
|
|
||||||
|
func annotationSyncEvent(node string, err error) Event {
|
||||||
|
return Event{
|
||||||
|
Time: time.Now().UTC(),
|
||||||
|
Kind: "sentinel.annotation",
|
||||||
|
Summary: fmt.Sprintf("Could not sync scratch annotations for %s", node),
|
||||||
|
Details: map[string]any{
|
||||||
|
"node": node,
|
||||||
|
"error": err.Error(),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
92
pkg/service/node_annotations_test.go
Normal file
92
pkg/service/node_annotations_test.go
Normal file
@ -0,0 +1,92 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"net/http"
|
||||||
|
"net/http/httptest"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"metis/pkg/facts"
|
||||||
|
"metis/pkg/sentinel"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestScratchHealthAnnotations(t *testing.T) {
|
||||||
|
observed := time.Date(2026, 4, 22, 6, 45, 0, 0, time.UTC)
|
||||||
|
annotations := scratchHealthAnnotations(&facts.USBScratch{
|
||||||
|
Mountpoint: "/mnt/astraios",
|
||||||
|
UUID: "usb-1",
|
||||||
|
FS: "ext4",
|
||||||
|
MountHealthy: true,
|
||||||
|
UUIDHealthy: true,
|
||||||
|
BindHealthy: true,
|
||||||
|
BindTargets: []facts.USBBindTarget{{Path: "/var/log/pods", Healthy: true}, {Path: "/var/tmp", Healthy: true}},
|
||||||
|
}, observed)
|
||||||
|
if annotations["maintenance.bstein.dev/usb-scratch-status"] != "ok" || annotations["maintenance.bstein.dev/astraios-detail"] != "healthy" {
|
||||||
|
t.Fatalf("unexpected healthy annotations: %#v", annotations)
|
||||||
|
}
|
||||||
|
if annotations["maintenance.bstein.dev/usb-scratch-selector"] != "UUID=usb-1" {
|
||||||
|
t.Fatalf("selector annotation missing: %#v", annotations)
|
||||||
|
}
|
||||||
|
if annotations["maintenance.bstein.dev/astraios-managed-paths"] != "/var/log/pods_/var/tmp" {
|
||||||
|
t.Fatalf("managed paths annotation mismatch: %#v", annotations)
|
||||||
|
}
|
||||||
|
|
||||||
|
status, detail := scratchStatusDetail(&facts.USBScratch{MountHealthy: false, BindHealthy: false})
|
||||||
|
if status != "error" || !strings.Contains(detail, "mount-unhealthy") || !strings.Contains(detail, "bind-mount-incomplete") {
|
||||||
|
t.Fatalf("unexpected unhealthy detail: %s %s", status, detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestStoreSnapshotPatchesNodeAnnotations(t *testing.T) {
|
||||||
|
var patchPath string
|
||||||
|
var patchContentType string
|
||||||
|
var patchBody map[string]any
|
||||||
|
kube := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if r.Method != http.MethodPatch || r.URL.Path != "/api/v1/nodes/titan-04" {
|
||||||
|
http.NotFound(w, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
patchPath = r.URL.Path
|
||||||
|
patchContentType = r.Header.Get("Content-Type")
|
||||||
|
if err := json.NewDecoder(r.Body).Decode(&patchBody); err != nil {
|
||||||
|
t.Fatalf("decode patch body: %v", err)
|
||||||
|
}
|
||||||
|
_ = json.NewEncoder(w).Encode(map[string]any{"status": "ok"})
|
||||||
|
}))
|
||||||
|
defer kube.Close()
|
||||||
|
|
||||||
|
origFactory := kubeClientFactory
|
||||||
|
kubeClientFactory = func() (*kubeClient, error) {
|
||||||
|
return kubeClientFactoryForURL(kube.URL, kube.Client()), nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { kubeClientFactory = origFactory })
|
||||||
|
|
||||||
|
app := newTestApp(t)
|
||||||
|
if err := app.StoreSnapshot(SnapshotRecord{
|
||||||
|
Node: "titan-04",
|
||||||
|
CollectedAt: time.Date(2026, 4, 22, 6, 50, 0, 0, time.UTC),
|
||||||
|
Snapshot: sentinel.Snapshot{
|
||||||
|
Hostname: "titan-04",
|
||||||
|
USBScratch: &facts.USBScratch{
|
||||||
|
Mountpoint: "/mnt/astraios",
|
||||||
|
UUID: "usb-1",
|
||||||
|
MountHealthy: true,
|
||||||
|
UUIDHealthy: true,
|
||||||
|
BindHealthy: true,
|
||||||
|
BindTargets: []facts.USBBindTarget{{Path: "/var/log/pods", Healthy: true}},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}); err != nil {
|
||||||
|
t.Fatalf("StoreSnapshot: %v", err)
|
||||||
|
}
|
||||||
|
if patchPath != "/api/v1/nodes/titan-04" || patchContentType != "application/merge-patch+json" {
|
||||||
|
t.Fatalf("patch request mismatch: path=%q content-type=%q", patchPath, patchContentType)
|
||||||
|
}
|
||||||
|
metadata := patchBody["metadata"].(map[string]any)
|
||||||
|
annotations := metadata["annotations"].(map[string]any)
|
||||||
|
if annotations["maintenance.bstein.dev/usb-scratch-status"] != "ok" || annotations["maintenance.bstein.dev/astraios-selector"] != "UUID=usb-1" {
|
||||||
|
t.Fatalf("annotation patch mismatch: %#v", annotations)
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user