From d105e43e49000c45911fa297c218f00d14ddf7c3 Mon Sep 17 00:00:00 2001
From: codex
Date: Tue, 5 May 2026 13:24:25 -0300
Subject: [PATCH] recovery(ananke): auto-heal sealed vault and dead-node drift
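
The startup flow already unseals Vault and clears stuck pods once, but
drift that appears after startup (a later Vault reseal, pods stuck
terminating on nodes that never came back) had to wait for the next
bootstrap run. Give the daemon a bounded post-start auto-heal pass
instead:

- re-assert required node labels;
- if vault-0 is Running but sealed, unseal it, rerun the
  vault-k8s-auth-config job, and request a Flux reconcile;
- force-delete pods that have been terminating on not-Ready nodes for
  longer than a grace window;
- aggregate sub-repair errors so one failure does not hide the others.

The daemon triggers the pass from its UPS poll loop, rate-limited by
post_start_auto_heal_seconds and skipped while any UPS target is on
battery. Dry-run skips every mutating path. Both knobs validate as > 0
and default as in configs/ananke.example.yaml:

    startup:
      post_start_auto_heal_seconds: 60
      dead_node_cleanup_grace_seconds: 300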
---
 configs/ananke.example.yaml                  |   2 +
 configs/ananke.tethys.yaml                   |   2 +
 configs/ananke.titan-db.yaml                 |   2 +
 internal/cluster/orchestrator_autorepair.go  | 288 +++++++++++++
 .../orchestrator_autorepair_cleanup_test.go  | 296 ++++++++++++++
 .../cluster/orchestrator_autorepair_test.go  | 382 ++++++++++++++++++
 internal/config/apply_defaults.go            |   6 +
 internal/config/types.go                     |   2 +
 internal/config/validate.go                  |   6 +
 internal/config/validate_matrix_test.go      |   5 +
 internal/service/daemon.go                   |  51 +++
 .../daemon_poststart_autorepair_test.go      |  51 +++
 testing/hygiene/in_tree_test_allowlist.txt   |   3 +
 13 files changed, 1096 insertions(+)
 create mode 100644 internal/cluster/orchestrator_autorepair.go
 create mode 100644 internal/cluster/orchestrator_autorepair_cleanup_test.go
 create mode 100644 internal/cluster/orchestrator_autorepair_test.go
 create mode 100644 internal/service/daemon_poststart_autorepair_test.go

diff --git a/configs/ananke.example.yaml b/configs/ananke.example.yaml
index 5ea1867..77b525e 100644
--- a/configs/ananke.example.yaml
+++ b/configs/ananke.example.yaml
@@ -154,6 +154,8 @@ startup:
   scheduling_storm_event_threshold: 30
   scheduling_storm_window_seconds: 180
   stuck_pod_grace_seconds: 180
+  post_start_auto_heal_seconds: 60
+  dead_node_cleanup_grace_seconds: 300
   vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
   vault_unseal_breakglass_command: ""
   vault_unseal_breakglass_timeout_seconds: 15
diff --git a/configs/ananke.tethys.yaml b/configs/ananke.tethys.yaml
index 2588d51..f322bb2 100644
--- a/configs/ananke.tethys.yaml
+++ b/configs/ananke.tethys.yaml
@@ -286,6 +286,8 @@ startup:
   scheduling_storm_event_threshold: 30
   scheduling_storm_window_seconds: 180
   stuck_pod_grace_seconds: 180
+  post_start_auto_heal_seconds: 60
+  dead_node_cleanup_grace_seconds: 300
   vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
   vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/tethys/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
   vault_unseal_breakglass_timeout_seconds: 15
diff --git a/configs/ananke.titan-db.yaml b/configs/ananke.titan-db.yaml
index 0cab587..1cc86f8 100644
--- a/configs/ananke.titan-db.yaml
+++ b/configs/ananke.titan-db.yaml
@@ -286,6 +286,8 @@ startup:
   scheduling_storm_event_threshold: 30
   scheduling_storm_window_seconds: 180
   stuck_pod_grace_seconds: 180
+  post_start_auto_heal_seconds: 60
+  dead_node_cleanup_grace_seconds: 300
   vault_unseal_key_file: /var/lib/ananke/vault-unseal.key
   vault_unseal_breakglass_command: "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -i /home/atlas/.ssh/id_ed25519 -p 1122 brad@99.183.132.163 'cat ~/.ananke-breakglass/vault-unseal.key'"
   vault_unseal_breakglass_timeout_seconds: 15
diff --git a/internal/cluster/orchestrator_autorepair.go b/internal/cluster/orchestrator_autorepair.go
new file mode 100644
index 0000000..17ef6fa
--- /dev/null
+++ b/internal/cluster/orchestrator_autorepair.go
@@ -0,0 +1,288 @@
+package cluster
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"strings"
+	"time"
+)
+
+type nodeReadyList struct {
+	Items []struct {
+		Metadata struct {
+			Name string `json:"name"`
+		} `json:"metadata"`
+		Status struct {
+			Conditions []struct {
+				Type   string `json:"type"`
+				Status string `json:"status"`
+			} `json:"conditions"`
+		} `json:"status"`
+	} `json:"items"`
+}
+
+type podDeleteList struct {
+	Items []struct {
+		Metadata struct {
+			Namespace         string     `json:"namespace"`
+			Name              string     `json:"name"`
+			DeletionTimestamp *time.Time `json:"deletionTimestamp"`
+		} `json:"metadata"`
+		Spec struct {
+			NodeName string `json:"nodeName"`
+		} `json:"spec"`
+	} `json:"items"`
+}
+
+// RunPostStartAutoHeal runs one orchestration or CLI step.
+// Signature: (o *Orchestrator) RunPostStartAutoHeal(ctx context.Context) error.
+// Why: gives the long-running daemon a narrow, testable repair entrypoint for
+// post-start drift without rerunning the full startup flow.
+func (o *Orchestrator) RunPostStartAutoHeal(ctx context.Context) error {
+	return o.postStartAutoHeal(ctx)
+}
+
+// postStartAutoHeal runs one orchestration or CLI step.
+// Signature: (o *Orchestrator) postStartAutoHeal(ctx context.Context) error.
+// Why: centralizes bounded post-start repair actions so recurring outage
+// patterns only trigger the specific remediation they need.
+func (o *Orchestrator) postStartAutoHeal(ctx context.Context) error {
+	if o.runner.DryRun {
+		return nil
+	}
+
+	errs := []string{}
+	requestReconcile := false
+
+	if err := o.ensureRequiredNodeLabels(ctx); err != nil {
+		errs = append(errs, fmt.Sprintf("required node labels: %v", err))
+	}
+
+	vaultRecovered, err := o.autoRecoverSealedVault(ctx)
+	if err != nil {
+		errs = append(errs, fmt.Sprintf("vault auto-recovery: %v", err))
+	} else if vaultRecovered {
+		requestReconcile = true
+		if err := o.rerunVaultK8sAuthConfigJob(ctx); err != nil {
+			errs = append(errs, fmt.Sprintf("vault k8s auth config rerun: %v", err))
+		}
+	}
+
+	cleaned, err := o.cleanupTerminatingPodsOnUnavailableNodes(ctx)
+	if err != nil {
+		errs = append(errs, fmt.Sprintf("dead-node terminating pod cleanup: %v", err))
+	} else if cleaned > 0 {
+		requestReconcile = true
+	}
+
+	if requestReconcile {
+		o.bestEffort("request flux reconcile after post-start auto-heal", func() error {
+			return o.requestFluxReconcile(ctx)
+		})
+	}
+
+	if len(errs) > 0 {
+		return errors.New(strings.Join(errs, "; "))
+	}
+	return nil
+}
+
+// autoRecoverSealedVault runs one orchestration or CLI step.
+// Signature: (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error).
+// Why: lets the daemon repair a later Vault reseal without waiting for a new
+// bootstrap run.
+func (o *Orchestrator) autoRecoverSealedVault(ctx context.Context) (bool, error) {
+	if o.runner.DryRun {
+		return false, nil
+	}
+
+	phase, err := o.kubectl(ctx, 15*time.Second, "-n", "vault", "get", "pod", "vault-0", "-o", "jsonpath={.status.phase}")
+	if err != nil {
+		if isNotFoundErr(err) {
+			return false, nil
+		}
+		return false, fmt.Errorf("vault pod phase check failed: %w", err)
+	}
+	if strings.TrimSpace(phase) != "Running" {
+		return false, nil
+	}
+
+	sealed, err := o.vaultSealed(ctx)
+	if err != nil {
+		return false, err
+	}
+	if !sealed {
+		return false, nil
+	}
+
+	o.log.Printf("warning: detected sealed Vault after startup; attempting post-start auto-recovery")
+	if err := o.ensureVaultUnsealed(ctx); err != nil {
+		return false, err
+	}
+	return true, nil
+}
+
+// rerunVaultK8sAuthConfigJob runs one orchestration or CLI step.
+// Signature: (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error.
+// Why: post-unseal Vault recovery needs the auth-config job retriggered so
+// downstream secret consumers stop carrying stale failures from the sealed window.
+func (o *Orchestrator) rerunVaultK8sAuthConfigJob(ctx context.Context) error {
+	if o.runner.DryRun {
+		return nil
+	}
+	jobName := fmt.Sprintf("vault-k8s-auth-config-autoheal-%d", time.Now().Unix())
+	if _, err := o.kubectl(
+		ctx,
+		25*time.Second,
+		"-n", "vault",
+		"create", "job",
+		"--from=cronjob/vault-k8s-auth-config",
+		jobName,
+	); err != nil {
+		return fmt.Errorf("create job %s: %w", jobName, err)
+	}
+	o.log.Printf("triggered vault k8s auth config job %s after vault recovery", jobName)
+	return nil
+}
+
+// cleanupTerminatingPodsOnUnavailableNodes runs one orchestration or CLI step.
+// Signature: (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error).
+// Why: dead nodes can strand terminating pods indefinitely, so the daemon should
+// clear only that narrow failure class instead of leaving garbage behind forever.
+func (o *Orchestrator) cleanupTerminatingPodsOnUnavailableNodes(ctx context.Context) (int, error) {
+	if o.runner.DryRun {
+		return 0, nil
+	}
+
+	unavailable, err := o.unavailableNodeSet(ctx)
+	if err != nil {
+		return 0, err
+	}
+	if len(unavailable) == 0 {
+		return 0, nil
+	}
+
+	out, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
+	if err != nil {
+		return 0, fmt.Errorf("query pods: %w", err)
+	}
+	var pods podDeleteList
+	if err := json.Unmarshal([]byte(out), &pods); err != nil {
+		return 0, fmt.Errorf("decode pods: %w", err)
+	}
+
+	grace := time.Duration(o.cfg.Startup.DeadNodeCleanupGraceSeconds) * time.Second
+	now := time.Now()
+	count := 0
+	for _, item := range pods.Items {
+		if item.Metadata.DeletionTimestamp == nil || item.Spec.NodeName == "" {
+			continue
+		}
+		if _, badNode := unavailable[item.Spec.NodeName]; !badNode {
+			continue
+		}
+		if now.Sub(*item.Metadata.DeletionTimestamp) < grace {
+			continue
+		}
+		o.log.Printf("warning: force deleting terminating pod %s/%s on unavailable node %s", item.Metadata.Namespace, item.Metadata.Name, item.Spec.NodeName)
+		if _, err := o.kubectl(
+			ctx,
+			20*time.Second,
+			"-n", item.Metadata.Namespace,
+			"delete", "pod", item.Metadata.Name,
+			"--grace-period=0",
+			"--force",
+			"--wait=false",
+		); err != nil && !isNotFoundErr(err) {
+			return count, fmt.Errorf("delete pod %s/%s: %w", item.Metadata.Namespace, item.Metadata.Name, err)
+		}
+		count++
+	}
+	if count > 0 {
+		o.log.Printf("post-start auto-heal cleaned %d terminating pod(s) from unavailable nodes", count)
+	}
+	return count, nil
+}
+
+// unavailableNodeSet runs one orchestration or CLI step.
+// Signature: (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error).
+// Why: isolates Ready-condition parsing so dead-node cleanup stays targeted.
+func (o *Orchestrator) unavailableNodeSet(ctx context.Context) (map[string]struct{}, error) {
+	out, err := o.kubectl(ctx, 20*time.Second, "get", "nodes", "-o", "json")
+	if err != nil {
+		return nil, fmt.Errorf("query nodes: %w", err)
+	}
+	var nodes nodeReadyList
+	if err := json.Unmarshal([]byte(out), &nodes); err != nil {
+		return nil, fmt.Errorf("decode nodes: %w", err)
+	}
+
+	unavailable := map[string]struct{}{}
+	for _, item := range nodes.Items {
+		ready := ""
+		for _, cond := range item.Status.Conditions {
+			if strings.EqualFold(strings.TrimSpace(cond.Type), "Ready") {
+				ready = strings.TrimSpace(cond.Status)
+				break
+			}
+		}
+		if ready != "True" {
+			unavailable[item.Metadata.Name] = struct{}{}
+		}
+	}
+	return unavailable, nil
+}
+
+// requestFluxReconcile runs one orchestration or CLI step.
+// Signature: (o *Orchestrator) requestFluxReconcile(ctx context.Context) error.
+// Why: post-start repairs need a lightweight way to refresh GitOps health
+// without reusing the broader startup flux-resume flow.
+func (o *Orchestrator) requestFluxReconcile(ctx context.Context) error {
+	if o.runner.DryRun {
+		return nil
+	}
+
+	now := time.Now().UTC().Format(time.RFC3339)
+	if _, err := o.kubectl(
+		ctx,
+		25*time.Second,
+		"-n", "flux-system",
+		"annotate", "gitrepository", "flux-system",
+		"reconcile.fluxcd.io/requestedAt="+now,
+		"--overwrite",
+	); err != nil {
+		return fmt.Errorf("annotate flux source reconcile: %w", err)
+	}
+	if _, err := o.kubectl(
+		ctx,
+		25*time.Second,
+		"-n", "flux-system",
+		"annotate",
+		"kustomizations.kustomize.toolkit.fluxcd.io",
+		"--all",
+		"reconcile.fluxcd.io/requestedAt="+now,
+		"--overwrite",
+	); err != nil {
+		return fmt.Errorf("annotate flux kustomizations reconcile: %w", err)
+	}
+	if _, err := o.kubectl(
+		ctx,
+		25*time.Second,
+		"annotate",
+		"--all-namespaces",
+		"helmreleases.helm.toolkit.fluxcd.io",
+		"--all",
+		"reconcile.fluxcd.io/requestedAt="+now,
+		"--overwrite",
+	); err != nil {
+		o.log.Printf("warning: annotate helmreleases for post-start reconcile failed: %v", err)
+	}
+	if o.runOverride == nil && o.runner.CommandExists("flux") {
+		if _, err := o.run(ctx, 75*time.Second, "flux", "reconcile", "source", "git", "flux-system", "-n", "flux-system", "--timeout=60s"); err != nil {
+			o.log.Printf("warning: flux source reconcile command failed during post-start auto-heal: %v", err)
+		}
+	}
+	return nil
+}
diff --git a/internal/cluster/orchestrator_autorepair_cleanup_test.go b/internal/cluster/orchestrator_autorepair_cleanup_test.go
new file mode 100644
index 0000000..9df34f7
--- /dev/null
+++ b/internal/cluster/orchestrator_autorepair_cleanup_test.go
@@ -0,0 +1,296 @@
+package cluster
+
+import (
+	"context"
+	"errors"
+	"io"
+	"log"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+
+	"scm.bstein.dev/bstein/ananke/internal/config"
+	"scm.bstein.dev/bstein/ananke/internal/execx"
+	"scm.bstein.dev/bstein/ananke/internal/state"
+)
+
+// TestCleanupTerminatingPodsOnUnavailableNodesBranches runs one orchestration or CLI step.
+// Signature: TestCleanupTerminatingPodsOnUnavailableNodesBranches(t *testing.T).
+// Why: cleanup on dead nodes must be selective so Ananke only force-deletes the
+// truly stranded pods and tolerates already-gone objects.
+func TestCleanupTerminatingPodsOnUnavailableNodesBranches(t *testing.T) {
+	t.Run("dry run skips", func(t *testing.T) {
+		orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
+		orch.runner.DryRun = true
+		count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
+		if err != nil || count != 0 {
+			t.Fatalf("expected dry-run skip, got count=%d err=%v", count, err)
+		}
+	})
+
+	t.Run("selective cleanup tolerates not found", func(t *testing.T) {
+		oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
+		recentDelete := time.Now().Add(-2 * time.Minute).UTC().Format(time.RFC3339)
+		orch := buildOrchestratorWithStubs(t, config.Config{
+			Startup: config.Startup{DeadNodeCleanupGraceSeconds: 300},
+		}, []commandStub{
+			{
+				match: matchContains("kubectl", "get nodes -o json"),
+				out:   `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`,
+			},
+			{
+				match: matchContains("kubectl", "get pods -A -o json"),
+				out: `{"items":[` +
+					`{"metadata":{"namespace":"maintenance","name":"old-stale","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}},` +
+					`{"metadata":{"namespace":"maintenance","name":"fresh-stale","deletionTimestamp":"` + recentDelete + `"},"spec":{"nodeName":"titan-22"}},` +
+					`{"metadata":{"namespace":"logging","name":"healthy-node","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-18"}},` +
+					`{"metadata":{"namespace":"logging","name":"no-delete"},"spec":{"nodeName":"titan-22"}}]}`,
+			},
+			{
+				match: matchContains("kubectl", "-n maintenance delete pod old-stale --grace-period=0 --force --wait=false"),
+				err:   errors.New("pod old-stale not found"),
+			},
+		})
+
+		count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
+		if err != nil {
+			t.Fatalf("cleanupTerminatingPodsOnUnavailableNodes failed: %v", err)
+		}
+		if count != 1 {
+			t.Fatalf("expected one cleaned pod, got %d", count)
+		}
+	})
+
+	t.Run("query and decode errors surface", func(t *testing.T) {
+		queryErrOrch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
+			{
+				match: matchContains("kubectl", "get nodes -o json"),
+				err:   errors.New("nodes failed"),
+			},
+		})
+		if _, err := queryErrOrch.cleanupTerminatingPodsOnUnavailableNodes(context.Background()); err == nil || !strings.Contains(err.Error(), "query nodes") {
+			t.Fatalf("expected node query error, got %v", err)
+		}
+
+		decodeErrOrch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
+			{
+				match: matchContains("kubectl", "get nodes -o json"),
+				out:   `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}}]}`,
+			},
+			{
+				match: matchContains("kubectl", "get pods -A -o json"),
+				out:   `{bad json`,
+			},
+		})
+		if _, err := decodeErrOrch.cleanupTerminatingPodsOnUnavailableNodes(context.Background()); err == nil || !strings.Contains(err.Error(), "decode pods") {
+			t.Fatalf("expected pod decode error, got %v", err)
+		}
+	})
+
+	t.Run("delete hard error surfaces", func(t *testing.T) {
+		oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
+		orch := buildOrchestratorWithStubs(t, config.Config{
+			Startup: config.Startup{DeadNodeCleanupGraceSeconds: 300},
+		}, []commandStub{
+			{
+				match: matchContains("kubectl", "get nodes -o json"),
+				out:   `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"False"}]}}]}`,
+			},
+			{
+				match: matchContains("kubectl", "get pods -A -o json"),
+				out:   `{"items":[{"metadata":{"namespace":"maintenance","name":"old-stale","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}}]}`,
+			},
+			{
+				match: matchContains("kubectl", "-n maintenance delete pod old-stale --grace-period=0 --force --wait=false"),
+				err:   errors.New("delete failed"),
+			},
+		})
+
+		count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
+		if count != 0 || err == nil || !strings.Contains(err.Error(), "delete pod maintenance/old-stale") {
+			t.Fatalf("expected delete failure, got count=%d err=%v", count, err)
+		}
+	})
+}
+
+// TestUnavailableNodeSetBranches runs one orchestration or CLI step.
+// Signature: TestUnavailableNodeSetBranches(t *testing.T).
+// Why: node Ready parsing drives dead-node cleanup, so malformed and missing
+// Ready condition payloads need direct coverage too.
+func TestUnavailableNodeSetBranches(t *testing.T) {
+	t.Run("decode error surfaces", func(t *testing.T) {
+		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
+			{match: matchContains("kubectl", "get nodes -o json"), out: `{bad json`},
+		})
+		if _, err := orch.unavailableNodeSet(context.Background()); err == nil || !strings.Contains(err.Error(), "decode nodes") {
+			t.Fatalf("expected decode error, got %v", err)
+		}
+	})
+
+	t.Run("missing ready condition counts as unavailable", func(t *testing.T) {
+		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
+			{
+				match: matchContains("kubectl", "get nodes -o json"),
+				out:   `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"MemoryPressure","status":"False"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`,
+			},
+		})
+		nodes, err := orch.unavailableNodeSet(context.Background())
+		if err != nil {
+			t.Fatalf("unavailableNodeSet failed: %v", err)
+		}
+		if _, ok := nodes["titan-22"]; !ok {
+			t.Fatalf("expected titan-22 to be treated as unavailable")
+		}
+		if _, ok := nodes["titan-07"]; ok {
+			t.Fatalf("did not expect titan-07 to be treated as unavailable")
+		}
+	})
+}
+
+// TestRequestFluxReconcileBranches runs one orchestration or CLI step.
+// Signature: TestRequestFluxReconcileBranches(t *testing.T).
+// Why: the post-start repair loop needs predictable Flux refresh behavior even
+// when one annotation call is flaky.
+func TestRequestFluxReconcileBranches(t *testing.T) {
+	t.Run("dry run skips", func(t *testing.T) {
+		orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
+		orch.runner.DryRun = true
+		if err := orch.requestFluxReconcile(context.Background()); err != nil {
+			t.Fatalf("dry-run requestFluxReconcile failed: %v", err)
+		}
+	})
+
+	t.Run("git source annotate error surfaces", func(t *testing.T) {
+		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
+			{
+				match: matchContains("kubectl", "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="),
+				err:   errors.New("annotate failed"),
+			},
+		})
+		if err := orch.requestFluxReconcile(context.Background()); err == nil || !strings.Contains(err.Error(), "annotate flux source reconcile") {
+			t.Fatalf("expected gitrepository annotate error, got %v", err)
+		}
+	})
+
+	t.Run("kustomization annotate error surfaces", func(t *testing.T) {
+		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
+			{
+				match: matchContains("kubectl", "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="),
+				out:   "",
+			},
+			{
+				match: matchContains("kubectl", "-n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="),
+				err:   errors.New("annotate failed"),
+			},
+		})
+		if err := orch.requestFluxReconcile(context.Background()); err == nil || !strings.Contains(err.Error(), "annotate flux kustomizations reconcile") {
+			t.Fatalf("expected kustomization annotate error, got %v", err)
+		}
+	})
+
+	t.Run("helm annotate warning and flux command path", func(t *testing.T) {
+		tmpDir := t.TempDir()
+		callLog := filepath.Join(tmpDir, "calls.log")
+		kubectlPath := filepath.Join(tmpDir, "kubectl")
+		fluxPath := filepath.Join(tmpDir, "flux")
+
+		kubectlScript := "#!/bin/sh\n" +
+			"printf '%s\\n' \"$*\" >> \"" + callLog + "\"\n" +
+			"case \"$*\" in\n" +
+			"  *helmreleases.helm.toolkit.fluxcd.io*) echo helm annotate failed >&2; exit 1 ;;\n" +
+			"esac\n" +
+			"exit 0\n"
+		fluxScript := "#!/bin/sh\n" +
+			"printf 'flux %s\\n' \"$*\" >> \"" + callLog + "\"\n" +
+			"exit 0\n"
+
+		if err := os.WriteFile(kubectlPath, []byte(kubectlScript), 0o755); err != nil {
+			t.Fatalf("write fake kubectl: %v", err)
+		}
+		if err := os.WriteFile(fluxPath, []byte(fluxScript), 0o755); err != nil {
+			t.Fatalf("write fake flux: %v", err)
+		}
+		t.Setenv("PATH", tmpDir+":"+os.Getenv("PATH"))
+
+		cfg := config.Config{
+			State: config.State{
+				Dir:            t.TempDir(),
+				ReportsDir:     filepath.Join(t.TempDir(), "reports"),
+				RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
+			},
+		}
+		orch := &Orchestrator{
+			cfg:    cfg,
+			runner: &execx.Runner{},
+			store:  state.New(cfg.State.RunHistoryPath),
+			log:    log.New(io.Discard, "", 0),
+		}
+
+		if err := orch.requestFluxReconcile(context.Background()); err != nil {
+			t.Fatalf("requestFluxReconcile with fake binaries failed: %v", err)
+		}
+		calls, err := os.ReadFile(callLog)
+		if err != nil {
+			t.Fatalf("read fake command log: %v", err)
+		}
+		logText := string(calls)
+		if !strings.Contains(logText, "annotate gitrepository flux-system") {
+			t.Fatalf("expected gitrepository annotate call, got %q", logText)
+		}
+		if !strings.Contains(logText, "annotate kustomizations.kustomize.toolkit.fluxcd.io --all") {
+			t.Fatalf("expected kustomization annotate call, got %q", logText)
+		}
+		if !strings.Contains(logText, "flux reconcile source git flux-system -n flux-system --timeout=60s") {
+			t.Fatalf("expected flux reconcile command, got %q", logText)
+		}
+	})
+
t.Run("flux command failure is tolerated", func(t *testing.T) { + tmpDir := t.TempDir() + callLog := filepath.Join(tmpDir, "calls.log") + kubectlPath := filepath.Join(tmpDir, "kubectl") + fluxPath := filepath.Join(tmpDir, "flux") + + kubectlScript := "#!/bin/sh\n" + + "printf '%s\\n' \"$*\" >> \"" + callLog + "\"\n" + + "exit 0\n" + fluxScript := "#!/bin/sh\n" + + "printf 'flux-fail %s\\n' \"$*\" >> \"" + callLog + "\"\n" + + "exit 1\n" + + if err := os.WriteFile(kubectlPath, []byte(kubectlScript), 0o755); err != nil { + t.Fatalf("write fake kubectl: %v", err) + } + if err := os.WriteFile(fluxPath, []byte(fluxScript), 0o755); err != nil { + t.Fatalf("write fake flux: %v", err) + } + t.Setenv("PATH", tmpDir+":"+os.Getenv("PATH")) + + cfg := config.Config{ + State: config.State{ + Dir: t.TempDir(), + ReportsDir: filepath.Join(t.TempDir(), "reports"), + RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"), + }, + } + orch := &Orchestrator{ + cfg: cfg, + runner: &execx.Runner{}, + store: state.New(cfg.State.RunHistoryPath), + log: log.New(io.Discard, "", 0), + } + + if err := orch.requestFluxReconcile(context.Background()); err != nil { + t.Fatalf("requestFluxReconcile should tolerate flux failure, got %v", err) + } + calls, err := os.ReadFile(callLog) + if err != nil { + t.Fatalf("read fake command log: %v", err) + } + if !strings.Contains(string(calls), "flux-fail reconcile source git flux-system -n flux-system --timeout=60s") { + t.Fatalf("expected failing flux command to be attempted, got %q", string(calls)) + } + }) +} diff --git a/internal/cluster/orchestrator_autorepair_test.go b/internal/cluster/orchestrator_autorepair_test.go new file mode 100644 index 0000000..4a80172 --- /dev/null +++ b/internal/cluster/orchestrator_autorepair_test.go @@ -0,0 +1,382 @@ +package cluster + +import ( + "context" + "encoding/base64" + "errors" + "io" + "log" + "path/filepath" + "strings" + "testing" + "time" + + "scm.bstein.dev/bstein/ananke/internal/config" + "scm.bstein.dev/bstein/ananke/internal/execx" + "scm.bstein.dev/bstein/ananke/internal/state" +) + +// TestPostStartAutoHealRepairsVaultAndUnavailableNodes runs one orchestration or CLI step. +// Signature: TestPostStartAutoHealRepairsVaultAndUnavailableNodes(t *testing.T). +// Why: covers the new daemon-triggered repair path for late Vault reseals and +// stale terminating pods anchored to unavailable nodes. 
+func TestPostStartAutoHealRepairsVaultAndUnavailableNodes(t *testing.T) {
+	cfg := config.Config{
+		Startup: config.Startup{
+			DeadNodeCleanupGraceSeconds: 300,
+			RequiredNodeLabels: map[string]map[string]string{
+				"titan-07": {"node-role.kubernetes.io/worker": "true"},
+			},
+		},
+		State: config.State{
+			Dir:            t.TempDir(),
+			ReportsDir:     filepath.Join(t.TempDir(), "reports"),
+			RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
+		},
+	}
+	orch := &Orchestrator{
+		cfg:    cfg,
+		runner: &execx.Runner{},
+		store:  state.New(filepath.Join(t.TempDir(), "runs.json")),
+		log:    log.New(io.Discard, "", 0),
+	}
+
+	oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
+	unsealCalls := 0
+	jobCreated := false
+	reconciled := false
+	deleted := map[string]bool{}
+	dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
+		if name != "kubectl" {
+			return "", nil
+		}
+		joined := strings.Join(args, " ")
+		switch {
+		case strings.Contains(joined, "label node titan-07 --overwrite node-role.kubernetes.io/worker=true"):
+			return "", nil
+		case strings.Contains(joined, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
+			return "Running", nil
+		case strings.Contains(joined, "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"):
+			if unsealCalls == 0 {
+				return `{"initialized":true,"sealed":true}`, nil
+			}
+			return `{"initialized":true,"sealed":false}`, nil
+		case strings.Contains(joined, "-n vault get secret vault-init -o jsonpath={.data.unseal_key_b64}"):
+			return base64.StdEncoding.EncodeToString([]byte("vault-unseal-key")), nil
+		case strings.Contains(joined, "vault operator unseal"):
+			unsealCalls++
+			return "", nil
+		case strings.Contains(joined, "-n vault create job --from=cronjob/vault-k8s-auth-config"):
+			jobCreated = true
+			return "", nil
+		case strings.Contains(joined, "get nodes -o json"):
+			return `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
+		case strings.Contains(joined, "get pods -A -o json"):
+			return `{"items":[{"metadata":{"namespace":"maintenance","name":"stale-pod","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}},{"metadata":{"namespace":"logging","name":"healthy-node-pod","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-18"}}]}`, nil
+		case strings.Contains(joined, "-n maintenance delete pod stale-pod --grace-period=0 --force --wait=false"):
+			deleted["maintenance/stale-pod"] = true
+			return "", nil
+		case strings.Contains(joined, "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="):
+			reconciled = true
+			return "", nil
+		case strings.Contains(joined, "-n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="):
+			return "", nil
+		case strings.Contains(joined, "annotate --all-namespaces helmreleases.helm.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="):
+			return "", nil
+		default:
+			return "", nil
+		}
+	}
+	orch.SetCommandOverrides(dispatch, dispatch)
+
+	if err := orch.postStartAutoHeal(context.Background()); err != nil {
+		t.Fatalf("postStartAutoHeal failed: %v", err)
+	}
+	if unsealCalls != 1 {
+		t.Fatalf("expected one Vault unseal attempt, got %d", unsealCalls)
+	}
+	if !jobCreated {
+		t.Fatalf("expected vault k8s auth config job to be created")
+	}
+	if !deleted["maintenance/stale-pod"] {
+		t.Fatalf("expected stale unavailable-node pod to be deleted")
+	}
+	if !reconciled {
+		t.Fatalf("expected flux reconcile request after repairs")
+	}
+	if deleted["logging/healthy-node-pod"] {
+		t.Fatalf("did not expect terminating pod on healthy node to be deleted")
+	}
+}
+
+// TestPostStartAutoHealSkipsWhenClusterIsHealthy runs one orchestration or CLI step.
+// Signature: TestPostStartAutoHealSkipsWhenClusterIsHealthy(t *testing.T).
+// Why: proves the new post-start repair loop stays quiet when the specific
+// failure patterns are absent.
+func TestPostStartAutoHealSkipsWhenClusterIsHealthy(t *testing.T) {
+	cfg := config.Config{
+		Startup: config.Startup{
+			DeadNodeCleanupGraceSeconds: 300,
+		},
+		State: config.State{
+			Dir:            t.TempDir(),
+			ReportsDir:     filepath.Join(t.TempDir(), "reports"),
+			RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
+		},
+	}
+	orch := &Orchestrator{
+		cfg:    cfg,
+		runner: &execx.Runner{},
+		store:  state.New(filepath.Join(t.TempDir(), "runs.json")),
+		log:    log.New(io.Discard, "", 0),
+	}
+
+	unsealCalls := 0
+	jobCreated := false
+	reconciled := false
+	dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
+		if name != "kubectl" {
+			return "", nil
+		}
+		joined := strings.Join(args, " ")
+		switch {
+		case strings.Contains(joined, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
+			return "Running", nil
+		case strings.Contains(joined, "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"):
+			return `{"initialized":true,"sealed":false}`, nil
+		case strings.Contains(joined, "-n vault create job --from=cronjob/vault-k8s-auth-config"):
+			jobCreated = true
+			return "", nil
+		case strings.Contains(joined, "vault operator unseal"):
+			unsealCalls++
+			return "", nil
+		case strings.Contains(joined, "get nodes -o json"):
+			return `{"items":[{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
+		case strings.Contains(joined, "get pods -A -o json"):
+			return `{"items":[]}`, nil
+		case strings.Contains(joined, "reconcile.fluxcd.io/requestedAt="):
+			reconciled = true
+			return "", nil
+		default:
+			return "", nil
+		}
+	}
+	orch.SetCommandOverrides(dispatch, dispatch)
+
+	if err := orch.postStartAutoHeal(context.Background()); err != nil {
+		t.Fatalf("postStartAutoHeal failed: %v", err)
+	}
+	if unsealCalls != 0 {
+		t.Fatalf("did not expect Vault unseal calls, got %d", unsealCalls)
+	}
+	if jobCreated {
+		t.Fatalf("did not expect vault auth config job creation")
+	}
+	if reconciled {
+		t.Fatalf("did not expect flux reconcile request for healthy cluster")
+	}
+}
+
+// TestRunPostStartAutoHealDryRun runs one orchestration or CLI step.
+// Signature: TestRunPostStartAutoHealDryRun(t *testing.T).
+// Why: covers the exported wrapper and the top-level dry-run guard so daemon
+// auto-heal never mutates cluster state during rehearsal runs.
+func TestRunPostStartAutoHealDryRun(t *testing.T) {
+	orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
+	orch.runner.DryRun = true
+
+	if err := orch.RunPostStartAutoHeal(context.Background()); err != nil {
+		t.Fatalf("RunPostStartAutoHeal dry-run failed: %v", err)
+	}
+}
+
+// TestPostStartAutoHealAggregatesErrors runs one orchestration or CLI step.
+// Signature: TestPostStartAutoHealAggregatesErrors(t *testing.T).
+// Why: proves the daemon reports each failed sub-repair together instead of
+// hiding later failures behind the first problem.
+func TestPostStartAutoHealAggregatesErrors(t *testing.T) {
+	cfg := config.Config{
+		Startup: config.Startup{
+			DeadNodeCleanupGraceSeconds: 300,
+			RequiredNodeLabels: map[string]map[string]string{
+				"titan-07": {"node-role.kubernetes.io/worker": "true"},
+			},
+		},
+	}
+	orch := buildOrchestratorWithStubs(t, cfg, []commandStub{
+		{
+			match: matchContains("kubectl", "label node titan-07 --overwrite node-role.kubernetes.io/worker=true"),
+			err:   errors.New("label failed"),
+		},
+		{
+			match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
+			err:   errors.New("vault phase failed"),
+		},
+		{
+			match: matchContains("kubectl", "get nodes -o json"),
+			err:   errors.New("node query failed"),
+		},
+	})
+
+	err := orch.postStartAutoHeal(context.Background())
+	if err == nil {
+		t.Fatalf("expected aggregated error")
+	}
+	msg := err.Error()
+	for _, want := range []string{
+		"required node labels:",
+		"vault auto-recovery:",
+		"dead-node terminating pod cleanup:",
+	} {
+		if !strings.Contains(msg, want) {
+			t.Fatalf("expected %q in %q", want, msg)
+		}
+	}
+}
+
+// TestAutoRecoverSealedVaultBranches runs one orchestration or CLI step.
+// Signature: TestAutoRecoverSealedVaultBranches(t *testing.T).
+// Why: late Vault reseals are a high-risk failure path, so the daemon needs
+// coverage across the quiet-skip, parse-failure, and unseal-failure branches.
+func TestAutoRecoverSealedVaultBranches(t *testing.T) {
+	t.Run("dry run skips", func(t *testing.T) {
+		orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
+		orch.runner.DryRun = true
+
+		recovered, err := orch.autoRecoverSealedVault(context.Background())
+		if err != nil || recovered {
+			t.Fatalf("expected dry-run skip, got recovered=%v err=%v", recovered, err)
+		}
+	})
+
+	t.Run("pod missing is quiet", func(t *testing.T) {
+		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
+			{
+				match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
+				err:   errors.New("vault-0 not found"),
+			},
+		})
+
+		recovered, err := orch.autoRecoverSealedVault(context.Background())
+		if err != nil || recovered {
+			t.Fatalf("expected quiet skip, got recovered=%v err=%v", recovered, err)
+		}
+	})
+
+	t.Run("phase check error surfaces", func(t *testing.T) {
+		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
+			{
+				match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
+				err:   errors.New("phase check failed"),
+			},
+		})
+
+		recovered, err := orch.autoRecoverSealedVault(context.Background())
+		if recovered || err == nil || !strings.Contains(err.Error(), "vault pod phase check failed") {
+			t.Fatalf("expected phase check error, got recovered=%v err=%v", recovered, err)
+		}
+	})
+
+	t.Run("non-running pod defers", func(t *testing.T) {
+		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
+			{
+				match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
+				out:   "Pending",
+			},
+		})
+
+		recovered, err := orch.autoRecoverSealedVault(context.Background())
+		if err != nil || recovered {
+			t.Fatalf("expected pending pod skip, got recovered=%v err=%v", recovered, err)
+		}
+	})
+
+	t.Run("status parse failure surfaces", func(t *testing.T) {
+		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
+			{
+				match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
+				out:   "Running",
+			},
+			{
+				match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
+				out:   "garbage",
+			},
+		})
+
+		recovered, err := orch.autoRecoverSealedVault(context.Background())
+		if recovered || err == nil || !strings.Contains(err.Error(), "parse vault status") {
+			t.Fatalf("expected parse error, got recovered=%v err=%v", recovered, err)
+		}
+	})
+
+	t.Run("already unsealed stays quiet", func(t *testing.T) {
+		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
+			{
+				match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
+				out:   "Running",
+			},
+			{
+				match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
+				out:   `{"sealed":false}`,
+			},
+		})
+
+		recovered, err := orch.autoRecoverSealedVault(context.Background())
+		if err != nil || recovered {
+			t.Fatalf("expected already-unsealed skip, got recovered=%v err=%v", recovered, err)
+		}
+	})
+
+	t.Run("unseal failure surfaces", func(t *testing.T) {
+		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
+			{
+				match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
+				out:   "Running",
+			},
+			{
+				match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
+				out:   `{"sealed":true}`,
+			},
+			{
+				match: matchContains("kubectl", "-n vault get secret vault-init -o jsonpath={.data.unseal_key_b64}"),
+				out:   base64.StdEncoding.EncodeToString([]byte("vault-unseal-key")),
+			},
+			{
+				match: matchContains("kubectl", "vault operator unseal"),
+				err:   errors.New("exec boom"),
+			},
+		})
+
+		recovered, err := orch.autoRecoverSealedVault(context.Background())
+		if recovered || err == nil || !strings.Contains(err.Error(), "vault unseal attempt 1 failed") {
+			t.Fatalf("expected unseal failure, got recovered=%v err=%v", recovered, err)
+		}
+	})
+}
+
+// TestRerunVaultK8sAuthConfigJobBranches runs one orchestration or CLI step.
+// Signature: TestRerunVaultK8sAuthConfigJobBranches(t *testing.T).
+// Why: the post-unseal auth job is part of the production recovery chain, so
+// dry-run and create-error behavior both need explicit coverage.
+func TestRerunVaultK8sAuthConfigJobBranches(t *testing.T) {
+	t.Run("dry run skips", func(t *testing.T) {
+		orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
+		orch.runner.DryRun = true
+		if err := orch.rerunVaultK8sAuthConfigJob(context.Background()); err != nil {
+			t.Fatalf("dry-run rerunVaultK8sAuthConfigJob failed: %v", err)
+		}
+	})
+
+	t.Run("create error surfaces", func(t *testing.T) {
+		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
+			{
+				match: matchContains("kubectl", "-n vault create job --from=cronjob/vault-k8s-auth-config"),
+				err:   errors.New("create failed"),
+			},
+		})
+		err := orch.rerunVaultK8sAuthConfigJob(context.Background())
+		if err == nil || !strings.Contains(err.Error(), "create job vault-k8s-auth-config-autoheal-") {
+			t.Fatalf("expected create-job error, got %v", err)
+		}
+	})
+}
diff --git a/internal/config/apply_defaults.go b/internal/config/apply_defaults.go
index 58700d4..5325598 100644
--- a/internal/config/apply_defaults.go
+++ b/internal/config/apply_defaults.go
@@ -195,6 +195,12 @@ func (c *Config) applyDefaults() {
 	if c.Startup.StuckPodGraceSeconds <= 0 {
 		c.Startup.StuckPodGraceSeconds = 180
 	}
+	if c.Startup.PostStartAutoHealSeconds <= 0 {
+		c.Startup.PostStartAutoHealSeconds = 60
+	}
+	if c.Startup.DeadNodeCleanupGraceSeconds <= 0 {
+		c.Startup.DeadNodeCleanupGraceSeconds = 300
+	}
 	if strings.TrimSpace(c.Startup.VaultUnsealKeyFile) == "" {
 		c.Startup.VaultUnsealKeyFile = "/var/lib/ananke/vault-unseal.key"
 	}
diff --git a/internal/config/types.go b/internal/config/types.go
index 46bbf0e..80c432c 100644
--- a/internal/config/types.go
+++ b/internal/config/types.go
@@ -91,6 +91,8 @@ type Startup struct {
 	SchedulingStormEventThreshold int    `yaml:"scheduling_storm_event_threshold"`
 	SchedulingStormWindowSeconds  int    `yaml:"scheduling_storm_window_seconds"`
 	StuckPodGraceSeconds          int    `yaml:"stuck_pod_grace_seconds"`
+	PostStartAutoHealSeconds      int    `yaml:"post_start_auto_heal_seconds"`
+	DeadNodeCleanupGraceSeconds   int    `yaml:"dead_node_cleanup_grace_seconds"`
 	VaultUnsealKeyFile            string `yaml:"vault_unseal_key_file"`
 	VaultUnsealBreakglassCommand  string `yaml:"vault_unseal_breakglass_command"`
 	VaultUnsealBreakglassTimeout  int    `yaml:"vault_unseal_breakglass_timeout_seconds"`
diff --git a/internal/config/validate.go b/internal/config/validate.go
index 84f97a9..6dcc34d 100644
--- a/internal/config/validate.go
+++ b/internal/config/validate.go
@@ -272,6 +272,12 @@ func (c Config) Validate() error {
 	if c.Startup.StuckPodGraceSeconds <= 0 {
 		return fmt.Errorf("config.startup.stuck_pod_grace_seconds must be > 0")
 	}
+	if c.Startup.PostStartAutoHealSeconds <= 0 {
+		return fmt.Errorf("config.startup.post_start_auto_heal_seconds must be > 0")
+	}
+	if c.Startup.DeadNodeCleanupGraceSeconds <= 0 {
+		return fmt.Errorf("config.startup.dead_node_cleanup_grace_seconds must be > 0")
+	}
 	for _, probe := range c.Startup.PostStartProbes {
 		if strings.TrimSpace(probe) == "" {
 			return fmt.Errorf("config.startup.post_start_probes entries must not be empty")
""} }}, {"bad_stuck_pod_grace", func(c *Config) { c.Startup.StuckPodGraceSeconds = 0 }}, + {"bad_post_start_auto_heal_seconds", func(c *Config) { c.Startup.PostStartAutoHealSeconds = 0 }}, + {"bad_dead_node_cleanup_grace_seconds", func(c *Config) { c.Startup.DeadNodeCleanupGraceSeconds = 0 }}, {"bad_empty_post_start_probe_entry", func(c *Config) { c.Startup.PostStartProbes = []string{"https://ok", ""} }}, {"bad_empty_ignore_flux_entry", func(c *Config) { c.Startup.IgnoreFluxKustomizations = []string{"", "ns/name"} }}, {"bad_empty_ignore_workloads_entry", func(c *Config) { c.Startup.IgnoreWorkloads = []string{"", "ns/name"} }}, @@ -143,6 +145,9 @@ func TestApplyDefaultsPopulatesZeroConfig(t *testing.T) { if cfg.Startup.TimeSyncMode == "" || cfg.Startup.EtcdRestoreControlPlane == "" || cfg.Startup.VaultUnsealKeyFile == "" { t.Fatalf("expected startup defaults to be set") } + if cfg.Startup.PostStartAutoHealSeconds <= 0 || cfg.Startup.DeadNodeCleanupGraceSeconds <= 0 { + t.Fatalf("expected post-start auto-heal defaults to be set") + } if cfg.Startup.NodeInventoryReachRequiredNodes == nil || cfg.Startup.NodeSSHAuthRequiredNodes == nil || cfg.Startup.FluxHealthRequiredKustomizations == nil || cfg.Startup.WorkloadConvergenceRequiredNamespaces == nil { t.Fatalf("expected startup recovery scope slices to be initialized") diff --git a/internal/service/daemon.go b/internal/service/daemon.go index 0c48fb9..a8c9adf 100644 --- a/internal/service/daemon.go +++ b/internal/service/daemon.go @@ -32,6 +32,8 @@ type Daemon struct { targets []Target log *log.Logger exporter *metrics.Exporter + + postStartAutoHealOverride func(context.Context) error } var sshConfigCandidates = []string{ @@ -94,6 +96,7 @@ func (d *Daemon) Run(ctx context.Context) error { lastOnBattery := map[string]bool{} onBatterySince := map[string]time.Time{} breachCount := map[string]int{} + lastAutoHeal := time.Time{} for _, t := range d.targets { lastGood[t.Name] = time.Now() } @@ -108,12 +111,16 @@ func (d *Daemon) Run(ctx context.Context) error { case <-t.C: budget := d.orch.EstimatedEmergencyShutdownSeconds() threshold := int(math.Ceil(float64(budget) * d.cfg.UPS.RuntimeSafetyFactor)) + anyOnBattery := false d.exporter.UpdateBudget(budget) for _, target := range d.targets { sample, err := target.Provider.Read(ctx) if err != nil { + if lastOnBattery[target.Name] { + anyOnBattery = true + } d.log.Printf("warning: ups read failed target=%s (%s): %v", target.Name, target.Target, err) d.exporter.UpdateSample(metrics.Sample{ Name: target.Name, @@ -132,6 +139,9 @@ func (d *Daemon) Run(ctx context.Context) error { } lastGood[target.Name] = time.Now() + if sample.OnBattery { + anyOnBattery = true + } wasOnBattery := lastOnBattery[target.Name] if sample.OnBattery { if !wasOnBattery || onBatterySince[target.Name].IsZero() { @@ -189,10 +199,51 @@ func (d *Daemon) Run(ctx context.Context) error { return d.triggerShutdown(ctx, triggerReason) } } + + d.maybeRunPostStartAutoHeal(ctx, &lastAutoHeal, anyOnBattery) } } } +// maybeRunPostStartAutoHeal runs one orchestration or CLI step. +// Signature: (d *Daemon) maybeRunPostStartAutoHeal(ctx context.Context, lastRun *time.Time, anyOnBattery bool). +// Why: gives the long-running daemon a bounded path to repair post-start drift +// like a later Vault reseal or stale dead-node deletions without waiting for a +// fresh bootstrap run. 
+func (d *Daemon) maybeRunPostStartAutoHeal(ctx context.Context, lastRun *time.Time, anyOnBattery bool) { + interval := time.Duration(d.cfg.Startup.PostStartAutoHealSeconds) * time.Second + if interval <= 0 || anyOnBattery { + return + } + if d.orch == nil && d.postStartAutoHealOverride == nil { + return + } + now := time.Now() + if lastRun != nil && !lastRun.IsZero() && now.Sub(*lastRun) < interval { + return + } + if lastRun != nil { + *lastRun = now + } + if err := d.runPostStartAutoHeal(ctx); err != nil { + d.log.Printf("warning: post-start auto-heal: %v", err) + } +} + +// runPostStartAutoHeal runs one orchestration or CLI step. +// Signature: (d *Daemon) runPostStartAutoHeal(ctx context.Context) error. +// Why: keeps the daemon loop readable while allowing unit tests to inject a +// deterministic repair hook without a live cluster. +func (d *Daemon) runPostStartAutoHeal(ctx context.Context) error { + if d.postStartAutoHealOverride != nil { + return d.postStartAutoHealOverride(ctx) + } + if d.orch == nil { + return nil + } + return d.orch.RunPostStartAutoHeal(ctx) +} + // triggerShutdown runs one orchestration or CLI step. // Signature: (d *Daemon) triggerShutdown(ctx context.Context, reason string) error. // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. diff --git a/internal/service/daemon_poststart_autorepair_test.go b/internal/service/daemon_poststart_autorepair_test.go new file mode 100644 index 0000000..1aa34a8 --- /dev/null +++ b/internal/service/daemon_poststart_autorepair_test.go @@ -0,0 +1,51 @@ +package service + +import ( + "context" + "testing" + "time" + + "scm.bstein.dev/bstein/ananke/internal/config" +) + +// TestDaemonMaybeRunPostStartAutoHeal runs one orchestration or CLI step. +// Signature: TestDaemonMaybeRunPostStartAutoHeal(t *testing.T). +// Why: covers the daemon-side interval and on-battery guards for the new +// post-start repair loop. 
+func TestDaemonMaybeRunPostStartAutoHeal(t *testing.T) {
+	calls := 0
+	d := &Daemon{
+		cfg: config.Config{
+			Startup: config.Startup{
+				PostStartAutoHealSeconds: 10,
+			},
+		},
+		postStartAutoHealOverride: func(context.Context) error {
+			calls++
+			return nil
+		},
+	}
+
+	var last time.Time
+	d.maybeRunPostStartAutoHeal(context.Background(), &last, false)
+	if calls != 1 {
+		t.Fatalf("expected first auto-heal invocation, got %d", calls)
+	}
+
+	d.maybeRunPostStartAutoHeal(context.Background(), &last, false)
+	if calls != 1 {
+		t.Fatalf("expected interval guard to suppress second call, got %d", calls)
+	}
+
+	last = time.Now().Add(-11 * time.Second)
+	d.maybeRunPostStartAutoHeal(context.Background(), &last, true)
+	if calls != 1 {
+		t.Fatalf("expected on-battery guard to suppress call, got %d", calls)
+	}
+
+	last = time.Now().Add(-11 * time.Second)
+	d.maybeRunPostStartAutoHeal(context.Background(), &last, false)
+	if calls != 2 {
+		t.Fatalf("expected second allowed auto-heal call, got %d", calls)
+	}
+}
diff --git a/testing/hygiene/in_tree_test_allowlist.txt b/testing/hygiene/in_tree_test_allowlist.txt
index e3c8414..8c2a203 100644
--- a/testing/hygiene/in_tree_test_allowlist.txt
+++ b/testing/hygiene/in_tree_test_allowlist.txt
@@ -13,6 +13,8 @@ cmd/ananke/power_safety_test.go
 cmd/ananke/test_helpers_test.go
+internal/cluster/orchestrator_autorepair_cleanup_test.go
+internal/cluster/orchestrator_autorepair_test.go
 internal/cluster/orchestrator_inventory_test.go
 internal/cluster/orchestrator_report_test.go
 internal/cluster/orchestrator_test.go
 internal/cluster/orchestrator_unit_additional_test.go
 internal/cluster/orchestrator_vault_test.go
@@ -21,6 +23,7 @@ internal/config/load_additional_test.go
 internal/config/validate_matrix_test.go
 internal/service/daemon_additional_test.go
 internal/service/daemon_coverage_closeout_test.go
+internal/service/daemon_poststart_autorepair_test.go
 internal/service/daemon_quality_branches_test.go
 internal/service/daemon_test.go
 internal/sshutil/repair_test.go