package cluster

import (
	"context"
	"encoding/base64"
	"errors"
	"io"
	"log"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/execx"
	"scm.bstein.dev/bstein/ananke/internal/state"
)

// TestPostStartAutoHealRepairsVaultAndUnavailableNodes runs one orchestration or CLI step.
// Signature: TestPostStartAutoHealRepairsVaultAndUnavailableNodes(t *testing.T).
// Why: covers the new daemon-triggered repair path for late Vault reseals and
// stale terminating pods anchored to unavailable nodes.
func TestPostStartAutoHealRepairsVaultAndUnavailableNodes(t *testing.T) {
	// One required node label plus a 300-second grace period so both the
	// label-repair and dead-node cleanup sub-steps have work to do.
	cfg := config.Config{
		Startup: config.Startup{
			DeadNodeCleanupGraceSeconds: 300,
			RequiredNodeLabels: map[string]map[string]string{
				"titan-07": {"node-role.kubernetes.io/worker": "true"},
			},
		},
		State: config.State{
			Dir:            t.TempDir(),
			ReportsDir:     filepath.Join(t.TempDir(), "reports"),
			RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
		},
	}
	orch := &Orchestrator{
		cfg:    cfg,
		runner: &execx.Runner{},
		store:  state.New(filepath.Join(t.TempDir(), "runs.json")),
		log:    log.New(io.Discard, "", 0), // keep test output quiet
	}
	// deletionTimestamp 10 minutes in the past — older than the 300s grace
	// period, so the terminating pod on the unavailable node is eligible.
	oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
	unsealCalls := 0
	jobCreated := false
	reconciled := false
	deleted := map[string]bool{}
	// Fake command runner: matches the joined kubectl argv against known
	// fragments and returns canned output. The scenario it simulates: Vault
	// pod running but sealed, node titan-22 unavailable (Ready=Unknown), and
	// one stale terminating pod anchored to that node.
	dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
		if name != "kubectl" {
			return "", nil
		}
		joined := strings.Join(args, " ")
		switch {
		case strings.Contains(joined, "label node titan-07 --overwrite node-role.kubernetes.io/worker=true"):
			return "", nil
		case strings.Contains(joined, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
			return "Running", nil
		case strings.Contains(joined, "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"):
			// Sealed on the first status check; unsealed once an unseal
			// attempt has been recorded — models a successful recovery.
			if unsealCalls == 0 {
				return `{"initialized":true,"sealed":true}`, nil
			}
			return `{"initialized":true,"sealed":false}`, nil
		case strings.Contains(joined, "-n vault get secret vault-init -o jsonpath={.data.unseal_key_b64}"):
			return base64.StdEncoding.EncodeToString([]byte("vault-unseal-key")), nil
		case strings.Contains(joined, "vault operator unseal"):
			unsealCalls++
			return "", nil
		case strings.Contains(joined, "-n vault create job --from=cronjob/vault-k8s-auth-config"):
			jobCreated = true
			return "", nil
		case strings.Contains(joined, "get nodes -o json"):
			// titan-22 is unavailable (Ready=Unknown); titan-07 is healthy.
			return `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
		case strings.Contains(joined, "get pods -A -o json"):
			// Two terminating pods: one anchored to the unavailable node
			// (should be force-deleted) and one on titan-18 (should be left).
			return `{"items":[{"metadata":{"namespace":"maintenance","name":"stale-pod","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}},{"metadata":{"namespace":"logging","name":"healthy-node-pod","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-18"}}]}`, nil
		case strings.Contains(joined, "-n maintenance delete pod stale-pod --grace-period=0 --force --wait=false"):
			deleted["maintenance/stale-pod"] = true
			return "", nil
		case strings.Contains(joined, "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="):
			reconciled = true
			return "", nil
		case strings.Contains(joined, "-n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="):
			return "", nil
		case strings.Contains(joined, "annotate --all-namespaces helmreleases.helm.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="):
			return "", nil
		default:
			return "", nil
		}
	}
	orch.SetCommandOverrides(dispatch, dispatch)
	if err := orch.postStartAutoHeal(context.Background()); err != nil {
		t.Fatalf("postStartAutoHeal failed: %v", err)
	}
	// Exactly one unseal attempt: the stub flips to unsealed after the first.
	if unsealCalls != 1 {
		t.Fatalf("expected one Vault unseal attempt, got %d", unsealCalls)
	}
	if !jobCreated {
		t.Fatalf("expected vault k8s auth config job to be created")
	}
	if !deleted["maintenance/stale-pod"] {
		t.Fatalf("expected stale unavailable-node pod to be deleted")
	}
	if !reconciled {
		t.Fatalf("expected flux reconcile request after repairs")
	}
	// The terminating pod on the healthy node must NOT be force-deleted.
	if deleted["logging/healthy-node-pod"] {
		t.Fatalf("did not expect terminating pod on healthy node to be deleted")
	}
}

// TestPostStartAutoHealSkipsWhenClusterIsHealthy runs one orchestration or CLI step.
// Signature: TestPostStartAutoHealSkipsWhenClusterIsHealthy(t *testing.T).
// Why: proves the new post-start repair loop stays quiet when the specific
// failure patterns are absent.
func TestPostStartAutoHealSkipsWhenClusterIsHealthy(t *testing.T) {
	cfg := config.Config{
		Startup: config.Startup{
			DeadNodeCleanupGraceSeconds: 300,
		},
		State: config.State{
			Dir:            t.TempDir(),
			ReportsDir:     filepath.Join(t.TempDir(), "reports"),
			RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
		},
	}
	orch := &Orchestrator{
		cfg:    cfg,
		runner: &execx.Runner{},
		store:  state.New(filepath.Join(t.TempDir(), "runs.json")),
		log:    log.New(io.Discard, "", 0),
	}
	unsealCalls := 0
	jobCreated := false
	reconciled := false
	// Healthy-cluster stub: Vault running and unsealed, all nodes Ready,
	// no terminating pods — no repair step should fire.
	dispatch := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
		if name != "kubectl" {
			return "", nil
		}
		joined := strings.Join(args, " ")
		switch {
		case strings.Contains(joined, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
			return "Running", nil
		case strings.Contains(joined, "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"):
			return `{"initialized":true,"sealed":false}`, nil
		case strings.Contains(joined, "-n vault create job --from=cronjob/vault-k8s-auth-config"):
			jobCreated = true
			return "", nil
		case strings.Contains(joined, "vault operator unseal"):
			unsealCalls++
			return "", nil
		case strings.Contains(joined, "get nodes -o json"):
			return `{"items":[{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`, nil
		case strings.Contains(joined, "get pods -A -o json"):
			return `{"items":[]}`, nil
		case strings.Contains(joined, "reconcile.fluxcd.io/requestedAt="):
			reconciled = true
			return "", nil
		default:
			return "", nil
		}
	}
	orch.SetCommandOverrides(dispatch, dispatch)
	if err := orch.postStartAutoHeal(context.Background()); err != nil {
		t.Fatalf("postStartAutoHeal failed: %v", err)
	}
	if unsealCalls != 0 {
		t.Fatalf("did not expect Vault unseal calls, got %d", unsealCalls)
	}
	if jobCreated {
		t.Fatalf("did not expect vault auth config job creation")
	}
	if reconciled {
		t.Fatalf("did not expect flux reconcile request for healthy cluster")
	}
}

// TestRunPostStartAutoHealDryRun runs one orchestration or CLI step.
// Signature: TestRunPostStartAutoHealDryRun(t *testing.T).
// Why: covers the exported wrapper and the top-level dry-run guard so daemon
// auto-heal never mutates cluster state during rehearsal runs.
func TestRunPostStartAutoHealDryRun(t *testing.T) {
	orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
	orch.runner.DryRun = true
	if err := orch.RunPostStartAutoHeal(context.Background()); err != nil {
		t.Fatalf("RunPostStartAutoHeal dry-run failed: %v", err)
	}
}

// TestPostStartAutoHealAggregatesErrors runs one orchestration or CLI step.
// Signature: TestPostStartAutoHealAggregatesErrors(t *testing.T).
// Why: proves the daemon reports each failed sub-repair together instead of
// hiding later failures behind the first problem.
func TestPostStartAutoHealAggregatesErrors(t *testing.T) { cfg := config.Config{ Startup: config.Startup{ DeadNodeCleanupGraceSeconds: 300, RequiredNodeLabels: map[string]map[string]string{ "titan-07": {"node-role.kubernetes.io/worker": "true"}, }, }, } orch := buildOrchestratorWithStubs(t, cfg, []commandStub{ { match: matchContains("kubectl", "label node titan-07 --overwrite node-role.kubernetes.io/worker=true"), err: errors.New("label failed"), }, { match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"), err: errors.New("vault phase failed"), }, { match: matchContains("kubectl", "get nodes -o json"), err: errors.New("node query failed"), }, }) err := orch.postStartAutoHeal(context.Background()) if err == nil { t.Fatalf("expected aggregated error") } msg := err.Error() for _, want := range []string{ "required node labels:", "vault auto-recovery:", "dead-node terminating pod cleanup:", } { if !strings.Contains(msg, want) { t.Fatalf("expected %q in %q", want, msg) } } } // TestAutoRecoverSealedVaultBranches runs one orchestration or CLI step. // Signature: TestAutoRecoverSealedVaultBranches(t *testing.T). // Why: late Vault reseals are a high-risk failure path, so the daemon needs // coverage across the quiet-skip, parse-failure, and unseal-failure branches. 
func TestAutoRecoverSealedVaultBranches(t *testing.T) {
	// Dry-run mode must never mutate the cluster, so recovery is skipped.
	t.Run("dry run skips", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
		orch.runner.DryRun = true
		recovered, err := orch.autoRecoverSealedVault(context.Background())
		if err != nil || recovered {
			t.Fatalf("expected dry-run skip, got recovered=%v err=%v", recovered, err)
		}
	})
	// A "not found" phase-check failure stays quiet — presumably the
	// implementation special-cases not-found errors as "Vault not deployed"
	// (contrast with the generic error in the next subtest); confirm against
	// autoRecoverSealedVault if this assumption matters.
	t.Run("pod missing is quiet", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
				err:   errors.New("vault-0 not found"),
			},
		})
		recovered, err := orch.autoRecoverSealedVault(context.Background())
		if err != nil || recovered {
			t.Fatalf("expected quiet skip, got recovered=%v err=%v", recovered, err)
		}
	})
	// Any other phase-check failure must surface as a wrapped error.
	t.Run("phase check error surfaces", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
				err:   errors.New("phase check failed"),
			},
		})
		recovered, err := orch.autoRecoverSealedVault(context.Background())
		if recovered || err == nil || !strings.Contains(err.Error(), "vault pod phase check failed") {
			t.Fatalf("expected phase check error, got recovered=%v err=%v", recovered, err)
		}
	})
	// A pod that is not yet Running is left alone rather than unsealed.
	t.Run("non-running pod defers", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
				out:   "Pending",
			},
		})
		recovered, err := orch.autoRecoverSealedVault(context.Background())
		if err != nil || recovered {
			t.Fatalf("expected pending pod skip, got recovered=%v err=%v", recovered, err)
		}
	})
	// Unparseable `vault status` JSON must surface as a parse error.
	t.Run("status parse failure surfaces", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
				out:   "Running",
			},
			{
				match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
				out:   "garbage",
			},
		})
		recovered, err := orch.autoRecoverSealedVault(context.Background())
		if recovered || err == nil || !strings.Contains(err.Error(), "parse vault status") {
			t.Fatalf("expected parse error, got recovered=%v err=%v", recovered, err)
		}
	})
	// An already-unsealed Vault needs no action and reports no recovery.
	t.Run("already unsealed stays quiet", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
				out:   "Running",
			},
			{
				match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
				out:   `{"sealed":false}`,
			},
		})
		recovered, err := orch.autoRecoverSealedVault(context.Background())
		if err != nil || recovered {
			t.Fatalf("expected already-unsealed skip, got recovered=%v err=%v", recovered, err)
		}
	})
	// A failing `vault operator unseal` exec must surface with the attempt
	// number in the message.
	t.Run("unseal failure surfaces", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "-n vault get pod vault-0 -o jsonpath={.status.phase}"),
				out:   "Running",
			},
			{
				match: matchContains("kubectl", "VAULT_ADDR=http://127.0.0.1:8200 vault status -format=json"),
				out:   `{"sealed":true}`,
			},
			{
				match: matchContains("kubectl", "-n vault get secret vault-init -o jsonpath={.data.unseal_key_b64}"),
				out:   base64.StdEncoding.EncodeToString([]byte("vault-unseal-key")),
			},
			{
				match: matchContains("kubectl", "vault operator unseal"),
				err:   errors.New("exec boom"),
			},
		})
		recovered, err := orch.autoRecoverSealedVault(context.Background())
		if recovered || err == nil || !strings.Contains(err.Error(), "vault unseal attempt 1 failed") {
			t.Fatalf("expected unseal failure, got recovered=%v err=%v", recovered, err)
		}
	})
}

// TestRerunVaultK8sAuthConfigJobBranches runs one orchestration or CLI step.
// Signature: TestRerunVaultK8sAuthConfigJobBranches(t *testing.T).
// Why: the post-unseal auth job is part of the production recovery chain, so
// dry-run and create-error behavior both need explicit coverage.
func TestRerunVaultK8sAuthConfigJobBranches(t *testing.T) {
	// Dry-run mode must skip the kubectl create entirely.
	t.Run("dry run skips", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
		orch.runner.DryRun = true
		if err := orch.rerunVaultK8sAuthConfigJob(context.Background()); err != nil {
			t.Fatalf("dry-run rerunVaultK8sAuthConfigJob failed: %v", err)
		}
	})
	// A failing `kubectl create job` must surface, wrapped with the
	// autoheal job-name prefix (the suffix is presumably a timestamp/nonce,
	// hence the prefix-only assertion).
	t.Run("create error surfaces", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "-n vault create job --from=cronjob/vault-k8s-auth-config"),
				err:   errors.New("create failed"),
			},
		})
		err := orch.rerunVaultK8sAuthConfigJob(context.Background())
		if err == nil || !strings.Contains(err.Error(), "create job vault-k8s-auth-config-autoheal-") {
			t.Fatalf("expected create-job error, got %v", err)
		}
	})
}