package cluster

import (
	"context"
	"errors"
	"io"
	"log"
	"os"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/execx"
	"scm.bstein.dev/bstein/ananke/internal/state"
)

// installFluxFakeBinaries writes executable "kubectl" and "flux" shell scripts
// into dir and puts dir first on PATH for the remainder of the test.
// Signature: installFluxFakeBinaries(t *testing.T, dir, kubectlScript, fluxScript string).
// Why: the real-runner Flux subtests share this setup; keeping it in one place
// lets each subtest show only the script behavior it cares about.
func installFluxFakeBinaries(t *testing.T, dir, kubectlScript, fluxScript string) {
	t.Helper()
	if err := os.WriteFile(filepath.Join(dir, "kubectl"), []byte(kubectlScript), 0o755); err != nil {
		t.Fatalf("write fake kubectl: %v", err)
	}
	if err := os.WriteFile(filepath.Join(dir, "flux"), []byte(fluxScript), 0o755); err != nil {
		t.Fatalf("write fake flux: %v", err)
	}
	// t.Setenv restores the previous PATH automatically when the test ends.
	t.Setenv("PATH", dir+string(os.PathListSeparator)+os.Getenv("PATH"))
}

// newFluxExecOrchestrator builds an Orchestrator around a zero-value
// execx.Runner with throwaway state paths and a discarded log.
// Signature: newFluxExecOrchestrator(t *testing.T) *Orchestrator.
// Why: subtests that drive fake binaries through PATH cannot use the stubbed
// orchestrator builder, so they need an unstubbed runner instead.
func newFluxExecOrchestrator(t *testing.T) *Orchestrator {
	t.Helper()
	cfg := config.Config{
		State: config.State{
			Dir:            t.TempDir(),
			ReportsDir:     filepath.Join(t.TempDir(), "reports"),
			RunHistoryPath: filepath.Join(t.TempDir(), "runs.json"),
		},
	}
	return &Orchestrator{
		cfg:    cfg,
		runner: &execx.Runner{},
		store:  state.New(cfg.State.RunHistoryPath),
		log:    log.New(io.Discard, "", 0),
	}
}

// TestCleanupTerminatingPodsOnUnavailableNodesBranches covers the dry-run,
// selective-delete, query/decode-error, and hard-delete-error branches of
// cleanupTerminatingPodsOnUnavailableNodes.
// Signature: TestCleanupTerminatingPodsOnUnavailableNodesBranches(t *testing.T).
// Why: cleanup on dead nodes must be selective so Ananke only force-deletes the
// truly stranded pods and tolerates already-gone objects.
func TestCleanupTerminatingPodsOnUnavailableNodesBranches(t *testing.T) {
	t.Run("dry run skips", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
		orch.runner.DryRun = true

		count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
		if err != nil || count != 0 {
			t.Fatalf("expected dry-run skip, got count=%d err=%v", count, err)
		}
	})

	t.Run("selective cleanup tolerates not found", func(t *testing.T) {
		// With a 300s grace, a 10-minute-old deletion is past the window and a
		// 2-minute-old deletion is still inside it.
		oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
		recentDelete := time.Now().Add(-2 * time.Minute).UTC().Format(time.RFC3339)

		// Four pods: only old-stale sits on the not-Ready node (titan-22) with a
		// deletion timestamp older than the grace period. The others are either
		// too recent, on a node absent from the unavailable list, or not
		// terminating at all.
		podsJSON := `{"items":[` +
			`{"metadata":{"namespace":"maintenance","name":"old-stale","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}},` +
			`{"metadata":{"namespace":"maintenance","name":"fresh-stale","deletionTimestamp":"` + recentDelete + `"},"spec":{"nodeName":"titan-22"}},` +
			`{"metadata":{"namespace":"logging","name":"healthy-node","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-18"}},` +
			`{"metadata":{"namespace":"logging","name":"no-delete"},"spec":{"nodeName":"titan-22"}}]}`

		orch := buildOrchestratorWithStubs(t, config.Config{
			Startup: config.Startup{DeadNodeCleanupGraceSeconds: 300},
		}, []commandStub{
			{
				match: matchContains("kubectl", "get nodes -o json"),
				out:   `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`,
			},
			{
				match: matchContains("kubectl", "get pods -A -o json"),
				out:   podsJSON,
			},
			{
				// The delete reports "not found"; the test expects that to be
				// tolerated rather than surfaced as an error.
				match: matchContains("kubectl", "-n maintenance delete pod old-stale --grace-period=0 --force --wait=false"),
				err:   errors.New("pod old-stale not found"),
			},
		})

		count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
		if err != nil {
			t.Fatalf("cleanupTerminatingPodsOnUnavailableNodes failed: %v", err)
		}
		if count != 1 {
			t.Fatalf("expected one cleaned pod, got %d", count)
		}
	})

	t.Run("query and decode errors surface", func(t *testing.T) {
		queryErrOrch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "get nodes -o json"),
				err:   errors.New("nodes failed"),
			},
		})
		if _, err := queryErrOrch.cleanupTerminatingPodsOnUnavailableNodes(context.Background()); err == nil || !strings.Contains(err.Error(), "query nodes") {
			t.Fatalf("expected node query error, got %v", err)
		}

		// The node query succeeds here so the call reaches the pod listing,
		// whose payload is deliberately malformed.
		decodeErrOrch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "get nodes -o json"),
				out:   `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"Unknown"}]}}]}`,
			},
			{
				match: matchContains("kubectl", "get pods -A -o json"),
				out:   `{bad json`,
			},
		})
		if _, err := decodeErrOrch.cleanupTerminatingPodsOnUnavailableNodes(context.Background()); err == nil || !strings.Contains(err.Error(), "decode pods") {
			t.Fatalf("expected pod decode error, got %v", err)
		}
	})

	t.Run("delete hard error surfaces", func(t *testing.T) {
		oldDelete := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
		orch := buildOrchestratorWithStubs(t, config.Config{
			Startup: config.Startup{DeadNodeCleanupGraceSeconds: 300},
		}, []commandStub{
			{
				match: matchContains("kubectl", "get nodes -o json"),
				out:   `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"False"}]}}]}`,
			},
			{
				match: matchContains("kubectl", "get pods -A -o json"),
				out:   `{"items":[{"metadata":{"namespace":"maintenance","name":"old-stale","deletionTimestamp":"` + oldDelete + `"},"spec":{"nodeName":"titan-22"}}]}`,
			},
			{
				// Unlike the "not found" case, a generic delete failure must
				// come back to the caller.
				match: matchContains("kubectl", "-n maintenance delete pod old-stale --grace-period=0 --force --wait=false"),
				err:   errors.New("delete failed"),
			},
		})

		count, err := orch.cleanupTerminatingPodsOnUnavailableNodes(context.Background())
		if count != 0 || err == nil || !strings.Contains(err.Error(), "delete pod maintenance/old-stale") {
			t.Fatalf("expected delete failure, got count=%d err=%v", count, err)
		}
	})
}

// TestUnavailableNodeSetBranches covers the decode-error and
// missing-Ready-condition branches of unavailableNodeSet.
// Signature: TestUnavailableNodeSetBranches(t *testing.T).
// Why: node Ready parsing drives dead-node cleanup, so malformed and missing
// Ready condition payloads need direct coverage too.
func TestUnavailableNodeSetBranches(t *testing.T) {
	t.Run("decode error surfaces", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{match: matchContains("kubectl", "get nodes -o json"), out: `{bad json`},
		})
		if _, err := orch.unavailableNodeSet(context.Background()); err == nil || !strings.Contains(err.Error(), "decode nodes") {
			t.Fatalf("expected decode error, got %v", err)
		}
	})

	t.Run("missing ready condition counts as unavailable", func(t *testing.T) {
		// titan-22 reports only MemoryPressure (no Ready condition at all);
		// titan-07 reports Ready=True.
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "get nodes -o json"),
				out:   `{"items":[{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"MemoryPressure","status":"False"}]}},{"metadata":{"name":"titan-07"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`,
			},
		})

		nodes, err := orch.unavailableNodeSet(context.Background())
		if err != nil {
			t.Fatalf("unavailableNodeSet failed: %v", err)
		}
		if _, ok := nodes["titan-22"]; !ok {
			t.Fatalf("expected titan-22 to be treated as unavailable")
		}
		if _, ok := nodes["titan-07"]; ok {
			t.Fatalf("did not expect titan-07 to be treated as unavailable")
		}
	})
}

// TestRequestFluxReconcileBranches covers requestFluxReconcile in dry-run
// mode, when each annotate call fails, and when the helm annotate or the flux
// CLI itself fails.
// Signature: TestRequestFluxReconcileBranches(t *testing.T).
// Why: the post-start repair loop needs predictable Flux refresh behavior even
// when one annotation call is flaky.
func TestRequestFluxReconcileBranches(t *testing.T) {
	t.Run("dry run skips", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, nil)
		orch.runner.DryRun = true
		if err := orch.requestFluxReconcile(context.Background()); err != nil {
			t.Fatalf("dry-run requestFluxReconcile failed: %v", err)
		}
	})

	t.Run("git source annotate error surfaces", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				match: matchContains("kubectl", "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="),
				err:   errors.New("annotate failed"),
			},
		})
		if err := orch.requestFluxReconcile(context.Background()); err == nil || !strings.Contains(err.Error(), "annotate flux source reconcile") {
			t.Fatalf("expected gitrepository annotate error, got %v", err)
		}
	})

	t.Run("kustomization annotate error surfaces", func(t *testing.T) {
		orch := buildOrchestratorWithStubs(t, config.Config{}, []commandStub{
			{
				// The git source annotate succeeds so the kustomization
				// annotate is reached.
				match: matchContains("kubectl", "-n flux-system annotate gitrepository flux-system reconcile.fluxcd.io/requestedAt="),
				out:   "",
			},
			{
				match: matchContains("kubectl", "-n flux-system annotate kustomizations.kustomize.toolkit.fluxcd.io --all reconcile.fluxcd.io/requestedAt="),
				err:   errors.New("annotate failed"),
			},
		})
		if err := orch.requestFluxReconcile(context.Background()); err == nil || !strings.Contains(err.Error(), "annotate flux kustomizations reconcile") {
			t.Fatalf("expected kustomization annotate error, got %v", err)
		}
	})

	t.Run("helm annotate warning and flux command path", func(t *testing.T) {
		tmpDir := t.TempDir()
		callLog := filepath.Join(tmpDir, "calls.log")

		// The fake kubectl records its arguments, then exits 1 for any call
		// that mentions helmreleases and 0 for everything else.
		kubectlScript := "#!/bin/sh\n" +
			"printf '%s\\n' \"$*\" >> \"" + callLog + "\"\n" +
			"case \"$*\" in\n" +
			" *helmreleases.helm.toolkit.fluxcd.io*) echo helm annotate failed >&2; exit 1 ;;\n" +
			"esac\n" +
			"exit 0\n"
		// The fake flux records its arguments with a "flux " prefix and succeeds.
		fluxScript := "#!/bin/sh\n" +
			"printf 'flux %s\\n' \"$*\" >> \"" + callLog + "\"\n" +
			"exit 0\n"
		installFluxFakeBinaries(t, tmpDir, kubectlScript, fluxScript)

		orch := newFluxExecOrchestrator(t)
		if err := orch.requestFluxReconcile(context.Background()); err != nil {
			t.Fatalf("requestFluxReconcile with fake binaries failed: %v", err)
		}

		calls, err := os.ReadFile(callLog)
		if err != nil {
			t.Fatalf("read fake command log: %v", err)
		}
		logText := string(calls)
		if !strings.Contains(logText, "annotate gitrepository flux-system") {
			t.Fatalf("expected gitrepository annotate call, got %q", logText)
		}
		if !strings.Contains(logText, "annotate kustomizations.kustomize.toolkit.fluxcd.io --all") {
			t.Fatalf("expected kustomization annotate call, got %q", logText)
		}
		if !strings.Contains(logText, "flux reconcile source git flux-system -n flux-system --timeout=60s") {
			t.Fatalf("expected flux reconcile command, got %q", logText)
		}
	})

	t.Run("flux command failure is tolerated", func(t *testing.T) {
		tmpDir := t.TempDir()
		callLog := filepath.Join(tmpDir, "calls.log")

		// Every kubectl call succeeds; the flux binary logs with a
		// "flux-fail " prefix and always exits 1.
		kubectlScript := "#!/bin/sh\n" +
			"printf '%s\\n' \"$*\" >> \"" + callLog + "\"\n" +
			"exit 0\n"
		fluxScript := "#!/bin/sh\n" +
			"printf 'flux-fail %s\\n' \"$*\" >> \"" + callLog + "\"\n" +
			"exit 1\n"
		installFluxFakeBinaries(t, tmpDir, kubectlScript, fluxScript)

		orch := newFluxExecOrchestrator(t)
		if err := orch.requestFluxReconcile(context.Background()); err != nil {
			t.Fatalf("requestFluxReconcile should tolerate flux failure, got %v", err)
		}

		calls, err := os.ReadFile(callLog)
		if err != nil {
			t.Fatalf("read fake command log: %v", err)
		}
		if !strings.Contains(string(calls), "flux-fail reconcile source git flux-system -n flux-system --timeout=60s") {
			t.Fatalf("expected failing flux command to be attempted, got %q", string(calls))
		}
	})
}