package orchestrator

import (
	"context"
	"errors"
	"io"
	"log"
	"os"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/execx"
	"scm.bstein.dev/bstein/ananke/internal/state"
)

// TestHookGapMatrixPart2TimesyncAndStability runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart2TimesyncAndStability(t *testing.T).
// Why: drives low-coverage time-sync, datastore parsing, and startup stability
// branches from the top-level testing module.
func TestHookGapMatrixPart2TimesyncAndStability(t *testing.T) {
	// Table-driven coverage of the ExecStart --datastore-endpoint parser,
	// including quoted, unquoted, continuation-suffixed, and malformed forms.
	t.Run("parse-datastore-endpoint-matrix", func(t *testing.T) {
		cases := []struct {
			line string
			want string
		}{
			{"ExecStart=/usr/local/bin/k3s server", ""},
			{"ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://db:5432/k3s", "postgres://db:5432/k3s"},
			{"ExecStart=/usr/local/bin/k3s server --datastore-endpoint='postgres://db:5432/k3s' \\", "postgres://db:5432/k3s"},
			// NOTE(review): the space-separated "--datastore-endpoint = ..." form is
			// expected to yield the literal token "=" — presumably pinning the
			// parser's current (token-wise) behavior rather than an ideal one; confirm.
			{"ExecStart=/usr/local/bin/k3s server --datastore-endpoint = \"postgres://db:5432/k3s\" \\", "="},
			{"X --datastore-endpoint= \"postgres://db:5432/k3s\" ", "postgres://db:5432/k3s"},
		}
		for _, tc := range cases {
			got := cluster.TestHookParseDatastoreEndpoint(tc.line)
			if got != tc.want {
				t.Fatalf("parseDatastoreEndpoint(%q)=%q want %q", tc.line, got, tc.want)
			}
		}
	})
	// Strict mode with every node reporting "not synchronized" must time out
	// with a "time sync not ready" error.
	t.Run("wait-for-time-sync-strict-timeout", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.TimeSyncMode = "strict"
		cfg.Startup.TimeSyncWaitSeconds = 1
		cfg.Startup.TimeSyncPollSeconds = 1
		// Fake command runner: local (sh) and remote (ssh) timedatectl probes
		// both answer "no"; everything else falls through to the recorder dispatcher.
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "sh" && strings.Contains(command, "timedatectl show -p NTPSynchronized --value") {
				return "no", nil
			}
			if name == "ssh" && strings.Contains(command, "timedatectl show -p NTPSynchronized --value") {
				return "no", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		err := orch.TestHookWaitForTimeSync(context.Background(), []string{"titan-db"})
		if err == nil || !strings.Contains(err.Error(), "time sync not ready") {
			t.Fatalf("expected strict time-sync timeout branch, got %v", err)
		}
	})
	// Quorum mode with quorum=1 must succeed when only one of two control
	// planes reports synchronized time.
	t.Run("wait-for-time-sync-quorum-success", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.ControlPlanes = []string{"titan-db", "titan-23"}
		cfg.Startup.TimeSyncMode = "quorum"
		cfg.Startup.TimeSyncQuorum = 1
		cfg.Startup.TimeSyncWaitSeconds = 2
		cfg.Startup.TimeSyncPollSeconds = 1
		// Fake runner: local host and titan-db are synced, titan-23 is not.
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "sh" && strings.Contains(command, "timedatectl show -p NTPSynchronized --value"):
				return "yes", nil
			case name == "ssh" && strings.Contains(command, "titan-db") && strings.Contains(command, "timedatectl show -p NTPSynchronized --value"):
				return "yes", nil
			case name == "ssh" && strings.Contains(command, "titan-23") && strings.Contains(command, "timedatectl show -p NTPSynchronized --value"):
				return "no", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		if err := orch.TestHookWaitForTimeSync(context.Background(), []string{"titan-db", "titan-23"}); err != nil {
			t.Fatalf("expected quorum time-sync success, got %v", err)
		}
	})
	// Exercises each startup-stability gate in isolation: crashlooping pods,
	// flux not ready, workloads not converged, and external service checklist.
	t.Run("startup-stability-failure-matrix", func(t *testing.T) {
		// Base config disables every optional gate; each sub-case re-enables one.
		baseCfg := lifecycleConfig(t)
		baseCfg.Startup.RequireIngressChecklist = false
		baseCfg.Startup.RequireServiceChecklist = false
		baseCfg.Startup.RequireWorkloadConvergence = false
		baseCfg.Startup.RequireFluxHealth = false
		// Fake runner: pod listing reports one CrashLoopBackOff pod.
		runPodsCrash := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get pods -A -o json") {
				return `{"items":[{"metadata":{"namespace":"default","name":"bad-pod"},"status":{"containerStatuses":[{"state":{"waiting":{"reason":"CrashLoopBackOff"}}}]}}]}`, nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchCrash, _ := newHookOrchestrator(t, baseCfg, runPodsCrash, runPodsCrash)
		if err := orchCrash.TestHookStartupStabilityHealthy(context.Background()); err == nil || !strings.Contains(err.Error(), "pods in crash/image-pull failures") {
			t.Fatalf("expected crashloop stability failure, got %v", err)
		}
		// Flux gate: kustomization Ready=False must fail even with clean pods.
		cfgFlux := baseCfg
		cfgFlux.Startup.RequireFluxHealth = true
		runFlux := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json") {
				return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"False","message":"syncing"}]}}]}`, nil
			}
			if name == "kubectl" && strings.Contains(command, "get pods -A -o json") {
				return `{"items":[]}`, nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchFlux, _ := newHookOrchestrator(t, cfgFlux, runFlux, runFlux)
		if err := orchFlux.TestHookStartupStabilityHealthy(context.Background()); err == nil || !strings.Contains(err.Error(), "flux not ready") {
			t.Fatalf("expected flux-not-ready stability failure, got %v", err)
		}
		// Workload-convergence gate: a Deployment with 0/1 ready replicas fails.
		cfgWork := baseCfg
		cfgWork.Startup.RequireWorkloadConvergence = true
		runWork := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"):
				return `{"items":[{"kind":"Deployment","metadata":{"namespace":"default","name":"app"},"spec":{"replicas":1},"status":{"readyReplicas":0}}]}`, nil
			case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
				return `{"items":[]}`, nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchWork, _ := newHookOrchestrator(t, cfgWork, runWork, runWork)
		if err := orchWork.TestHookStartupStabilityHealthy(context.Background()); err == nil || !strings.Contains(err.Error(), "workloads not converged") {
			t.Fatalf("expected workload convergence stability failure, got %v", err)
		}
		// Service-checklist gate: port 1 on loopback is effectively unreachable,
		// so the "api" check must fail the external-services gate.
		cfgService := baseCfg
		cfgService.Startup.RequireServiceChecklist = true
		cfgService.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
			{Name: "api", URL: "http://127.0.0.1:1/health", AcceptedStatuses: []int{200}, TimeoutSeconds: 1},
		}
		runService := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get pods -A -o json") {
				return `{"items":[]}`, nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchService, _ := newHookOrchestrator(t, cfgService, runService, runService)
		if err := orchService.TestHookStartupStabilityHealthy(context.Background()); err == nil || !strings.Contains(err.Error(), "external services not healthy") {
			t.Fatalf("expected service checklist stability failure, got %v", err)
		}
	})
}

// TestHookGapMatrixPart2FluxScalingReport runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart2FluxScalingReport(t *testing.T).
// Why: targets low branch density in flux-health, scaling snapshot handling,
// and report sanitization helpers.
func TestHookGapMatrixPart2FluxScalingReport(t *testing.T) {
	// Pure-helper matchers first, then a full heal/adaptive-wait pass against
	// a fake kubectl that serves a failed flux-managed job and a Ready kustomization.
	t.Run("flux-helper-matrix", func(t *testing.T) {
		if !cluster.TestHookLooksLikeImmutableJobError("Job update failed: FIELD IS IMMUTABLE") {
			t.Fatalf("expected immutable matcher true for uppercase+job variant")
		}
		if cluster.TestHookLooksLikeImmutableJobError("totally unrelated error") {
			t.Fatalf("expected immutable matcher false")
		}
		if !cluster.TestHookJobLooksFluxManaged("flux-system", "job-a", map[string]string{"kustomize.toolkit.fluxcd.io/name": "services"}, nil) {
			t.Fatalf("expected flux-managed job by kustomize label")
		}
		if cluster.TestHookJobFailed(1, 0, []string{"Failed"}, []string{"True"}) != true {
			t.Fatalf("expected Failed=True to mark job failed")
		}
		cfg := lifecycleConfig(t)
		// Fake runner: job listing returns one failed flux-managed job, its
		// deletion succeeds, and kustomizations report Ready=True.
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "get jobs -A -o json"):
				return `{"items":[{"metadata":{"namespace":"flux-system","name":"job-a","labels":{"kustomize.toolkit.fluxcd.io/name":"services"}},"status":{"failed":1,"conditions":[{"type":"Failed","status":"True"}]}}]}`, nil
			case name == "kubectl" && strings.Contains(command, "delete job -n flux-system job-a"):
				return "", nil
			case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
				return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"True","message":"ok"}]}}]}`, nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		healed, err := orch.TestHookHealImmutableFluxJobs(context.Background())
		if err != nil || !healed {
			t.Fatalf("expected immutable flux job heal success, healed=%t err=%v", healed, err)
		}
		if _, _, err := orch.TestHookAdaptiveFluxHealthWait(context.Background(), 2*time.Second); err != nil {
			t.Fatalf("expected adaptive flux wait success, got %v", err)
		}
	})
	// Restore with no snapshot must be a no-op success; writing a snapshot when
	// the state "dir" path is actually a file must fail on ensure-state-dir.
	t.Run("scaling-snapshot-branch-matrix", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		if err := orch.TestHookRestoreScaledApps(context.Background()); err != nil {
			t.Fatalf("expected empty snapshot restore success, got %v", err)
		}
		stateFile := filepath.Join(t.TempDir(), "state-file")
		if err := os.WriteFile(stateFile, []byte("x"), 0o644); err != nil {
			t.Fatalf("write state-file: %v", err)
		}
		// Point State.Dir at a regular file so MkdirAll inside the snapshot
		// writer fails; built directly via cluster.New to control the runner/logger.
		cfgWriteErr := lifecycleConfig(t)
		cfgWriteErr.State.Dir = stateFile
		orchWriteErr := cluster.New(cfgWriteErr, &execx.Runner{DryRun: false}, state.New(cfgWriteErr.State.RunHistoryPath), log.New(io.Discard, "", 0))
		dispatch := lifecycleDispatcher(&commandRecorder{})
		orchWriteErr.SetCommandOverrides(dispatch, dispatch)
		if err := orchWriteErr.TestHookWriteScaledWorkloadSnapshot(nil); err == nil || !strings.Contains(err.Error(), "ensure state dir") {
			t.Fatalf("expected scaled snapshot mkdir failure, got %v", err)
		}
	})
	// Filename sanitization must strip spaces/slashes; checklist-failure parsing
	// must pull the leading host out of a "host: detail" error string.
	t.Run("report-sanitize-and-checklist-host-parsers", func(t *testing.T) {
		got := cluster.TestHookSanitizeReportFileName(" Startup / Drill : Night#2 ")
		if got == "" || strings.Contains(got, " ") || strings.Contains(got, "/") {
			t.Fatalf("unexpected sanitized report filename: %q", got)
		}
		cfg := lifecycleConfig(t)
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		host := orch.TestHookChecklistFailureHost("metrics.bstein.dev: GET https://metrics.bstein.dev/: EOF")
		if host != "metrics.bstein.dev" {
			t.Fatalf("expected checklist failure host extraction, got %q", host)
		}
	})
}

// TestHookGapMatrixPart2VaultAndCoordination runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart2VaultAndCoordination(t *testing.T).
// Why: raises branch coverage on vault/key and coordination helpers without
// requiring package-local tests.
func TestHookGapMatrixPart2VaultAndCoordination(t *testing.T) {
	// Empty unseal-key path must error; not-found matcher sanity checks; and a
	// Pending vault-0 pod must block the unseal flow at the phase gate.
	t.Run("vault-unseal-and-file-branches", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.VaultUnsealKeyFile = ""
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		if err := orch.TestHookWriteVaultUnsealKeyFile("abc"); err == nil || !strings.Contains(err.Error(), "path is empty") {
			t.Fatalf("expected empty vault key path error")
		}
		if cluster.TestHookIsNotFoundErr("") {
			t.Fatalf("expected nil/notfound helper false on empty input")
		}
		if !cluster.TestHookIsNotFoundErr("resource not found") {
			t.Fatalf("expected notfound helper true for notfound text")
		}
		// Fake runner: vault-0 pod phase probe reports Pending.
		runPhase := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") {
				return "Pending", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchPhase, _ := newHookOrchestrator(t, lifecycleConfig(t), runPhase, runPhase)
		if err := orchPhase.TestHookEnsureVaultUnsealed(context.Background()); err == nil || !strings.Contains(err.Error(), "pod phase") {
			t.Fatalf("expected vault phase gate error, got %v", err)
		}
	})
	// Peer normalization (trim, dedupe, drop blanks) and etcd snapshot
	// verification surfacing a remote stat failure.
	t.Run("coordination-peers-and-snapshot-stat-error", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Coordination.PeerHosts = []string{" titan-24 ", "titan-24", " ", "titan-jh"}
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		peers := orch.TestHookCoordinationPeers()
		if len(peers) != 2 || peers[0] != "titan-24" || peers[1] != "titan-jh" {
			t.Fatalf("unexpected normalized peers: %v", peers)
		}
		// Fake runner: the remote size probe (stat -c %s) fails outright.
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(command, "stat -c %s") {
				return "", errors.New("stat failed")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchErr, _ := newHookOrchestrator(t, lifecycleConfig(t), run, run)
		if err := orchErr.TestHookVerifyEtcdSnapshot(context.Background(), "titan-db", "/snap/path"); err == nil || !strings.Contains(err.Error(), "stat failed") {
			t.Fatalf("expected snapshot stat error branch, got %v", err)
		}
	})
}

// TestHookGapMatrixPart2WorkloadIgnore runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart2WorkloadIgnore(t *testing.T).
// Why: expands low branch coverage in workload ignore helpers and startup-failure
// pod classification.
func TestHookGapMatrixPart2WorkloadIgnore(t *testing.T) { t.Run("ignored-node-helper-matrix", func(t *testing.T) { if !cluster.TestHookWorkloadTargetsIgnoredNodes("titan-22", nil, []string{"titan-22"}) { t.Fatalf("expected selector-host ignored match") } if cluster.TestHookWorkloadTargetsIgnoredNodes("titan-23", []string{"titan-24"}, []string{"titan-22"}) { t.Fatalf("expected workload targets ignored false when no ignored host targeted") } if !cluster.TestHookWorkloadTargetsIgnoredNodes("", []string{"titan-22"}, []string{"titan-22"}) { t.Fatalf("expected affinity host ignored match") } if !cluster.TestHookPodTargetsIgnoredNode("titan-22", []string{"titan-22"}) { t.Fatalf("expected pod ignored-node match") } if cluster.TestHookPodTargetsIgnoredNode("titan-23", []string{"titan-22"}) { t.Fatalf("expected pod ignored-node mismatch") } }) t.Run("startup-failure-pods-decode-error-and-success", func(t *testing.T) { cfg := lifecycleConfig(t) runBad := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") if name == "kubectl" && strings.Contains(command, "get pods -A -o json") { return "{bad json", nil } return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) } orchBad, _ := newHookOrchestrator(t, cfg, runBad, runBad) if _, err := orchBad.TestHookStartupFailurePods(context.Background()); err == nil { t.Fatalf("expected startupFailurePods decode error") } runOK := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") if name == "kubectl" && strings.Contains(command, "get pods -A -o json") { return `{"items":[{"metadata":{"namespace":"default","name":"ok-pod"},"status":{"containerStatuses":[{"state":{"running":{}}}]}}]}`, nil } return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
} orchOK, _ := newHookOrchestrator(t, cfg, runOK, runOK) failures, err := orchOK.TestHookStartupFailurePods(context.Background()) if err != nil || len(failures) != 0 { t.Fatalf("expected no startup failures, failures=%v err=%v", failures, err) } }) t.Run("stuck-vault-init-reason-matrix", func(t *testing.T) { if got := cluster.TestHookStuckVaultInitReason("Running", true, 0, 10*time.Second); got != "" { t.Fatalf("expected no stuck init reason without running init, got %q", got) } if got := cluster.TestHookStuckVaultInitReason("Pending", true, 30*time.Second, 10*time.Second); !strings.Contains(got, "VaultInitStuck") { t.Fatalf("expected stuck vault init reason, got %q", got) } }) }