package orchestrator import ( "context" "io" "log" "net/http" "net/http/httptest" "strings" "testing" "time" "scm.bstein.dev/bstein/ananke/internal/cluster" "scm.bstein.dev/bstein/ananke/internal/config" "scm.bstein.dev/bstein/ananke/internal/execx" "scm.bstein.dev/bstein/ananke/internal/state" ) // wrapperCoverageDispatcher runs one orchestration or CLI step. // Signature: wrapperCoverageDispatcher(recorder *commandRecorder) func(context.Context, time.Duration, string, ...string) (string, error). // Why: centralizes deterministic command output so hook-wrapper tests can execute all exported test hooks without live cluster access. func wrapperCoverageDispatcher(recorder *commandRecorder) func(context.Context, time.Duration, string, ...string) (string, error) { base := lifecycleDispatcher(recorder) return func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") switch { case name == "curl": recorder.record(name, args) return "200", nil case name == "ssh" && strings.Contains(command, "k3s etcd-snapshot ls"): recorder.record(name, args) return "Name Size Created Location\npre-shutdown 4.2M now \"file:///var/lib/rancher/k3s/server/db/snapshots/pre-shutdown\"\n", nil case name == "ssh" && strings.Contains(command, "k3s etcd-snapshot save"): recorder.record(name, args) return "snapshot saved", nil case name == "kubectl" && strings.Contains(command, "get ingress -A -o json"): recorder.record(name, args) return `{"items":[{"metadata":{"namespace":"monitoring"},"spec":{"rules":[{"host":"metrics.bstein.dev"}]}}]}`, nil case name == "kubectl" && strings.Contains(command, "get endpoints victoria-metrics-single-server"): recorder.record(name, args) return "10.42.0.10\n", nil case name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"): recorder.record(name, args) return `{"items":[{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1,"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"titan-23"}}}},"status":{"readyReplicas":1}},{"kind":"StatefulSet","metadata":{"namespace":"monitoring","name":"victoria-metrics-single-server"},"spec":{"replicas":1,"template":{"spec":{}}},"status":{"readyReplicas":1}},{"kind":"DaemonSet","metadata":{"namespace":"monitoring","name":"node-exporter"},"spec":{"template":{"spec":{}}},"status":{"desiredNumberScheduled":1,"numberReady":1}}]}`, nil case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"): recorder.record(name, args) return `{"items":[{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":0,"template":{"spec":{}}},"status":{"readyReplicas":0}},{"kind":"StatefulSet","metadata":{"namespace":"monitoring","name":"victoria-metrics-single-server"},"spec":{"replicas":1,"template":{"spec":{}}},"status":{"readyReplicas":1}}]}`, nil case name == "kubectl" && strings.Contains(command, "get pods -A -o json"): recorder.record(name, args) return `{"items":[{"metadata":{"namespace":"vault","name":"vault-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"StatefulSet","name":"vault"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"vault"}]},"status":{"phase":"Pending","initContainerStatuses":[{"name":"vault-agent-init","state":{"running":{"startedAt":"2020-01-01T00:00:00Z"}}}]}}]}`, nil case name == "kubectl" && strings.Contains(command, "get deployment -A -o jsonpath="): recorder.record(name, args) return "monitoring\tgrafana\t1\nflux-system\tsource-controller\t1\ngitea\tgitea\t1\n", nil case name == "kubectl" && strings.Contains(command, "get statefulset -A -o jsonpath="): recorder.record(name, args) return "monitoring\tvictoria-metrics-single-server\t1\nvault\tvault\t1\npostgres\tpostgres\t1\n", nil case name == "kubectl" && strings.Contains(command, "get nodes -o custom-columns="): recorder.record(name, args) return "titan-23 \n", nil case name == "kubectl" && strings.Contains(command, "-n flux-system get kustomizations.kustomize.toolkit.fluxcd.io -o jsonpath="): recorder.record(name, args) return "services\n", nil case name == "kubectl" && strings.Contains(command, "get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath="): recorder.record(name, args) return "monitoring/grafana\n", nil case name == "kubectl" && strings.Contains(command, "get statefulset vault -o jsonpath={.status.readyReplicas}"): recorder.record(name, args) return "1", nil case name == "kubectl" && strings.Contains(command, "jsonpath={.status.readyReplicas}"): recorder.record(name, args) return "1", nil case name == "kubectl" && strings.Contains(command, "rollout status"): recorder.record(name, args) return "rolled out", nil case name == "kubectl" && strings.Contains(command, "scale "): recorder.record(name, args) return "", nil case name == "kubectl" && strings.Contains(command, "delete pod"): recorder.record(name, args) return "", nil case name == "kubectl" && strings.Contains(command, "patch "): recorder.record(name, args) return "", nil default: return base(ctx, timeout, name, args...) } } } // TestHookLifecycleWrappersCoverage runs one orchestration or CLI step. // Signature: TestHookLifecycleWrappersCoverage(t *testing.T). // Why: executes lifecycle hook wrappers so test-hook files and their orchestration backends stay covered from the top-level testing module. func TestHookLifecycleWrappersCoverage(t *testing.T) { cfg := lifecycleConfig(t) cfg.Startup.PostStartProbes = []string{"https://metrics.bstein.dev/healthz"} cfg.Startup.PostStartProbeWaitSeconds = 1 cfg.Startup.PostStartProbePollSeconds = 1 cfg.Startup.RequireIngressChecklist = false cfg.Startup.RequireServiceChecklist = false cfg.Startup.RequireCriticalServiceEndpoints = false cfg.Startup.RequireFluxHealth = false cfg.Startup.RequireWorkloadConvergence = false cfg.Startup.ServiceChecklistStabilitySec = 1 cfg.Startup.IngressChecklistWaitSeconds = 1 cfg.Startup.IngressChecklistPollSeconds = 1 cfg.Startup.ServiceChecklistWaitSeconds = 1 cfg.Startup.ServiceChecklistPollSeconds = 1 cfg.Startup.CriticalServiceEndpoints = []string{"monitoring/victoria-metrics-single-server"} cfg.Startup.CriticalServiceEndpointWaitSec = 1 cfg.Startup.CriticalServiceEndpointPollSec = 1 svc := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusOK) _, _ = w.Write([]byte(`{"database":"ok"}`)) })) defer svc.Close() cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{ { Name: "grafana", URL: svc.URL, AcceptedStatuses: []int{200}, BodyContains: `"database":"ok"`, TimeoutSeconds: 2, }, } recorder := &commandRecorder{} dispatch := wrapperCoverageDispatcher(recorder) orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0)) orch.SetCommandOverrides(dispatch, dispatch) ctx := context.Background() if err := orch.TestHookWaitForPostStartProbes(ctx); err != nil { t.Fatalf("wait for post-start probes: %v", err) } if _, _ = orch.TestHookPostStartProbesReady(ctx); true { } if _, err := orch.TestHookHTTPProbe(ctx, "https://metrics.bstein.dev/healthz"); err != nil { t.Fatalf("http probe: %v", err) } if err := orch.TestHookResumeFluxAndReconcile(ctx); err != nil { t.Fatalf("resume flux and reconcile: %v", err) } if _, err := orch.TestHookSSHWithTimeout(ctx, "titan-db", "echo ok", time.Second); err != nil { t.Fatalf("ssh with timeout: %v", err) } if _, err := orch.TestHookRunSensitive(ctx, time.Second, "kubectl", "version", "--request-timeout=5s"); err != nil { t.Fatalf("runSensitive wrapper: %v", err) } if err := orch.TestHookWaitForStartupConvergence(ctx); err != nil { t.Fatalf("wait for startup convergence: %v", err) } if err := orch.TestHookWaitForServiceChecklist(ctx); err != nil { t.Fatalf("wait for service checklist: %v", err) } if ok, detail := orch.TestHookServiceChecklistReady(ctx); !ok || detail == "" { t.Fatalf("service checklist ready result unexpected: ok=%v detail=%q", ok, detail) } if ok, detail := orch.TestHookServiceCheckReady(ctx, cfg.Startup.ServiceChecklist[0]); !ok || detail == "" { t.Fatalf("service check ready result unexpected: ok=%v detail=%q", ok, detail) } if status, body, err := orch.TestHookHTTPChecklistProbe(ctx, cfg.Startup.ServiceChecklist[0]); err != nil || status != 200 || body == "" { t.Fatalf("http checklist probe unexpected result status=%d body=%q err=%v", status, body, err) } if err := orch.TestHookWaitForStabilityWindow(ctx); err != nil { t.Fatalf("wait for stability window: %v", err) } if err := orch.TestHookStartupStabilityHealthy(ctx); err != nil { t.Fatalf("startup stability healthy: %v", err) } cancelCtx, cancel := context.WithCancel(ctx) cancel() _ = orch.TestHookWaitForIngressChecklist(cancelCtx) if _, _ = orch.TestHookIngressChecklistReady(ctx); true { } if hosts, err := orch.TestHookDiscoverIngressHosts(ctx); err != nil || len(hosts) == 0 { t.Fatalf("discover ingress hosts failed: hosts=%v err=%v", hosts, err) } if namespaces, err := orch.TestHookDiscoverIngressNamespacesForHost(ctx, "metrics.bstein.dev"); err != nil || len(namespaces) == 0 { t.Fatalf("discover ingress namespaces failed: ns=%v err=%v", namespaces, err) } last := time.Time{} orch.TestHookMaybeAutoHealIngressHostBackends(ctx, &last, "metrics.bstein.dev: status=503") if got := orch.TestHookChecklistFailureHost("metrics.bstein.dev: status=503"); got != "metrics.bstein.dev" { t.Fatalf("unexpected checklistFailureHost parse: %q", got) } if got := cluster.TestHookHostFromURL("https://metrics.bstein.dev/api/health"); got != "metrics.bstein.dev" { t.Fatalf("unexpected host parse: %q", got) } } // TestHookOpsAndWorkloadWrappersCoverage runs one orchestration or CLI step. // Signature: TestHookOpsAndWorkloadWrappersCoverage(t *testing.T). // Why: executes ops/workload wrappers so all exposed hook files remain covered and regression-safe. func TestHookOpsAndWorkloadWrappersCoverage(t *testing.T) { cfg := lifecycleConfig(t) cfg.Startup.WorkloadConvergenceWaitSeconds = 1 cfg.Startup.WorkloadConvergencePollSeconds = 1 cfg.Startup.CriticalServiceEndpointWaitSec = 1 cfg.Startup.CriticalServiceEndpointPollSec = 1 cfg.Startup.AutoRecycleStuckPods = true cfg.Startup.StuckPodGraceSeconds = 1 svc := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusOK) _, _ = w.Write([]byte(`{"status":"ok"}`)) })) defer svc.Close() cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{ {Name: "grafana", URL: svc.URL, AcceptedStatuses: []int{200}, BodyContains: `"status":"ok"`, TimeoutSeconds: 2}, } cfg.Startup.ServiceChecklistWaitSeconds = 1 cfg.Startup.ServiceChecklistPollSeconds = 1 recorder := &commandRecorder{} dispatch := wrapperCoverageDispatcher(recorder) orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0)) orch.SetCommandOverrides(dispatch, dispatch) ctx := context.Background() if err := orch.TestHookDrainWorkers(ctx, []string{"titan-23"}); err != nil { t.Fatalf("drain workers: %v", err) } if err := orch.TestHookUncordonWorkers(ctx, []string{"titan-23"}); err != nil { t.Fatalf("uncordon workers: %v", err) } if err := orch.TestHookTakeEtcdSnapshot(ctx, "titan-db"); err != nil { t.Fatalf("take etcd snapshot: %v", err) } if snapshot, err := orch.TestHookLatestEtcdSnapshotPath(ctx, "titan-db"); err != nil || snapshot == "" { t.Fatalf("latest snapshot path failed: snapshot=%q err=%v", snapshot, err) } if workers, err := orch.TestHookEffectiveWorkers(ctx); err != nil || len(workers) == 0 { t.Fatalf("effective workers failed: workers=%v err=%v", workers, err) } if workers, err := orch.TestHookDiscoverWorkers(ctx); err != nil || len(workers) == 0 { t.Fatalf("discover workers failed: workers=%v err=%v", workers, err) } if err := orch.TestHookPatchFluxSuspendAll(ctx, true); err != nil { t.Fatalf("patch flux suspend: %v", err) } if entries, err := orch.TestHookListScalableWorkloads(ctx); err != nil { t.Fatalf("list scalable workloads: %v", err) } else { if err := orch.TestHookScaleWorkloads(ctx, entries, 0, 1); err != nil { t.Fatalf("scale workloads forced: %v", err) } if err := orch.TestHookWriteScaledWorkloadSnapshot(entries); err != nil { t.Fatalf("write scaled workload snapshot: %v", err) } if _, err := orch.TestHookReadScaledWorkloadSnapshot(); err != nil { t.Fatalf("read scaled workload snapshot: %v", err) } } if err := orch.TestHookScaleDownApps(ctx); err != nil { t.Fatalf("scale down apps: %v", err) } if err := orch.TestHookRestoreScaledApps(ctx); err != nil { t.Fatalf("restore scaled apps: %v", err) } if err := orch.TestHookWaitForCriticalServiceEndpoints(ctx); err != nil { t.Fatalf("wait for critical service endpoints: %v", err) } if ok, _, _, _, err := orch.TestHookCriticalServiceEndpointsReady(ctx); err != nil || !ok { t.Fatalf("critical endpoints ready failed: ok=%v err=%v", ok, err) } if err := orch.TestHookWaitForWorkloadConvergence(ctx); err != nil { t.Fatalf("wait for workload convergence: %v", err) } if ok, detail, err := orch.TestHookWorkloadConvergenceReady(ctx); err != nil || !ok || detail == "" { t.Fatalf("workload convergence ready failed: ok=%v detail=%q err=%v", ok, detail, err) } if err := orch.TestHookRecycleStuckControllerPods(ctx); err != nil { t.Fatalf("recycle stuck controller pods: %v", err) } lastRecycle := time.Time{} orch.TestHookMaybeAutoRecycleStuckPods(ctx, &lastRecycle) lastHeal := time.Time{} orch.TestHookMaybeAutoHealCriticalWorkloadReplicas(ctx, &lastHeal) if _, err := orch.TestHookHealCriticalWorkloadReplicas(ctx); err != nil { t.Fatalf("heal critical workload replicas: %v", err) } if _, err := orch.TestHookStartupFailurePods(ctx); err != nil { t.Fatalf("startup failure pods: %v", err) } if err := orch.TestHookEnsureCriticalStartupWorkloads(ctx); err != nil { t.Fatalf("ensure critical startup workloads: %v", err) } if _, err := orch.TestHookMissingCriticalStartupWorkloads(ctx); err != nil { t.Fatalf("missing critical startup workloads: %v", err) } if err := orch.TestHookWaitForServiceChecklistAlias(ctx); err != nil { t.Fatalf("wait for service checklist alias: %v", err) } }