package orchestrator

import (
	"context"
	"errors"
	"io"
	"log"
	"net/http"
	"net/http/httptest"
	"os"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/execx"
	"scm.bstein.dev/bstein/ananke/internal/state"
)

// TestHookGapMatrixPart4CoordinationAndReachability runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart4CoordinationAndReachability(t *testing.T).
// Why: closes remaining coordination/reachability low branches with deterministic
// command responses and short timeouts.
func TestHookGapMatrixPart4CoordinationAndReachability(t *testing.T) {
	t.Run("peer-shutdown-complete-cooldown-blocks-startup", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Coordination.PeerHosts = []string{"titan-24"}
		cfg.SSHManagedNodes = append(cfg.SSHManagedNodes, "titan-24")
		cfg.SSHNodeHosts["titan-24"] = "titan-24"
		base := lifecycleDispatcher(&commandRecorder{})
		now := time.Now().UTC().Format(time.RFC3339)
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(command, "ananke intent --config /etc/ananke/ananke.yaml") {
				return "__ANANKE_BOOTSTRAP_IDLE__\nintent=shutdown_complete reason=\"recent\" source=peer updated_at=" + now + "\n", nil
			}
			return base(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		err := orch.TestHookGuardPeerStartupIntents(context.Background())
		if err == nil || !strings.Contains(err.Error(), "completed shutdown too recently") {
			t.Fatalf("expected shutdown-complete cooldown block, got %v", err)
		}
	})
	t.Run("peer-startup-stale-auto-clears-when-bootstrap-idle", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Coordination.PeerHosts = []string{"titan-24"}
		cfg.SSHManagedNodes = append(cfg.SSHManagedNodes, "titan-24")
		cfg.SSHNodeHosts["titan-24"] = "titan-24"
		cfg.Coordination.StartupGuardMaxAgeSec = 30
		stale := time.Now().UTC().Add(-3 * time.Hour).Format(time.RFC3339)
		base := lifecycleDispatcher(&commandRecorder{})
		clearCalls := 0
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "ssh" && strings.Contains(command, "ananke intent --config /etc/ananke/ananke.yaml --set normal"):
				clearCalls++
				return "ok", nil
			case name == "ssh" && strings.Contains(command, "ananke intent --config /etc/ananke/ananke.yaml"):
				return "__ANANKE_BOOTSTRAP_IDLE__\nintent=startup_in_progress reason=\"stale\" source=peer updated_at=" + stale + "\n", nil
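			// Any command not matched above falls through to the canned lifecycle dispatcher.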
			default:
				return base(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		if err := orch.TestHookGuardPeerStartupIntents(context.Background()); err != nil {
			t.Fatalf("expected stale peer startup intent to auto-clear, got %v", err)
		}
		if clearCalls == 0 {
			t.Fatalf("expected remote stale-intent clear call")
		}
	})
	t.Run("read-peer-parse-error-api-timeout-and-snapshot-size-parse", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Coordination.PeerHosts = []string{"titan-24"}
		cfg.SSHManagedNodes = append(cfg.SSHManagedNodes, "titan-24")
		cfg.SSHNodeHosts["titan-24"] = "titan-24"
		base := lifecycleDispatcher(&commandRecorder{})
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "ssh" && strings.Contains(command, "ananke intent --config /etc/ananke/ananke.yaml"):
				return "__ANANKE_BOOTSTRAP_IDLE__\nnot-an-intent-payload\n", nil
			case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
				return "", errors.New("api unreachable")
			case name == "ssh" && strings.Contains(command, "stat -c %s"):
				return "not-a-size", nil
			default:
				return base(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		if _, err := orch.TestHookReadRemotePeerStatus(context.Background(), "titan-24"); err == nil {
			t.Fatalf("expected parse failure for remote peer intent output")
		}
		if err := orch.TestHookWaitForAPI(context.Background(), 1, 0); err == nil {
			t.Fatalf("expected waitForAPI timeout error")
		}
		err := orch.TestHookVerifyEtcdSnapshot(context.Background(), "titan-db", "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown")
		if err == nil || !strings.Contains(err.Error(), "parse size") {
			t.Fatalf("expected snapshot size parse error, got %v", err)
		}
	})
	t.Run("inventory-reachability-times-out-on-unexpected-output", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.RequireNodeInventoryReach = true
		cfg.Startup.NodeInventoryReachWaitSeconds = 1
		cfg.Startup.NodeInventoryReachPollSeconds = 1
		base := lifecycleDispatcher(&commandRecorder{})
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(command, "__ANANKE_NODE_REACHABLE__") {
				return "unexpected", nil
			}
			return base(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		err := orch.TestHookWaitForNodeInventoryReachability(context.Background())
		if err == nil || !strings.Contains(err.Error(), "unexpected output") {
			t.Fatalf("expected unexpected-output timeout branch, got %v", err)
		}
	})
}

// TestHookGapMatrixPart4IngressServiceAndPostStart runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart4IngressServiceAndPostStart(t *testing.T).
// Why: drives ingress/service checklist and post-start branches that were still
// under-covered after drill-focused matrix tests.
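// Subtests stub kubectl and curl responses so each failure branch is reached deterministically.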
func TestHookGapMatrixPart4IngressServiceAndPostStart(t *testing.T) {
	t.Run("ingress-backend-autoscale-cooldown-and-host-parser", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
			{Name: "metrics", URL: "https://metrics.bstein.dev/api/health"},
		}
		cfg.Startup.IngressChecklistIgnoreHosts = []string{"ignore.bstein.dev"}
		base := lifecycleDispatcher(&commandRecorder{})
		scaleCalls := 0
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "get ingress -A -o json"):
				return `{"items":[{"metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"rules":[{"host":"metrics.bstein.dev"},{"host":"ignore.bstein.dev"}]}}]}`, nil
			case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
				return `{"items":[{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":0},"status":{"readyReplicas":0}}]}`, nil
			case name == "kubectl" && strings.Contains(command, " scale deployment grafana --replicas=1"):
				scaleCalls++
				return "", nil
			default:
				return base(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		last := time.Time{}
		orch.TestHookMaybeAutoHealIngressHostBackends(context.Background(), &last, "metrics: unexpected status code=502")
		orch.TestHookMaybeAutoHealIngressHostBackends(context.Background(), &last, "metrics: still bad")
		if scaleCalls != 1 {
			t.Fatalf("expected one scale call due to the cooldown gate, got %d", scaleCalls)
		}
		if got := orch.TestHookChecklistFailureHost("metrics: request failed"); got != "metrics.bstein.dev" {
			t.Fatalf("expected mapped host metrics.bstein.dev, got %q", got)
		}
		if got := orch.TestHookChecklistFailureHost("not-a-host detail"); got != "" {
			t.Fatalf("expected empty host for unknown failure prefix, got %q", got)
		}
		if !cluster.TestHookChecklistContains("hello \n world", "HELLO WORLD") {
			t.Fatalf("expected compact checklist matcher branch")
		}
	})
	t.Run("service-check-body-notcontains-and-poststart-timeout", func(t *testing.T) {
		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte("metrics ready marker"))
		}))
		defer srv.Close()
		cfg := lifecycleConfig(t)
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		ok, detail := orch.TestHookServiceCheckReady(context.Background(), config.ServiceChecklistCheck{
			Name:             "forbidden-marker",
			URL:              srv.URL,
			AcceptedStatuses: []int{200},
			BodyNotContains:  "marker",
			TimeoutSeconds:   2,
		})
		if ok || !strings.Contains(detail, "forbidden marker") {
			t.Fatalf("expected forbidden-marker branch, ok=%v detail=%q", ok, detail)
		}
		cfg = lifecycleConfig(t)
		cfg.Startup.PostStartProbeWaitSeconds = 1
		cfg.Startup.PostStartProbePollSeconds = 1
		cfg.Startup.PostStartProbes = []string{"https://metrics.bstein.dev/health"}
		base := lifecycleDispatcher(&commandRecorder{})
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "curl" || (name == "kubectl" && strings.Contains(command, "curl")) {
				return "500", nil
			}
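			// Commands unrelated to the probes keep the stock dispatcher behaviour.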
			return base(ctx, timeout, name, args...)
		}
		orch, _ = newHookOrchestrator(t, cfg, run, run)
		err := orch.TestHookWaitForPostStartProbes(context.Background())
		if err == nil || !strings.Contains(err.Error(), "post-start probes did not pass") {
			t.Fatalf("expected post-start timeout branch, got %v", err)
		}
	})
	t.Run("hostname-heuristic-negative-cases", func(t *testing.T) {
		cases := []string{"", "not-a-host", "metrics bstein dev", "metrics.bstein.dev/path"}
		for _, in := range cases {
			if cluster.TestHookIsLikelyHostname(in) {
				t.Fatalf("expected %q to be treated as non-hostname", in)
			}
		}
	})
}

// TestHookGapMatrixPart4ReportScalingStorageDrain runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart4ReportScalingStorageDrain(t *testing.T).
// Why: covers artifact, scaling snapshot, storage, and drain error branches that
// are difficult to hit from happy-path lifecycle drills.
func TestHookGapMatrixPart4ReportScalingStorageDrain(t *testing.T) {
	t.Run("report-artifact-and-progress-error-paths", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		reportsFile := filepath.Join(t.TempDir(), "reports-as-file")
		if err := os.WriteFile(reportsFile, []byte("x"), 0o644); err != nil {
			t.Fatalf("create reports file: %v", err)
		}
		cfg.State.ReportsDir = reportsFile
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		err := orch.TestHookWriteRunRecordArtifact(state.RunRecord{
			ID:        "shutdown-record",
			Action:    "shutdown",
			Reason:    "drill",
			StartedAt: time.Now().UTC(),
			EndedAt:   time.Now().UTC(),
		})
		if err == nil {
			t.Fatalf("expected report archive dir error")
		}
		if err := orch.TestHookWriteStartupReportFile(filepath.Join(reportsFile, "startup.json"), "running"); err == nil {
			t.Fatalf("expected startup report path mkdir error")
		}
		cfg2 := lifecycleConfig(t)
		cfg2.State.Dir = filepath.Join(t.TempDir(), "state")
		cfg2.State.ReportsDir = reportsFile
		orch2, _ := newHookOrchestrator(t, cfg2, nil, nil)
		orch2.TestHookPersistStartupProgress("running")
		orch2.TestHookBeginStartupReport("drill")
		orch2.TestHookFinalizeStartupReport(errors.New("boom"))
	})
	t.Run("scaled-workload-snapshot-write-and-read-error-paths", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		stateFile := filepath.Join(t.TempDir(), "state-file")
		if err := os.WriteFile(stateFile, []byte("x"), 0o644); err != nil {
			t.Fatalf("create state file: %v", err)
		}
		cfg.State.Dir = filepath.Join(stateFile, "nested")
		orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
		dispatch := lifecycleDispatcher(&commandRecorder{})
		orch.SetCommandOverrides(dispatch, dispatch)
		entries, err := orch.TestHookListScalableWorkloads(context.Background())
		if err != nil {
			t.Fatalf("list scalable workloads: %v", err)
		}
		err = orch.TestHookWriteScaledWorkloadSnapshot(entries[:1])
		if err == nil || !strings.Contains(err.Error(), "ensure state dir") {
			t.Fatalf("expected scaled snapshot state-dir failure, got %v", err)
		}
		cfg2 := lifecycleConfig(t)
		orch2, _ := newHookOrchestrator(t, cfg2, nil, nil)
		snapshotPath := filepath.Join(cfg2.State.Dir, "scaled-workloads.json")
		if err := os.WriteFile(snapshotPath, []byte("{bad"), 0o644); err != nil {
			t.Fatalf("write corrupt snapshot: %v", err)
		}
		if _, err := orch2.TestHookReadScaledWorkloadSnapshot(); err == nil {
			t.Fatalf("expected corrupt snapshot decode error")
		}
	})
	t.Run("storage-and-drain-failure-branches", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.StorageReadyWaitSeconds = 1
		cfg.Startup.StorageReadyPollSeconds = 1
		cfg.Startup.StorageMinReadyNodes = 3
		cfg.Startup.StorageCriticalPVCs = []string{"monitoring/grafana-data"}
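		// Serialize drains so the single blocked node surfaces in the aggregated diagnostics.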
		cfg.Shutdown.DrainParallelism = 1
		base := lifecycleDispatcher(&commandRecorder{})
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "nodes.longhorn.io"):
				return "titan-23:True:True\ntitan-24:False:False\n", nil
			case name == "kubectl" && strings.Contains(command, " drain titan-23 "):
				return "", errors.New("drain blocked")
			case name == "kubectl" && strings.Contains(command, "--field-selector spec.nodeName=titan-23"):
				return "monitoring grafana-0 Running ReplicaSet\n", nil
			default:
				return base(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		if _, _, err := orch.TestHookStorageReady(context.Background()); err != nil {
			t.Fatalf("expected storageReady to report not-ready without error, got %v", err)
		}
		if err := orch.TestHookWaitForStorageReady(context.Background()); err == nil {
			t.Fatalf("expected storage readiness timeout")
		}
		err := orch.TestHookDrainWorkers(context.Background(), []string{"titan-23"})
		if err == nil || !strings.Contains(err.Error(), "details:") {
			t.Fatalf("expected drain diagnostics branch, got %v", err)
		}
	})
}

// TestHookGapMatrixPart4TimesyncLifecycleAndAccess runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart4TimesyncLifecycleAndAccess(t *testing.T).
// Why: closes remaining timing/access/lifecycle branches that still sat below
// target after the earlier matrices.
func TestHookGapMatrixPart4TimesyncLifecycleAndAccess(t *testing.T) {
	t.Run("timesync-quorum-and-strict-failure-branches", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.TimeSyncMode = "quorum"
		cfg.Startup.TimeSyncQuorum = 1
		cfg.Startup.TimeSyncWaitSeconds = 1
		cfg.Startup.TimeSyncPollSeconds = 1
		base := lifecycleDispatcher(&commandRecorder{})
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "sh" && strings.Contains(command, "timedatectl show -p NTPSynchronized"):
				return "yes", nil
			case name == "ssh" && strings.Contains(command, "timedatectl show -p NTPSynchronized") && strings.Contains(command, "titan-db"):
				return "yes", nil
			case name == "ssh" && strings.Contains(command, "timedatectl show -p NTPSynchronized") && strings.Contains(command, "titan-23"):
				return "no", nil
			default:
				return base(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		if err := orch.TestHookWaitForTimeSync(context.Background(), []string{"titan-db", "titan-23"}); err != nil {
			t.Fatalf("expected quorum-mode success, got %v", err)
		}
		cfgStrict := lifecycleConfig(t)
		cfgStrict.Startup.TimeSyncMode = "strict"
		cfgStrict.Startup.TimeSyncWaitSeconds = 1
		cfgStrict.Startup.TimeSyncPollSeconds = 1
		runStrict := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if (name == "sh" || name == "ssh") && strings.Contains(command, "timedatectl show -p NTPSynchronized") {
				return "no", nil
			}
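			// Non-timesync commands still resolve through the shared dispatcher.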
			return base(ctx, timeout, name, args...)
		}
		orchStrict, _ := newHookOrchestrator(t, cfgStrict, runStrict, runStrict)
		err := orchStrict.TestHookWaitForTimeSync(context.Background(), []string{"titan-db"})
		if err == nil || !strings.Contains(err.Error(), "time sync not ready") {
			t.Fatalf("expected strict-mode timesync failure, got %v", err)
		}
	})
	t.Run("validate-inventory-and-access-guard-branches", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.SSHPort = 70000
		cfg.SSHManagedNodes = []string{"titan-db"}
		cfg.Workers = []string{"titan-23"}
		cfg.SSHNodeUsers = map[string]string{}
		cfg.SSHNodeHosts["titan-db"] = "bad/host"
		cfg.SSHNodeUsers["titan-db"] = "bad user"
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		if err := orch.TestHookValidateNodeInventory(); err == nil {
			t.Fatalf("expected inventory validation failure")
		}
		base := lifecycleDispatcher(&commandRecorder{})
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "ssh" && strings.Contains(command, "/usr/bin/systemctl --version") && strings.Contains(command, "titan-23"):
				return "", errors.New("permission denied")
			case name == "kubectl" && strings.Contains(command, "jsonpath={.spec.ref.branch}"):
				return "feature/sso", nil
			default:
				return base(ctx, timeout, name, args...)
			}
		}
		cfg2 := lifecycleConfig(t)
		orch2, _ := newHookOrchestrator(t, cfg2, run, run)
		if err := orch2.TestHookReconcileNodeAccess(context.Background(), []string{"titan-db", "titan-23"}); err == nil {
			t.Fatalf("expected reconcileNodeAccess aggregated error")
		}
		if err := orch2.TestHookEnsureFluxBranch(context.Background(), "main", false); err == nil {
			t.Fatalf("expected ensureFluxBranch mismatch guard")
		}
	})
	t.Run("lifecycle-restore-and-mode-guard-branches", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.ControlPlanes = []string{"titan-db"}
		cfg.SSHManagedNodes = []string{"titan-db"}
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		if err := orch.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{ControlPlane: "not-a-cp"}); err == nil {
			t.Fatalf("expected etcd restore control-plane membership guard")
		}
		cfgDry := lifecycleConfig(t)
		cfgDry.ControlPlanes = []string{"titan-db"}
		cfgDry.SSHManagedNodes = []string{"titan-db"}
		orchDry := newDryRunHookOrchestrator(t, cfgDry, nil)
		if err := orchDry.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{ControlPlane: "titan-db"}); err != nil {
			t.Fatalf("expected dry-run etcd restore success, got %v", err)
		}
		cfgMode := lifecycleConfig(t)
		orchMode, _ := newHookOrchestrator(t, cfgMode, nil, nil)
		if err := orchMode.Shutdown(context.Background(), cluster.ShutdownOptions{Mode: "poweroff"}); err == nil {
			t.Fatalf("expected removed shutdown mode guard")
		}
	})
}