package orchestrator

import (
	"context"
	"encoding/base64"
	"errors"
	"fmt"
	"io"
	"log"
	"net"
	"os"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
	"scm.bstein.dev/bstein/ananke/internal/execx"
	"scm.bstein.dev/bstein/ananke/internal/state"
)

// TestHookGapMatrixPart8CoverageClosure runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart8CoverageClosure(t *testing.T).
// Why: closes additional low-coverage branches in access, vault, lifecycle,
// ingress/service stability, and timesync/inventory orchestration paths.
//
// Each subtest builds a lifecycleConfig, installs a closure-based command
// dispatcher that fakes specific ssh/kubectl invocations by substring match
// (falling back to lifecycleDispatcher for everything else), and asserts the
// exact error-branch text the hook under test is expected to surface.
// NOTE(review): case ordering inside each dispatcher switch matters —
// several kubectl substrings overlap, so do not reorder cases.
func TestHookGapMatrixPart8CoverageClosure(t *testing.T) {
	t.Run("access-ssh-and-branch-guard-branches", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.RequireNodeSSHAuth = true
		// Short wait/poll windows keep the auth-denied loop fast.
		cfg.Startup.NodeSSHAuthWaitSeconds = 1
		cfg.Startup.NodeSSHAuthPollSeconds = 1
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "ssh" && strings.Contains(command, "__ANANKE_SSH_AUTH_OK__"):
				// Simulate SSH key rejection for the auth probe.
				return "", errors.New("Permission denied (publickey)")
			case name == "ssh" && strings.Contains(command, "/usr/bin/systemctl --version"):
				// Simulate sudo failing during access validation.
				return "", errors.New("sudo: a password is required")
			case name == "kubectl" && strings.Contains(command, "jsonpath={.spec.url}"):
				return cfg.ExpectedFluxSource, nil
			case name == "kubectl" && strings.Contains(command, "jsonpath={.spec.ref.branch}"):
				// Report a branch that differs from "main" to trip the guard.
				return "legacy", nil
			case name == "kubectl" && strings.Contains(command, "patch gitrepository flux-system"):
				return "", errors.New("patch denied")
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		if err := orch.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db"}); err == nil || !strings.Contains(err.Error(), "auth denied") {
			t.Fatalf("expected ssh auth denied branch, got %v", err)
		}
		if err := orch.TestHookReconcileNodeAccess(context.Background(), []string{"titan-db"}); err == nil || !strings.Contains(err.Error(), "access validation had") {
			t.Fatalf("expected reconcile access error aggregation, got %v", err)
		}
		// enforce=false: branch mismatch should block startup without patching.
		if err := orch.TestHookEnsureFluxBranch(context.Background(), "main", false); err == nil || !strings.Contains(err.Error(), "startup blocked") {
			t.Fatalf("expected ensureFluxBranch block branch, got %v", err)
		}
		// enforce=true: the patch attempt itself fails ("patch denied" above).
		if err := orch.TestHookEnsureFluxBranch(context.Background(), "main", true); err == nil || !strings.Contains(err.Error(), "set flux source branch") {
			t.Fatalf("expected ensureFluxBranch patch failure branch, got %v", err)
		}
		cfgCache := lifecycleConfig(t)
		cfgCache.IACRepoPath = t.TempDir()
		// A nonexistent bootstrap path means zero manifests get rendered.
		cfgCache.LocalBootstrapPaths = []string{"missing-path"}
		orchCache, _ := newHookOrchestrator(t, cfgCache, nil, nil)
		if err := orchCache.TestHookRefreshBootstrapCache(context.Background()); err == nil || !strings.Contains(err.Error(), "no bootstrap cache manifests rendered") {
			t.Fatalf("expected refresh cache zero-rendered branch, got %v", err)
		}
	})
	t.Run("coordination-and-snapshot-verification-branches", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Coordination.PeerHosts = []string{"titan-24"}
		cfg.SSHManagedNodes = append(cfg.SSHManagedNodes, "titan-24")
		cfg.SSHNodeHosts["titan-24"] = "titan-24"
		now := time.Now().UTC().Format(time.RFC3339)
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "ssh" && strings.Contains(command, "ananke intent --config /etc/ananke/ananke.yaml"):
				// Peer reports an in-flight startup intent, which must block us.
				return "__ANANKE_BOOTSTRAP_ACTIVE__\nintent=startup_in_progress reason=\"rolling\" source=peer updated_at=" + now + "\n", nil
			case name == "ssh" && strings.Contains(command, "stat -c %s"):
				// Non-numeric size output exercises the parse-size error branch.
				return "abc", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		if err := orch.TestHookGuardPeerStartupIntents(context.Background()); err == nil || !strings.Contains(err.Error(), "startup_in_progress") {
			t.Fatalf("expected peer startup-in-progress block branch, got %v", err)
		}
		if err := orch.TestHookVerifyEtcdSnapshot(context.Background(), "titan-db", "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown"); err == nil || !strings.Contains(err.Error(), "parse size") {
			t.Fatalf("expected verify snapshot parse-size branch, got %v", err)
		}
		// A parseable but tiny snapshot size triggers the too-small branch.
		runSmall := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "ssh" && strings.Contains(command, "stat -c %s"):
				return "128", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchSmall, _ := newHookOrchestrator(t, cfg, runSmall, runSmall)
		if err := orchSmall.TestHookVerifyEtcdSnapshot(context.Background(), "titan-db", "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown"); err == nil || !strings.Contains(err.Error(), "too small") {
			t.Fatalf("expected verify snapshot too-small branch, got %v", err)
		}
	})
	t.Run("critical-endpoints-and-vault-branches", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// "bad-entry" has no host:port shape, so endpoint parsing must fail.
		cfg.Startup.CriticalServiceEndpoints = []string{"bad-entry"}
		orchInvalid, _ := newHookOrchestrator(t, cfg, nil, nil)
		if _, _, _, _, err := orchInvalid.TestHookCriticalServiceEndpointsReady(context.Background()); err == nil {
			t.Fatalf("expected invalid critical endpoint entry branch")
		}
		cfg = lifecycleConfig(t)
		keyB64 := base64.StdEncoding.EncodeToString([]byte("vault-unseal-key"))
		// sealedChecks counts "vault status" calls: the first two report sealed,
		// subsequent ones report unsealed, so the unseal loop must iterate.
		sealedChecks := 0
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
				return "Running", nil
			case name == "kubectl" && strings.Contains(command, "vault status -format=json"):
				sealedChecks++
				if sealedChecks <= 2 {
					return `{"sealed":true}`, nil
				}
				return `{"sealed":false}`, nil
			case name == "kubectl" && strings.Contains(command, "get secret vault-init"):
				return keyB64, nil
			case name == "kubectl" && strings.Contains(command, "vault operator unseal"):
				return "unsealed", nil
			case name == "kubectl" && strings.Contains(command, "jsonpath={.status.readyReplicas}"):
				// Not ready until the unseal flow has run at least once.
				if sealedChecks == 0 {
					return "0", nil
				}
				return "1", nil
			case name == "kubectl" && strings.Contains(command, "get pods -o custom-columns"):
				// One pod in Unknown phase so the stale-pod cleanup engages.
				return "vault-0 Unknown StatefulSet vault\n", nil
			case name == "kubectl" && strings.Contains(command, "delete pod vault-0"):
				return "", errors.New("delete failed")
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		if err := orch.TestHookEnsureVaultUnsealed(context.Background()); err != nil {
			t.Fatalf("expected ensureVaultUnsealed success branch, got %v", err)
		}
		if err := orch.TestHookWaitVaultReady(context.Background(), "vault", "statefulset", "vault"); err != nil {
			t.Fatalf("expected waitVaultReady retry/success branch, got %v", err)
		}
		if err := orch.TestHookCleanupStaleCriticalWorkloadPods(context.Background(), "vault", "statefulset", "vault"); err == nil || !strings.Contains(err.Error(), "delete stale pod") {
			t.Fatalf("expected stale-pod delete failure branch, got %v", err)
		}
		// Non-numeric readyReplicas output exercises the parse-error branch.
		runBadReady := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "jsonpath={.status.readyReplicas}") {
				return "not-a-number", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchBadReady, _ := newHookOrchestrator(t, lifecycleConfig(t), runBadReady, runBadReady)
		if _, err := orchBadReady.TestHookWorkloadReady(context.Background(), "monitoring", "deployment", "grafana"); err == nil {
			t.Fatalf("expected workloadReady parse branch")
		}
		cfgEmptyKey := lifecycleConfig(t)
		cfgEmptyKey.Startup.VaultUnsealKeyFile = ""
		orchEmptyKey, _ := newHookOrchestrator(t, cfgEmptyKey, nil, nil)
		if err := orchEmptyKey.TestHookWriteVaultUnsealKeyFile("abc"); err == nil || !strings.Contains(err.Error(), "path is empty") {
			t.Fatalf("expected writeVaultUnsealKeyFile empty-path branch, got %v", err)
		}
	})
	t.Run("lifecycle-etcd-restore-and-shutdown-branches", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// No control planes configured -> EtcdRestore must refuse outright.
		cfg.ControlPlanes = []string{}
		orchNoCP := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
		if err := orchNoCP.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{}); err == nil || !strings.Contains(err.Error(), "no control planes") {
			t.Fatalf("expected etcd restore no-control-planes branch, got %v", err)
		}
		cfgManaged := lifecycleConfig(t)
		cfgManaged.SSHManagedNodes = []string{"titan-db"}
		runExternal := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
				// External (postgres) datastore means etcd restore is not applicable.
				return "ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://db:5432/k3s", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchExternal, _ := newHookOrchestrator(t, cfgManaged, runExternal, runExternal)
		err := orchExternal.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{ControlPlane: "titan-db"})
		if err == nil || !errors.Is(err, cluster.ErrEtcdRestoreNotApplicable) {
			t.Fatalf("expected etcd restore not-applicable branch, got %v", err)
		}
		cfgShutdown := lifecycleConfig(t)
		// Port 0 is invalid, so the shutdown inventory preflight must fail.
		cfgShutdown.SSHPort = 0
		orchShutdown := cluster.New(cfgShutdown, &execx.Runner{DryRun: false}, state.New(cfgShutdown.State.RunHistoryPath), log.New(io.Discard, "", 0))
		if err := orchShutdown.Shutdown(context.Background(), cluster.ShutdownOptions{Reason: "drill", Mode: "cluster-only"}); err == nil || !strings.Contains(err.Error(), "node inventory preflight failed") {
			t.Fatalf("expected shutdown inventory-preflight branch, got %v", err)
		}
	})
	t.Run("ingress-service-stability-and-timesync-branches", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// Enable every startup checklist so convergence exercises each gate;
		// 1-second waits/polls keep the stability windows fast.
		cfg.Startup.RequireIngressChecklist = true
		cfg.Startup.RequireServiceChecklist = true
		cfg.Startup.RequireCriticalServiceEndpoints = true
		cfg.Startup.RequireFluxHealth = true
		cfg.Startup.RequireWorkloadConvergence = true
		cfg.Startup.ServiceChecklistStabilitySec = 1
		cfg.Startup.ServiceChecklistPollSeconds = 1
		cfg.Startup.TimeSyncWaitSeconds = 1
		cfg.Startup.TimeSyncPollSeconds = 1
		cfg.Startup.TimeSyncMode = "quorum"
		cfg.Startup.TimeSyncQuorum = 1
		cfg.Startup.ServiceChecklist = nil
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "sh" && strings.Contains(command, "timedatectl show -p NTPSynchronized"):
				// Local node reports synchronized.
				return "yes", nil
			case name == "ssh" && strings.Contains(command, "timedatectl show -p NTPSynchronized"):
				// Only titan-db is synced; with quorum=1 that is sufficient.
				if strings.Contains(command, "titan-db") {
					return "yes", nil
				}
				return "no", nil
			case name == "kubectl" && strings.Contains(command, "get ingress -A -o json"):
				return `{"items":[]}`, nil
			case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
				return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"True","message":"ok"}]}}]}`, nil
			case name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"):
				return `{"items":[{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1,"template":{"spec":{}}},"status":{"readyReplicas":1}}]}`, nil
			case name == "kubectl" && strings.Contains(command, "get endpoints victoria-metrics-single-server"):
				return "10.42.0.10\n", nil
			case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
				return `{"items":[]}`, nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		if err := orch.TestHookWaitForStartupConvergence(context.Background()); err != nil {
			t.Fatalf("expected startup convergence pass branch, got %v", err)
		}
		if err := orch.TestHookWaitForTimeSync(context.Background(), []string{"titan-db", "titan-23"}); err != nil {
			t.Fatalf("expected quorum timesync pass branch, got %v", err)
		}
		// Malformed JSON from the ingress listing exercises the decode branch.
		runDecodeIngress := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get ingress -A -o json") {
				return "{bad-json", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchDecodeIngress, _ := newHookOrchestrator(t, lifecycleConfig(t), runDecodeIngress, runDecodeIngress)
		if _, err := orchDecodeIngress.TestHookDiscoverIngressNamespacesForHost(context.Background(), "metrics.bstein.dev"); err == nil {
			t.Fatalf("expected ingress namespace decode branch")
		}
		cfgInventory := lifecycleConfig(t)
		// 70000 exceeds the valid TCP port range, so inventory validation fails.
		cfgInventory.SSHPort = 70000
		orchInventory, _ := newHookOrchestrator(t, cfgInventory, nil, nil)
		if err := orchInventory.TestHookValidateNodeInventory(); err == nil || !strings.Contains(err.Error(), "ssh_port") {
			t.Fatalf("expected inventory invalid-port branch, got %v", err)
		}
	})
	t.Run("report-and-scaling-edge-branches", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		rec := state.RunRecord{ID: "shutdown-1", Action: "shutdown", StartedAt: time.Now().UTC()}
		if err := orch.TestHookWriteRunRecordArtifact(rec); err != nil {
			t.Fatalf("expected writeRunRecordArtifact shutdown branch, got %v", err)
		}
		if _, err := os.Stat(orch.TestHookLastShutdownReportPath()); err != nil {
			t.Fatalf("expected last shutdown report file: %v", err)
		}
		cfgScaleErr := lifecycleConfig(t)
		// Point State.Dir at a regular file so mkdir on it must fail.
		stateFile := filepath.Join(t.TempDir(), "state-file")
		if err := os.WriteFile(stateFile, []byte("x"), 0o600); err != nil {
			t.Fatalf("write state file: %v", err)
		}
		cfgScaleErr.State.Dir = stateFile
		orchScaleErr := cluster.New(
			cfgScaleErr,
			&execx.Runner{DryRun: false},
			state.New(cfgScaleErr.State.RunHistoryPath),
			log.New(io.Discard, "", 0),
		)
		if err := orchScaleErr.TestHookWriteScaledWorkloadSnapshot(nil); err == nil || !strings.Contains(err.Error(), "ensure state dir") {
			t.Fatalf("expected writeScaledWorkloadSnapshot mkdir branch, got %v", err)
		}
		cfgRestore := lifecycleConfig(t)
		stateDir := cfgRestore.State.Dir
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			// Let every kubectl scale call succeed silently.
			if name == "kubectl" && strings.Contains(command, " scale ") {
				return "", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchRestore, _ := newHookOrchestrator(t, cfgRestore, run, run)
		// Seed a snapshot file, restore it, then confirm it was removed.
		snapshotPath := filepath.Join(stateDir, "scaled-workloads.json")
		snapshot := `{"generated_at":"2026-01-01T00:00:00Z","entries":[{"namespace":"monitoring","kind":"deployment","name":"grafana","replicas":1}]}`
		if err := os.WriteFile(snapshotPath, []byte(snapshot), 0o644); err != nil {
			t.Fatalf("write snapshot: %v", err)
		}
		if err := orchRestore.TestHookRestoreScaledApps(context.Background()); err != nil {
			t.Fatalf("expected restoreScaledApps success branch, got %v", err)
		}
		if _, err := os.Stat(snapshotPath); !os.IsNotExist(err) {
			t.Fatalf("expected snapshot removal after restore, stat err=%v", err)
		}
	})
}

// TestHookGapMatrixPart8LifecycleStartupAutoRestoreBranch runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart8LifecycleStartupAutoRestoreBranch(t *testing.T).
// Why: covers Startup's API-failure->auto-restore retry path that is otherwise
// hard to exercise in deterministic top-level tests.
func TestHookGapMatrixPart8LifecycleStartupAutoRestoreBranch(t *testing.T) {
	cfg := lifecycleConfig(t)
	cfg.Startup.AutoEtcdRestoreOnAPIFailure = true
	cfg.Startup.EtcdRestoreControlPlane = "titan-db"
	// Disable every optional startup gate so the test isolates the
	// API-failure -> auto-restore -> API-retry path.
	cfg.Startup.RequireNodeInventoryReach = false
	cfg.Startup.RequireNodeSSHAuth = false
	cfg.Startup.RequireStorageReady = false
	cfg.Startup.RequireServiceChecklist = false
	cfg.Startup.RequireIngressChecklist = false
	cfg.Startup.RequireCriticalServiceEndpoints = false
	cfg.Startup.RequireFluxHealth = false
	cfg.Startup.RequireWorkloadConvergence = false
	// Real loopback listener: its port is advertised as the external
	// datastore endpoint so the datastore preflight can actually connect.
	datastoreListener, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		t.Fatalf("listen datastore preflight stub: %v", err)
	}
	defer datastoreListener.Close()
	// apiCalls counts kubectl version probes: the first fails (triggering
	// the auto-restore path), later ones succeed.
	apiCalls := 0
	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		switch {
		case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
			apiCalls++
			if apiCalls == 1 {
				return "", errors.New("api down")
			}
			return "v1.31.0", nil
		case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
			// Point the k3s unit at the live stub listener above.
			return fmt.Sprintf(
				"ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://127.0.0.1:%d/k3s",
				datastoreListener.Addr().(*net.TCPAddr).Port,
			), nil
		case name == "kubectl" && strings.Contains(command, "jsonpath={.status.conditions[?(@.type==\"Ready\")].status}"):
			return "True", nil
		case name == "kubectl" && strings.Contains(command, "jsonpath={.spec.url}"):
			return cfg.ExpectedFluxSource, nil
		case name == "kubectl" && strings.Contains(command, "jsonpath={.spec.ref.branch}"):
			return "main", nil
		case name == "kubectl" && strings.Contains(command, "annotate kustomizations.kustomize.toolkit.fluxcd.io"):
			return "", nil
		case name == "kubectl" && strings.Contains(command, "annotate --all-namespaces helmreleases.helm.toolkit.fluxcd.io"):
			return "", nil
		default:
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
	}
	orch, _ := newHookOrchestrator(t, cfg, run, run)
	err = orch.Startup(context.Background(), cluster.StartupOptions{Reason: "part8-auto-restore"})
	if err != nil {
		t.Fatalf("expected startup auto-restore path success, got %v", err)
	}
	// At least two API probes prove Startup retried after the restore path.
	if apiCalls < 2 {
		t.Fatalf("expected startup to retry API after auto-restore path, calls=%d", apiCalls)
	}
	cfgBadMode := lifecycleConfig(t)
	orchBadMode, _ := newHookOrchestrator(t, cfgBadMode, nil, nil)
	err = orchBadMode.Shutdown(context.Background(), cluster.ShutdownOptions{Reason: "part8", Mode: "unknown-mode"})
	if err == nil || !strings.Contains(err.Error(), "unsupported shutdown mode") {
		t.Fatalf("expected shutdown unsupported-mode branch, got %v", err)
	}
	cfgReport := lifecycleConfig(t)
	orchReport, _ := newHookOrchestrator(t, cfgReport, nil, nil)
	// filepath.Join("", separator) yields the filesystem root, which is not
	// writable here, so the report write must fail.
	if err := orchReport.TestHookWriteStartupReportFile(filepath.Join("", string(os.PathSeparator)), "running"); err == nil {
		t.Fatalf("expected startup report write failure branch")
	}
	if ok := orchReport.TestHookFinalizeStartupReportSnapshot(fmt.Errorf("boom")); !ok {
		t.Fatalf("expected finalize startup report snapshot non-nil")
	}
}