package orchestrator

import (
	"context"
	"errors"
	"fmt"
	"io"
	"log"
	"os"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
	"scm.bstein.dev/bstein/ananke/internal/execx"
	"scm.bstein.dev/bstein/ananke/internal/state"
)

// TestHookNodeReachabilityAndSSHAuthFailureBranches runs one orchestration or CLI step.
// Signature: TestHookNodeReachabilityAndSSHAuthFailureBranches(t *testing.T).
// Why: reaches non-happy-path auth/reachability branches that only appear during real
// drill disruptions so startup gates do not regress silently.
func TestHookNodeReachabilityAndSSHAuthFailureBranches(t *testing.T) {
	// passThrough defers to the stock lifecycle dispatcher, building a fresh
	// commandRecorder per invocation exactly like the inline originals did.
	passThrough := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
	}

	t.Run("node-inventory-unmanaged-node-fails-fast", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.RequireNodeInventoryReach = true
		cfg.Startup.NodeInventoryReachWaitSeconds = 1
		cfg.Startup.NodeInventoryReachPollSeconds = 1
		// Inventory lists only titan-db, so any other inventoried node must be
		// rejected before the gate even probes it.
		cfg.SSHManagedNodes = []string{"titan-db"}

		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		err := orch.TestHookWaitForNodeInventoryReachability(context.Background())
		if err == nil || !strings.Contains(err.Error(), "not in ssh_managed_nodes") {
			t.Fatalf("expected unmanaged-node reachability failure, got %v", err)
		}
	})

	t.Run("node-inventory-timeout-and-context-cancel", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.RequireNodeInventoryReach = true
		cfg.Startup.NodeInventoryReachWaitSeconds = 1
		cfg.Startup.NodeInventoryReachPollSeconds = 1
		cfg.SSHManagedNodes = []string{"titan-db", "titan-23"}

		// titan-23 answers the reachability probe with garbage instead of the
		// marker, so the gate can never declare it reachable.
		unreachable := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			cmd := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(cmd, "__ANANKE_NODE_REACHABLE__") && strings.Contains(cmd, "titan-23") {
				return "unexpected", nil
			}
			return passThrough(ctx, timeout, name, args...)
		}

		orch, _ := newHookOrchestrator(t, cfg, unreachable, unreachable)
		err := orch.TestHookWaitForNodeInventoryReachability(context.Background())
		if err == nil || !strings.Contains(err.Error(), "did not pass within") {
			t.Fatalf("expected reachability timeout, got %v", err)
		}

		// Long wait plus a pre-canceled context: cancellation must win.
		cfg.Startup.NodeInventoryReachWaitSeconds = 30
		orchCanceled, _ := newHookOrchestrator(t, cfg, unreachable, unreachable)
		cancelCtx, cancel := context.WithCancel(context.Background())
		cancel()
		err = orchCanceled.TestHookWaitForNodeInventoryReachability(cancelCtx)
		if !errors.Is(err, context.Canceled) {
			t.Fatalf("expected context canceled from reachability gate, got %v", err)
		}
	})

	t.Run("node-ssh-auth-denied-timeout-and-context-cancel", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.RequireNodeSSHAuth = true
		cfg.Startup.NodeSSHAuthWaitSeconds = 1
		cfg.Startup.NodeSSHAuthPollSeconds = 1
		cfg.SSHManagedNodes = []string{"titan-db", "titan-23"}

		// titan-23 rejects the auth probe outright with an ssh-style denial.
		denyAuth := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			cmd := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(cmd, "__ANANKE_SSH_AUTH_OK__") && strings.Contains(cmd, "titan-23") {
				return "", fmt.Errorf("Permission denied (publickey)")
			}
			return passThrough(ctx, timeout, name, args...)
		}
		orchDenied, _ := newHookOrchestrator(t, cfg, denyAuth, denyAuth)
		err := orchDenied.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db", "titan-23"})
		if err == nil || !strings.Contains(err.Error(), "ssh auth gate failed") {
			t.Fatalf("expected ssh auth denied failure, got %v", err)
		}

		// titan-23 never emits the auth marker, forcing the gate to time out.
		neverAuth := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			cmd := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(cmd, "__ANANKE_SSH_AUTH_OK__") && strings.Contains(cmd, "titan-23") {
				return "unexpected", nil
			}
			return passThrough(ctx, timeout, name, args...)
		}
		orchTimeout, _ := newHookOrchestrator(t, cfg, neverAuth, neverAuth)
		err = orchTimeout.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db", "titan-23"})
		if err == nil || !strings.Contains(err.Error(), "did not pass within") {
			t.Fatalf("expected ssh auth timeout, got %v", err)
		}

		// Long wait plus a pre-canceled context: cancellation must win.
		cfg.Startup.NodeSSHAuthWaitSeconds = 30
		orchCanceled, _ := newHookOrchestrator(t, cfg, neverAuth, neverAuth)
		cancelCtx, cancel := context.WithCancel(context.Background())
		cancel()
		err = orchCanceled.TestHookWaitForNodeSSHAuth(cancelCtx, []string{"titan-db", "titan-23"})
		if !errors.Is(err, context.Canceled) {
			t.Fatalf("expected context canceled from ssh-auth gate, got %v", err)
		}
	})
}

// TestHookAccessAndFluxSourceFailureBranches runs one orchestration or CLI step.
// Signature: TestHookAccessAndFluxSourceFailureBranches(t *testing.T).
// Why: validates drift/branch/repo failure branches so startup catches source deadlocks
// before the cluster is declared recovered.
func TestHookAccessAndFluxSourceFailureBranches(t *testing.T) {
	// passThrough defers to the stock lifecycle dispatcher, building a fresh
	// commandRecorder per invocation exactly like the inline originals did.
	passThrough := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
	}
	// kubectlStub fabricates one kubectl response keyed on a command substring;
	// everything else falls through to the stock dispatcher.
	kubectlStub := func(needle, out string, stubErr error) func(context.Context, time.Duration, string, ...string) (string, error) {
		return func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			if name == "kubectl" && strings.Contains(name+" "+strings.Join(args, " "), needle) {
				return out, stubErr
			}
			return passThrough(ctx, timeout, name, args...)
		}
	}

	t.Run("reconcile-node-access-aggregates-errors", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.SSHManagedNodes = []string{"titan-db", "titan-23"}

		// Every node's sudo/systemctl probe fails, so the reconcile step must
		// surface an aggregated error rather than the first one alone.
		sudoBlocked := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			if name == "ssh" && strings.Contains(name+" "+strings.Join(args, " "), "/usr/bin/systemctl --version") {
				return "", fmt.Errorf("sudo blocked")
			}
			return passThrough(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, cfg, sudoBlocked, sudoBlocked)
		err := orch.TestHookReconcileNodeAccess(context.Background(), []string{"titan-db", "titan-23"})
		if err == nil || !strings.Contains(err.Error(), "access validation had") {
			t.Fatalf("expected access validation aggregation error, got %v", err)
		}
	})

	t.Run("guard-and-ensure-flux-source-branches", func(t *testing.T) {
		cfg := lifecycleConfig(t)

		// A missing GitRepository object is tolerated by the drift guard.
		notFoundRun := kubectlStub("jsonpath={.spec.url}", "", fmt.Errorf("Error from server (NotFound): gitrepositories.source.toolkit.fluxcd.io \"flux-system\" not found"))
		orchNotFound, _ := newHookOrchestrator(t, cfg, notFoundRun, notFoundRun)
		if err := orchNotFound.TestHookGuardFluxSourceDrift(context.Background(), "main", false); err != nil {
			t.Fatalf("expected not-found branch to be tolerated, got %v", err)
		}

		// Any other read failure must propagate.
		readErrRun := kubectlStub("jsonpath={.spec.url}", "", fmt.Errorf("boom"))
		orchReadErr, _ := newHookOrchestrator(t, cfg, readErrRun, readErrRun)
		if err := orchReadErr.TestHookGuardFluxSourceDrift(context.Background(), "main", false); err == nil {
			t.Fatalf("expected flux source read error")
		}

		// Branch drift without patch permission is an error.
		noPatchRun := kubectlStub("jsonpath={.spec.ref.branch}", "feature/sso", nil)
		orchNoPatch, _ := newHookOrchestrator(t, cfg, noPatchRun, noPatchRun)
		if err := orchNoPatch.TestHookEnsureFluxBranch(context.Background(), "main", false); err == nil {
			t.Fatalf("expected branch mismatch without patch permission")
		}

		// Branch drift with patch permission, but the patch itself fails.
		patchErrRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			cmd := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(cmd, "jsonpath={.spec.ref.branch}"):
				return "feature/sso", nil
			case name == "kubectl" && strings.Contains(cmd, "patch gitrepository flux-system"):
				return "", fmt.Errorf("patch failed")
			default:
				return passThrough(ctx, timeout, name, args...)
			}
		}
		orchPatchErr, _ := newHookOrchestrator(t, cfg, patchErrRun, patchErrRun)
		if err := orchPatchErr.TestHookEnsureFluxBranch(context.Background(), "main", true); err == nil {
			t.Fatalf("expected patch failure branch")
		}

		// Branch drift with patch permission and a working patch path succeeds.
		patchOKRun := kubectlStub("jsonpath={.spec.ref.branch}", "feature/sso", nil)
		orchPatchOK, _ := newHookOrchestrator(t, cfg, patchOKRun, patchOKRun)
		if err := orchPatchOK.TestHookEnsureFluxBranch(context.Background(), "main", true); err != nil {
			t.Fatalf("expected branch patch success, got %v", err)
		}
	})

	t.Run("wait-for-flux-source-ready-error-timeout-and-cancel", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		const readyQuery = "jsonpath={.status.conditions[?(@.type==\"Ready\")].status}"

		// The readiness query itself errors out.
		errorRun := kubectlStub(readyQuery, "", fmt.Errorf("query failed"))
		orchErr, _ := newHookOrchestrator(t, cfg, errorRun, errorRun)
		if _, err := orchErr.TestHookWaitForFluxSourceReady(context.Background(), time.Second); err == nil {
			t.Fatalf("expected readiness query error")
		}

		// The query answers with an empty status forever: not-ready, no error.
		timeoutRun := kubectlStub(readyQuery, "", nil)
		orchTimeout, _ := newHookOrchestrator(t, cfg, timeoutRun, timeoutRun)
		ready, err := orchTimeout.TestHookWaitForFluxSourceReady(context.Background(), time.Millisecond)
		if err != nil || ready {
			t.Fatalf("expected timeout branch (ready=false, err=nil), got ready=%v err=%v", ready, err)
		}

		// A pre-canceled context must abort the wait with context.Canceled.
		cancelCtx, cancel := context.WithCancel(context.Background())
		cancel()
		_, err = orchTimeout.TestHookWaitForFluxSourceReady(cancelCtx, 30*time.Second)
		if !errors.Is(err, context.Canceled) {
			t.Fatalf("expected context canceled, got %v", err)
		}
	})
}

// TestHookBootstrapCacheAndRepoSyncFailureBranches runs one orchestration or CLI step.
// Signature: TestHookBootstrapCacheAndRepoSyncFailureBranches(t *testing.T).
// Why: covers local bootstrap/repo edge paths so bootstrap fallback behavior stays deterministic. func TestHookBootstrapCacheAndRepoSyncFailureBranches(t *testing.T) { t.Run("bootstrap-local-all-paths-fail", func(t *testing.T) { cfg := lifecycleConfig(t) cfg.IACRepoPath = t.TempDir() cfg.LocalBootstrapPaths = []string{"services/bootstrap"} if err := os.MkdirAll(filepath.Join(cfg.IACRepoPath, "services", "bootstrap"), 0o755); err != nil { t.Fatalf("mkdir bootstrap path: %v", err) } run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") switch { case name == "kubectl" && strings.Contains(command, " apply -k "): return "", fmt.Errorf("apply -k failed") case name == "kubectl" && strings.Contains(command, " apply -f "): return "", fmt.Errorf("cache apply failed") default: return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) } } runSensitive := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") if name == "sh" && strings.Contains(command, "kubectl kustomize") { return "", fmt.Errorf("kustomize render failed") } return run(ctx, timeout, name, args...) 
} orch, _ := newHookOrchestrator(t, cfg, run, runSensitive) err := orch.TestHookBootstrapLocal(context.Background()) if err == nil || !strings.Contains(err.Error(), "failed for every configured path") { t.Fatalf("expected bootstrap all-failed error, got %v", err) } }) t.Run("sync-local-iac-repo-branches", func(t *testing.T) { baseCfg := lifecycleConfig(t) emptyCfg := baseCfg emptyCfg.IACRepoPath = "" orchEmpty, _ := newHookOrchestrator(t, emptyCfg, nil, nil) if err := orchEmpty.TestHookSyncLocalIACRepo(context.Background()); err == nil { t.Fatalf("expected empty repo path error") } notGitCfg := baseCfg notGitCfg.IACRepoPath = t.TempDir() orchNotGit, _ := newHookOrchestrator(t, notGitCfg, nil, nil) if err := orchNotGit.TestHookSyncLocalIACRepo(context.Background()); err == nil { t.Fatalf("expected non-git checkout error") } repo := t.TempDir() if err := os.MkdirAll(filepath.Join(repo, ".git"), 0o755); err != nil { t.Fatalf("mkdir .git: %v", err) } dirtyCfg := baseCfg dirtyCfg.IACRepoPath = repo runSensitive := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") switch { case name == "git" && strings.Contains(command, "status --porcelain"): return " M README.md\n", nil default: return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
} } orchDirty, _ := newHookOrchestrator(t, dirtyCfg, nil, runSensitive) if err := orchDirty.TestHookSyncLocalIACRepo(context.Background()); err != nil { t.Fatalf("dirty working-tree branch should skip sync, got %v", err) } fetchErrRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") switch { case name == "git" && strings.Contains(command, "status --porcelain"): return "", nil case name == "git" && strings.Contains(command, "fetch origin --prune"): return "", fmt.Errorf("fetch failed") default: return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) } } orchFetchErr, _ := newHookOrchestrator(t, dirtyCfg, nil, fetchErrRun) if err := orchFetchErr.TestHookSyncLocalIACRepo(context.Background()); err == nil { t.Fatalf("expected git fetch failure") } }) t.Run("refresh-and-apply-bootstrap-cache-branches", func(t *testing.T) { cfg := lifecycleConfig(t) cfg.IACRepoPath = t.TempDir() cfg.LocalBootstrapPaths = []string{"services/bootstrap"} if err := os.MkdirAll(filepath.Join(cfg.IACRepoPath, "services", "bootstrap"), 0o755); err != nil { t.Fatalf("mkdir bootstrap path: %v", err) } runSensitive := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") if name == "sh" && strings.Contains(command, "kubectl kustomize") { return "", fmt.Errorf("render failed") } return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
} orch, _ := newHookOrchestrator(t, cfg, nil, runSensitive) if err := orch.TestHookRefreshBootstrapCache(context.Background()); err == nil { t.Fatalf("expected refresh bootstrap cache failure when no renders succeed") } if err := orch.TestHookApplyBootstrapCache(context.Background(), "services/bootstrap"); err == nil { t.Fatalf("expected apply bootstrap cache missing-file failure") } }) t.Run("wait-for-flux-source-ready-dry-run", func(t *testing.T) { cfg := lifecycleConfig(t) if err := os.MkdirAll(cfg.State.Dir, 0o755); err != nil { t.Fatalf("ensure state dir: %v", err) } orch := cluster.New(cfg, &execx.Runner{DryRun: true}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0)) if ready, err := orch.TestHookWaitForFluxSourceReady(context.Background(), time.Second); err != nil || !ready { t.Fatalf("expected dry-run readiness fast-path, got ready=%v err=%v", ready, err) } }) }