// File: ananke/testing/orchestrator/hooks_access_failure_matrix_test.go
package orchestrator
import (
"context"
"errors"
"fmt"
"io"
"log"
"os"
"path/filepath"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/cluster"
"scm.bstein.dev/bstein/ananke/internal/execx"
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookNodeReachabilityAndSSHAuthFailureBranches runs one orchestration or CLI step.
// Signature: TestHookNodeReachabilityAndSSHAuthFailureBranches(t *testing.T).
// Why: reaches non-happy-path auth/reachability branches that only appear during real
// drill disruptions so startup gates do not regress silently.
func TestHookNodeReachabilityAndSSHAuthFailureBranches(t *testing.T) {
	// The reachability gate must fail fast when an inventory node is absent from
	// ssh_managed_nodes, rather than polling out the whole wait window.
	t.Run("node-inventory-unmanaged-node-fails-fast", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.RequireNodeInventoryReach = true
		// 1s wait/poll keeps the subtest from stalling the suite.
		cfg.Startup.NodeInventoryReachWaitSeconds = 1
		cfg.Startup.NodeInventoryReachPollSeconds = 1
		// Only titan-db is declared managed; presumably lifecycleConfig seeds an
		// inventory containing additional nodes that trip the unmanaged-node
		// error — confirm against the helper.
		cfg.SSHManagedNodes = []string{"titan-db"}
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		err := orch.TestHookWaitForNodeInventoryReachability(context.Background())
		if err == nil || !strings.Contains(err.Error(), "not in ssh_managed_nodes") {
			t.Fatalf("expected unmanaged-node reachability failure, got %v", err)
		}
	})
	t.Run("node-inventory-timeout-and-context-cancel", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.RequireNodeInventoryReach = true
		cfg.Startup.NodeInventoryReachWaitSeconds = 1
		cfg.Startup.NodeInventoryReachPollSeconds = 1
		cfg.SSHManagedNodes = []string{"titan-db", "titan-23"}
		// Stub runner: the reachability probe against titan-23 returns output
		// without the expected echo marker, so that node never becomes
		// reachable; everything else falls through to the default dispatcher.
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(command, "__ANANKE_NODE_REACHABLE__") && strings.Contains(command, "titan-23") {
				return "unexpected", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		err := orch.TestHookWaitForNodeInventoryReachability(context.Background())
		if err == nil || !strings.Contains(err.Error(), "did not pass within") {
			t.Fatalf("expected reachability timeout, got %v", err)
		}
		// Same stub, long wait window, pre-canceled context: cancellation must
		// surface as context.Canceled instead of the timeout error above.
		cfg.Startup.NodeInventoryReachWaitSeconds = 30
		orchCanceled, _ := newHookOrchestrator(t, cfg, run, run)
		cancelCtx, cancel := context.WithCancel(context.Background())
		cancel()
		err = orchCanceled.TestHookWaitForNodeInventoryReachability(cancelCtx)
		if !errors.Is(err, context.Canceled) {
			t.Fatalf("expected context canceled from reachability gate, got %v", err)
		}
	})
	t.Run("node-ssh-auth-denied-timeout-and-context-cancel", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.RequireNodeSSHAuth = true
		cfg.Startup.NodeSSHAuthWaitSeconds = 1
		cfg.Startup.NodeSSHAuthPollSeconds = 1
		cfg.SSHManagedNodes = []string{"titan-db", "titan-23"}
		// Denied branch: the auth probe for titan-23 errors with SSH-style
		// "Permission denied" output; the gate should report a hard failure.
		deniedRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(command, "__ANANKE_SSH_AUTH_OK__") && strings.Contains(command, "titan-23") {
				return "", fmt.Errorf("Permission denied (publickey)")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchDenied, _ := newHookOrchestrator(t, cfg, deniedRun, deniedRun)
		err := orchDenied.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db", "titan-23"})
		if err == nil || !strings.Contains(err.Error(), "ssh auth gate failed") {
			t.Fatalf("expected ssh auth denied failure, got %v", err)
		}
		// Timeout branch: the probe "succeeds" but emits the wrong marker, so
		// titan-23 never passes and the 1s wait window elapses.
		timeoutRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(command, "__ANANKE_SSH_AUTH_OK__") && strings.Contains(command, "titan-23") {
				return "unexpected", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchTimeout, _ := newHookOrchestrator(t, cfg, timeoutRun, timeoutRun)
		err = orchTimeout.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db", "titan-23"})
		if err == nil || !strings.Contains(err.Error(), "did not pass within") {
			t.Fatalf("expected ssh auth timeout, got %v", err)
		}
		// Cancel branch: long wait window plus pre-canceled context must yield
		// context.Canceled rather than the timeout error.
		cfg.Startup.NodeSSHAuthWaitSeconds = 30
		orchCanceled, _ := newHookOrchestrator(t, cfg, timeoutRun, timeoutRun)
		cancelCtx, cancel := context.WithCancel(context.Background())
		cancel()
		err = orchCanceled.TestHookWaitForNodeSSHAuth(cancelCtx, []string{"titan-db", "titan-23"})
		if !errors.Is(err, context.Canceled) {
			t.Fatalf("expected context canceled from ssh-auth gate, got %v", err)
		}
	})
}
// TestHookAccessAndFluxSourceFailureBranches runs one orchestration or CLI step.
// Signature: TestHookAccessAndFluxSourceFailureBranches(t *testing.T).
// Why: validates drift/branch/repo failure branches so startup catches source deadlocks
// before the cluster is declared recovered.
func TestHookAccessAndFluxSourceFailureBranches(t *testing.T) {
	// Per-node access checks should not stop at the first bad node: failures
	// are collected and surfaced as one aggregated error.
	t.Run("reconcile-node-access-aggregates-errors", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.SSHManagedNodes = []string{"titan-db", "titan-23"}
		// Stub: the systemctl probe fails on every node (simulating blocked
		// sudo); other commands use the default dispatcher.
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(command, "/usr/bin/systemctl --version") {
				return "", fmt.Errorf("sudo blocked")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		err := orch.TestHookReconcileNodeAccess(context.Background(), []string{"titan-db", "titan-23"})
		if err == nil || !strings.Contains(err.Error(), "access validation had") {
			t.Fatalf("expected access validation aggregation error, got %v", err)
		}
	})
	t.Run("guard-and-ensure-flux-source-branches", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// NotFound from the GitRepository URL read is tolerated: a missing
		// flux-system source is not drift.
		notFoundRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "jsonpath={.spec.url}") {
				return "", fmt.Errorf("Error from server (NotFound): gitrepositories.source.toolkit.fluxcd.io \"flux-system\" not found")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchNotFound, _ := newHookOrchestrator(t, cfg, notFoundRun, notFoundRun)
		if err := orchNotFound.TestHookGuardFluxSourceDrift(context.Background(), "main", false); err != nil {
			t.Fatalf("expected not-found branch to be tolerated, got %v", err)
		}
		// Any other read error from the URL query must propagate.
		readErrRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "jsonpath={.spec.url}") {
				return "", fmt.Errorf("boom")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchReadErr, _ := newHookOrchestrator(t, cfg, readErrRun, readErrRun)
		if err := orchReadErr.TestHookGuardFluxSourceDrift(context.Background(), "main", false); err == nil {
			t.Fatalf("expected flux source read error")
		}
		// Branch mismatch (cluster tracks feature/sso, expected main) without
		// patch permission (allowPatch=false — TODO confirm parameter meaning)
		// must be an error.
		noPatchRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "jsonpath={.spec.ref.branch}") {
				return "feature/sso", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchNoPatch, _ := newHookOrchestrator(t, cfg, noPatchRun, noPatchRun)
		if err := orchNoPatch.TestHookEnsureFluxBranch(context.Background(), "main", false); err == nil {
			t.Fatalf("expected branch mismatch without patch permission")
		}
		// Mismatch with patching allowed but kubectl patch failing must also
		// be an error.
		patchErrRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "jsonpath={.spec.ref.branch}"):
				return "feature/sso", nil
			case name == "kubectl" && strings.Contains(command, "patch gitrepository flux-system"):
				return "", fmt.Errorf("patch failed")
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchPatchErr, _ := newHookOrchestrator(t, cfg, patchErrRun, patchErrRun)
		if err := orchPatchErr.TestHookEnsureFluxBranch(context.Background(), "main", true); err == nil {
			t.Fatalf("expected patch failure branch")
		}
		// Mismatch with patching allowed and the patch command succeeding (it
		// falls through to the default dispatcher here) should recover cleanly.
		patchOKRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "jsonpath={.spec.ref.branch}") {
				return "feature/sso", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchPatchOK, _ := newHookOrchestrator(t, cfg, patchOKRun, patchOKRun)
		if err := orchPatchOK.TestHookEnsureFluxBranch(context.Background(), "main", true); err != nil {
			t.Fatalf("expected branch patch success, got %v", err)
		}
	})
	t.Run("wait-for-flux-source-ready-error-timeout-and-cancel", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// Error branch: the Ready-condition query itself fails.
		errorRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "jsonpath={.status.conditions[?(@.type==\"Ready\")].status}") {
				return "", fmt.Errorf("query failed")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchErr, _ := newHookOrchestrator(t, cfg, errorRun, errorRun)
		if _, err := orchErr.TestHookWaitForFluxSourceReady(context.Background(), time.Second); err == nil {
			t.Fatalf("expected readiness query error")
		}
		// Timeout branch: an empty Ready status never satisfies the wait; the
		// hook returns (false, nil) rather than an error.
		timeoutRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "jsonpath={.status.conditions[?(@.type==\"Ready\")].status}") {
				return "", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchTimeout, _ := newHookOrchestrator(t, cfg, timeoutRun, timeoutRun)
		ready, err := orchTimeout.TestHookWaitForFluxSourceReady(context.Background(), time.Millisecond)
		if err != nil || ready {
			t.Fatalf("expected timeout branch (ready=false, err=nil), got ready=%v err=%v", ready, err)
		}
		// Cancel branch: a pre-canceled context with a long deadline must
		// surface context.Canceled.
		cancelCtx, cancel := context.WithCancel(context.Background())
		cancel()
		_, err = orchTimeout.TestHookWaitForFluxSourceReady(cancelCtx, 30*time.Second)
		if !errors.Is(err, context.Canceled) {
			t.Fatalf("expected context canceled, got %v", err)
		}
	})
}
// TestHookBootstrapCacheAndRepoSyncFailureBranches runs one orchestration or CLI step.
// Signature: TestHookBootstrapCacheAndRepoSyncFailureBranches(t *testing.T).
// Why: covers local bootstrap/repo edge paths so bootstrap fallback behavior stays deterministic.
func TestHookBootstrapCacheAndRepoSyncFailureBranches(t *testing.T) {
	// When every apply strategy (kustomize apply, cached apply, render) fails
	// for every configured bootstrap path, the hook must report a terminal
	// "failed for every configured path" error.
	t.Run("bootstrap-local-all-paths-fail", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.IACRepoPath = t.TempDir()
		cfg.LocalBootstrapPaths = []string{"services/bootstrap"}
		if err := os.MkdirAll(filepath.Join(cfg.IACRepoPath, "services", "bootstrap"), 0o755); err != nil {
			t.Fatalf("mkdir bootstrap path: %v", err)
		}
		// Fail both kubectl apply variants (-k and -f).
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, " apply -k "):
				return "", fmt.Errorf("apply -k failed")
			case name == "kubectl" && strings.Contains(command, " apply -f "):
				return "", fmt.Errorf("cache apply failed")
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		// Also fail the kustomize render (run via sh); everything else chains
		// to the plain runner above.
		runSensitive := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "sh" && strings.Contains(command, "kubectl kustomize") {
				return "", fmt.Errorf("kustomize render failed")
			}
			return run(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, cfg, run, runSensitive)
		err := orch.TestHookBootstrapLocal(context.Background())
		if err == nil || !strings.Contains(err.Error(), "failed for every configured path") {
			t.Fatalf("expected bootstrap all-failed error, got %v", err)
		}
	})
	t.Run("sync-local-iac-repo-branches", func(t *testing.T) {
		baseCfg := lifecycleConfig(t)
		// NOTE: cfg is copied by value here; each variant mutates only its own
		// copy — TODO confirm the config struct has no shared pointer fields
		// relevant to these branches.
		emptyCfg := baseCfg
		emptyCfg.IACRepoPath = ""
		orchEmpty, _ := newHookOrchestrator(t, emptyCfg, nil, nil)
		if err := orchEmpty.TestHookSyncLocalIACRepo(context.Background()); err == nil {
			t.Fatalf("expected empty repo path error")
		}
		// A directory without .git must be rejected as a non-git checkout.
		notGitCfg := baseCfg
		notGitCfg.IACRepoPath = t.TempDir()
		orchNotGit, _ := newHookOrchestrator(t, notGitCfg, nil, nil)
		if err := orchNotGit.TestHookSyncLocalIACRepo(context.Background()); err == nil {
			t.Fatalf("expected non-git checkout error")
		}
		repo := t.TempDir()
		if err := os.MkdirAll(filepath.Join(repo, ".git"), 0o755); err != nil {
			t.Fatalf("mkdir .git: %v", err)
		}
		dirtyCfg := baseCfg
		dirtyCfg.IACRepoPath = repo
		// Dirty working tree (non-empty `git status --porcelain`) skips the
		// sync without error instead of clobbering local edits.
		runSensitive := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "git" && strings.Contains(command, "status --porcelain"):
				return " M README.md\n", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchDirty, _ := newHookOrchestrator(t, dirtyCfg, nil, runSensitive)
		if err := orchDirty.TestHookSyncLocalIACRepo(context.Background()); err != nil {
			t.Fatalf("dirty working-tree branch should skip sync, got %v", err)
		}
		// Clean tree but failing `git fetch origin --prune` must propagate.
		fetchErrRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "git" && strings.Contains(command, "status --porcelain"):
				return "", nil
			case name == "git" && strings.Contains(command, "fetch origin --prune"):
				return "", fmt.Errorf("fetch failed")
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchFetchErr, _ := newHookOrchestrator(t, dirtyCfg, nil, fetchErrRun)
		if err := orchFetchErr.TestHookSyncLocalIACRepo(context.Background()); err == nil {
			t.Fatalf("expected git fetch failure")
		}
	})
	t.Run("refresh-and-apply-bootstrap-cache-branches", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.IACRepoPath = t.TempDir()
		cfg.LocalBootstrapPaths = []string{"services/bootstrap"}
		if err := os.MkdirAll(filepath.Join(cfg.IACRepoPath, "services", "bootstrap"), 0o755); err != nil {
			t.Fatalf("mkdir bootstrap path: %v", err)
		}
		// Renders fail for every path, so refresh produces nothing and the
		// subsequent apply finds no cached manifest file.
		runSensitive := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "sh" && strings.Contains(command, "kubectl kustomize") {
				return "", fmt.Errorf("render failed")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, cfg, nil, runSensitive)
		if err := orch.TestHookRefreshBootstrapCache(context.Background()); err == nil {
			t.Fatalf("expected refresh bootstrap cache failure when no renders succeed")
		}
		if err := orch.TestHookApplyBootstrapCache(context.Background(), "services/bootstrap"); err == nil {
			t.Fatalf("expected apply bootstrap cache missing-file failure")
		}
	})
	// Dry-run mode bypasses kubectl entirely: the readiness wait should
	// short-circuit to ready=true without touching a cluster.
	t.Run("wait-for-flux-source-ready-dry-run", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		if err := os.MkdirAll(cfg.State.Dir, 0o755); err != nil {
			t.Fatalf("ensure state dir: %v", err)
		}
		// Build the orchestrator directly (no hook helper) with a DryRun
		// runner and a discarded logger.
		orch := cluster.New(cfg, &execx.Runner{DryRun: true}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
		if ready, err := orch.TestHookWaitForFluxSourceReady(context.Background(), time.Second); err != nil || !ready {
			t.Fatalf("expected dry-run readiness fast-path, got ready=%v err=%v", ready, err)
		}
	})
}