// File: ananke/testing/orchestrator/hooks_access_failure_matrix_test.go
package orchestrator
import (
"context"
"errors"
"fmt"
"io"
"log"
"os"
"path/filepath"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/cluster"
"scm.bstein.dev/bstein/ananke/internal/execx"
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookNodeReachabilityAndSSHAuthFailureBranches runs one orchestration or CLI step.
// Signature: TestHookNodeReachabilityAndSSHAuthFailureBranches(t *testing.T).
// Why: reaches non-happy-path auth/reachability branches that only appear during real
// drill disruptions so startup gates do not regress silently.
func TestHookNodeReachabilityAndSSHAuthFailureBranches(t *testing.T) {
	// The reachability gate must fail fast when an inventory node is absent from
	// ssh_managed_nodes, rather than polling out the whole wait window.
	t.Run("node-inventory-unmanaged-node-fails-fast", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.RequireNodeInventoryReach = true
		// 1s wait/poll keeps the subtest from stalling the suite.
		cfg.Startup.NodeInventoryReachWaitSeconds = 1
		cfg.Startup.NodeInventoryReachPollSeconds = 1
		// Only titan-db is declared managed; presumably lifecycleConfig seeds an
		// inventory containing additional nodes that trip the unmanaged-node
		// error — confirm against the helper.
		cfg.SSHManagedNodes = []string{"titan-db"}
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		err := orch.TestHookWaitForNodeInventoryReachability(context.Background())
		if err == nil || !strings.Contains(err.Error(), "not in ssh_managed_nodes") {
			t.Fatalf("expected unmanaged-node reachability failure, got %v", err)
		}
	})
	t.Run("node-inventory-timeout-and-context-cancel", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.RequireNodeInventoryReach = true
		cfg.Startup.NodeInventoryReachWaitSeconds = 1
		cfg.Startup.NodeInventoryReachPollSeconds = 1
		cfg.SSHManagedNodes = []string{"titan-db", "titan-23"}
		// Stub runner: the reachability probe against titan-23 returns output
		// without the expected echo marker, so that node never becomes
		// reachable; everything else falls through to the default dispatcher.
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(command, "__ANANKE_NODE_REACHABLE__") && strings.Contains(command, "titan-23") {
				return "unexpected", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		err := orch.TestHookWaitForNodeInventoryReachability(context.Background())
		if err == nil || !strings.Contains(err.Error(), "did not pass within") {
			t.Fatalf("expected reachability timeout, got %v", err)
		}
		// Same stub, long wait window, pre-canceled context: cancellation must
		// surface as context.Canceled instead of the timeout error above.
		cfg.Startup.NodeInventoryReachWaitSeconds = 30
		orchCanceled, _ := newHookOrchestrator(t, cfg, run, run)
		cancelCtx, cancel := context.WithCancel(context.Background())
		cancel()
		err = orchCanceled.TestHookWaitForNodeInventoryReachability(cancelCtx)
		if !errors.Is(err, context.Canceled) {
			t.Fatalf("expected context canceled from reachability gate, got %v", err)
		}
	})
	t.Run("node-ssh-auth-denied-timeout-and-context-cancel", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.RequireNodeSSHAuth = true
		cfg.Startup.NodeSSHAuthWaitSeconds = 1
		cfg.Startup.NodeSSHAuthPollSeconds = 1
		cfg.SSHManagedNodes = []string{"titan-db", "titan-23"}
		// Denied branch: the auth probe for titan-23 errors with SSH-style
		// "Permission denied" output; the gate should report a hard failure.
		deniedRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(command, "__ANANKE_SSH_AUTH_OK__") && strings.Contains(command, "titan-23") {
				return "", fmt.Errorf("Permission denied (publickey)")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchDenied, _ := newHookOrchestrator(t, cfg, deniedRun, deniedRun)
		err := orchDenied.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db", "titan-23"})
		if err == nil || !strings.Contains(err.Error(), "ssh auth gate failed") {
			t.Fatalf("expected ssh auth denied failure, got %v", err)
		}
		// Timeout branch: the probe "succeeds" but emits the wrong marker, so
		// titan-23 never passes and the 1s wait window elapses.
		timeoutRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(command, "__ANANKE_SSH_AUTH_OK__") && strings.Contains(command, "titan-23") {
				return "unexpected", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchTimeout, _ := newHookOrchestrator(t, cfg, timeoutRun, timeoutRun)
		err = orchTimeout.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db", "titan-23"})
		if err == nil || !strings.Contains(err.Error(), "did not pass within") {
			t.Fatalf("expected ssh auth timeout, got %v", err)
		}
		// Cancel branch: long wait window plus pre-canceled context must yield
		// context.Canceled rather than the timeout error.
		cfg.Startup.NodeSSHAuthWaitSeconds = 30
		orchCanceled, _ := newHookOrchestrator(t, cfg, timeoutRun, timeoutRun)
		cancelCtx, cancel := context.WithCancel(context.Background())
		cancel()
		err = orchCanceled.TestHookWaitForNodeSSHAuth(cancelCtx, []string{"titan-db", "titan-23"})
		if !errors.Is(err, context.Canceled) {
			t.Fatalf("expected context canceled from ssh-auth gate, got %v", err)
		}
	})
}
// TestHookAccessAndFluxSourceFailureBranches runs one orchestration or CLI step.
// Signature: TestHookAccessAndFluxSourceFailureBranches(t *testing.T).
// Why: validates drift/branch/repo failure branches so startup catches source deadlocks
// before the cluster is declared recovered.
func TestHookAccessAndFluxSourceFailureBranches(t *testing.T) {
	// Per-node access checks should not stop at the first bad node: failures
	// are collected and surfaced as one aggregated error.
	t.Run("reconcile-node-access-aggregates-errors", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.SSHManagedNodes = []string{"titan-db", "titan-23"}
		// Stub: the systemctl probe fails on every node (simulating blocked
		// sudo); other commands use the default dispatcher.
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(command, "/usr/bin/systemctl --version") {
				return "", fmt.Errorf("sudo blocked")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		err := orch.TestHookReconcileNodeAccess(context.Background(), []string{"titan-db", "titan-23"})
		if err == nil || !strings.Contains(err.Error(), "access validation had") {
			t.Fatalf("expected access validation aggregation error, got %v", err)
		}
	})
	t.Run("guard-and-ensure-flux-source-branches", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// NotFound from the GitRepository URL read is tolerated: a missing
		// flux-system source is not drift.
		notFoundRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "jsonpath={.spec.url}") {
				return "", fmt.Errorf("Error from server (NotFound): gitrepositories.source.toolkit.fluxcd.io \"flux-system\" not found")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchNotFound, _ := newHookOrchestrator(t, cfg, notFoundRun, notFoundRun)
		if err := orchNotFound.TestHookGuardFluxSourceDrift(context.Background(), "main", false); err != nil {
			t.Fatalf("expected not-found branch to be tolerated, got %v", err)
		}
		// Any other read error from the URL query must propagate.
		readErrRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "jsonpath={.spec.url}") {
				return "", fmt.Errorf("boom")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchReadErr, _ := newHookOrchestrator(t, cfg, readErrRun, readErrRun)
		if err := orchReadErr.TestHookGuardFluxSourceDrift(context.Background(), "main", false); err == nil {
			t.Fatalf("expected flux source read error")
		}
		// Branch mismatch (cluster tracks feature/sso, expected main) without
		// patch permission (allowPatch=false — TODO confirm parameter meaning)
		// must be an error.
		noPatchRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "jsonpath={.spec.ref.branch}") {
				return "feature/sso", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchNoPatch, _ := newHookOrchestrator(t, cfg, noPatchRun, noPatchRun)
		if err := orchNoPatch.TestHookEnsureFluxBranch(context.Background(), "main", false); err == nil {
			t.Fatalf("expected branch mismatch without patch permission")
		}
		// Mismatch with patching allowed but kubectl patch failing must also
		// be an error.
		patchErrRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "jsonpath={.spec.ref.branch}"):
				return "feature/sso", nil
			case name == "kubectl" && strings.Contains(command, "patch gitrepository flux-system"):
				return "", fmt.Errorf("patch failed")
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchPatchErr, _ := newHookOrchestrator(t, cfg, patchErrRun, patchErrRun)
		if err := orchPatchErr.TestHookEnsureFluxBranch(context.Background(), "main", true); err == nil {
			t.Fatalf("expected patch failure branch")
		}
		// Mismatch with patching allowed and the patch command succeeding (it
		// falls through to the default dispatcher here) should recover cleanly.
		patchOKRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "jsonpath={.spec.ref.branch}") {
				return "feature/sso", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchPatchOK, _ := newHookOrchestrator(t, cfg, patchOKRun, patchOKRun)
		if err := orchPatchOK.TestHookEnsureFluxBranch(context.Background(), "main", true); err != nil {
			t.Fatalf("expected branch patch success, got %v", err)
		}
	})
	t.Run("wait-for-flux-source-ready-error-timeout-and-cancel", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// Error branch: the Ready-condition query itself fails.
		errorRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "jsonpath={.status.conditions[?(@.type==\"Ready\")].status}") {
				return "", fmt.Errorf("query failed")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchErr, _ := newHookOrchestrator(t, cfg, errorRun, errorRun)
		if _, err := orchErr.TestHookWaitForFluxSourceReady(context.Background(), time.Second); err == nil {
			t.Fatalf("expected readiness query error")
		}
		// Timeout branch: an empty Ready status never satisfies the wait; the
		// hook returns (false, nil) rather than an error.
		timeoutRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "jsonpath={.status.conditions[?(@.type==\"Ready\")].status}") {
				return "", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchTimeout, _ := newHookOrchestrator(t, cfg, timeoutRun, timeoutRun)
		ready, err := orchTimeout.TestHookWaitForFluxSourceReady(context.Background(), time.Millisecond)
		if err != nil || ready {
			t.Fatalf("expected timeout branch (ready=false, err=nil), got ready=%v err=%v", ready, err)
		}
		// Cancel branch: a pre-canceled context with a long deadline must
		// surface context.Canceled.
		cancelCtx, cancel := context.WithCancel(context.Background())
		cancel()
		_, err = orchTimeout.TestHookWaitForFluxSourceReady(cancelCtx, 30*time.Second)
		if !errors.Is(err, context.Canceled) {
			t.Fatalf("expected context canceled, got %v", err)
		}
	})
}
// TestHookBootstrapCacheAndRepoSyncFailureBranches runs one orchestration or CLI step.
// Signature: TestHookBootstrapCacheAndRepoSyncFailureBranches(t *testing.T).
// Why: covers local bootstrap/repo edge paths so bootstrap fallback behavior stays deterministic.
func TestHookBootstrapCacheAndRepoSyncFailureBranches(t *testing.T) {
	// When every apply strategy (kustomize apply, cached apply, render) fails
	// for every configured bootstrap path, the hook must report a terminal
	// "failed for every configured path" error.
	t.Run("bootstrap-local-all-paths-fail", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.IACRepoPath = t.TempDir()
		cfg.LocalBootstrapPaths = []string{"services/bootstrap"}
		if err := os.MkdirAll(filepath.Join(cfg.IACRepoPath, "services", "bootstrap"), 0o755); err != nil {
			t.Fatalf("mkdir bootstrap path: %v", err)
		}
		// Fail both kubectl apply variants (-k and -f).
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, " apply -k "):
				return "", fmt.Errorf("apply -k failed")
			case name == "kubectl" && strings.Contains(command, " apply -f "):
				return "", fmt.Errorf("cache apply failed")
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		// Also fail the kustomize render (run via sh); everything else chains
		// to the plain runner above.
		runSensitive := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "sh" && strings.Contains(command, "kubectl kustomize") {
				return "", fmt.Errorf("kustomize render failed")
			}
			return run(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, cfg, run, runSensitive)
		err := orch.TestHookBootstrapLocal(context.Background())
		if err == nil || !strings.Contains(err.Error(), "failed for every configured path") {
			t.Fatalf("expected bootstrap all-failed error, got %v", err)
		}
	})
	t.Run("sync-local-iac-repo-branches", func(t *testing.T) {
		baseCfg := lifecycleConfig(t)
		// NOTE: cfg is copied by value here; each variant mutates only its own
		// copy — TODO confirm the config struct has no shared pointer fields
		// relevant to these branches.
		emptyCfg := baseCfg
		emptyCfg.IACRepoPath = ""
		orchEmpty, _ := newHookOrchestrator(t, emptyCfg, nil, nil)
		if err := orchEmpty.TestHookSyncLocalIACRepo(context.Background()); err == nil {
			t.Fatalf("expected empty repo path error")
		}
		// A directory without .git must be rejected as a non-git checkout.
		notGitCfg := baseCfg
		notGitCfg.IACRepoPath = t.TempDir()
		orchNotGit, _ := newHookOrchestrator(t, notGitCfg, nil, nil)
		if err := orchNotGit.TestHookSyncLocalIACRepo(context.Background()); err == nil {
			t.Fatalf("expected non-git checkout error")
		}
		repo := t.TempDir()
		if err := os.MkdirAll(filepath.Join(repo, ".git"), 0o755); err != nil {
			t.Fatalf("mkdir .git: %v", err)
		}
		dirtyCfg := baseCfg
		dirtyCfg.IACRepoPath = repo
		// Dirty working tree (non-empty `git status --porcelain`) skips the
		// sync without error instead of clobbering local edits.
		runSensitive := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "git" && strings.Contains(command, "status --porcelain"):
				return " M README.md\n", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchDirty, _ := newHookOrchestrator(t, dirtyCfg, nil, runSensitive)
		if err := orchDirty.TestHookSyncLocalIACRepo(context.Background()); err != nil {
			t.Fatalf("dirty working-tree branch should skip sync, got %v", err)
		}
		// Clean tree but failing `git fetch origin --prune` must propagate.
		fetchErrRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "git" && strings.Contains(command, "status --porcelain"):
				return "", nil
			case name == "git" && strings.Contains(command, "fetch origin --prune"):
				return "", fmt.Errorf("fetch failed")
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchFetchErr, _ := newHookOrchestrator(t, dirtyCfg, nil, fetchErrRun)
		if err := orchFetchErr.TestHookSyncLocalIACRepo(context.Background()); err == nil {
			t.Fatalf("expected git fetch failure")
		}
	})
	t.Run("refresh-and-apply-bootstrap-cache-branches", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.IACRepoPath = t.TempDir()
		cfg.LocalBootstrapPaths = []string{"services/bootstrap"}
		if err := os.MkdirAll(filepath.Join(cfg.IACRepoPath, "services", "bootstrap"), 0o755); err != nil {
			t.Fatalf("mkdir bootstrap path: %v", err)
		}
		// Renders fail for every path, so refresh produces nothing and the
		// subsequent apply finds no cached manifest file.
		runSensitive := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "sh" && strings.Contains(command, "kubectl kustomize") {
				return "", fmt.Errorf("render failed")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, cfg, nil, runSensitive)
		if err := orch.TestHookRefreshBootstrapCache(context.Background()); err == nil {
			t.Fatalf("expected refresh bootstrap cache failure when no renders succeed")
		}
		if err := orch.TestHookApplyBootstrapCache(context.Background(), "services/bootstrap"); err == nil {
			t.Fatalf("expected apply bootstrap cache missing-file failure")
		}
	})
	// Dry-run mode bypasses kubectl entirely: the readiness wait should
	// short-circuit to ready=true without touching a cluster.
	t.Run("wait-for-flux-source-ready-dry-run", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		if err := os.MkdirAll(cfg.State.Dir, 0o755); err != nil {
			t.Fatalf("ensure state dir: %v", err)
		}
		// Build the orchestrator directly (no hook helper) with a DryRun
		// runner and a discarded logger.
		orch := cluster.New(cfg, &execx.Runner{DryRun: true}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
		if ready, err := orch.TestHookWaitForFluxSourceReady(context.Background(), time.Second); err != nil || !ready {
			t.Fatalf("expected dry-run readiness fast-path, got ready=%v err=%v", ready, err)
		}
	})
}