package orchestrator

import (
	"context"
	"errors"
	"net"
	"os"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/state"
)

// lifecycleFastConfig builds a lifecycle test config with short wait windows.
// Signature: lifecycleFastConfig(t *testing.T) config.Config.
// Why: lifecycle gap tests intentionally fail early in many branches, so short
// wait windows keep branch coverage runs fast and deterministic.
func lifecycleFastConfig(t *testing.T) config.Config {
	t.Helper()
	cfg := lifecycleConfig(t)
	cfg.Startup.APIWaitSeconds = 1
	cfg.Startup.APIPollSeconds = 1
	cfg.Startup.NodeInventoryReachWaitSeconds = 1
	cfg.Startup.NodeInventoryReachPollSeconds = 1
	cfg.Startup.RequireCriticalServiceEndpoints = false
	cfg.Startup.RequireFluxHealth = false
	cfg.Startup.RequireWorkloadConvergence = false
	cfg.Startup.RequireIngressChecklist = false
	cfg.Startup.RequireServiceChecklist = false
	cfg.Startup.ServiceChecklistStabilitySec = 0
	cfg.Startup.RequirePostStartProbes = false
	return cfg
}

// TestLifecycleDeepFailureMatrix drives a matrix of lifecycle failure subtests.
// Signature: TestLifecycleDeepFailureMatrix(t *testing.T).
// Why: saturates remaining lifecycle startup/shutdown edge branches that are
// difficult to hit from happy-path drill tests.
func TestLifecycleDeepFailureMatrix(t *testing.T) {
	t.Run("startup-lock-path-is-directory", func(t *testing.T) {
		cfg := lifecycleFastConfig(t)
		// A directory at the lock path makes lock acquisition fail immediately.
		cfg.State.LockPath = t.TempDir()
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		if err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "lock-dir"}); err == nil {
			t.Fatalf("expected lock-path directory failure")
		}
	})

	t.Run("startup-node-inventory-validation-fails", func(t *testing.T) {
		cfg := lifecycleFastConfig(t)
		// Port 70000 is outside the valid TCP port range, so validation rejects it.
		cfg.SSHPort = 70000
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "bad-inventory"})
		if err == nil || !strings.Contains(err.Error(), "node inventory preflight failed") {
			t.Fatalf("expected node inventory preflight failure, got %v", err)
		}
	})

	t.Run("startup-node-reachability-fails", func(t *testing.T) {
		cfg := lifecycleFastConfig(t)
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(command, "__ANANKE_NODE_REACHABLE__") {
				return "", errors.New("no route to host")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "reachability-fail"})
		if err == nil || !strings.Contains(err.Error(), "node inventory reachability gate") {
			t.Fatalf("expected reachability gate failure, got %v", err)
		}
	})
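	// The two stale-intent subtests below install an override via
	// state.TestHookSetWriteIntentOverride that fails only the auto-clear
	// write and delegates everything else to state.TestHookWriteIntentDefault,
	// so unrelated intent writes during startup still succeed.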
	t.Run("startup-clear-stale-startup-intent-write-fails", func(t *testing.T) {
		cfg := lifecycleFastConfig(t)
		if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
			State:     state.IntentStartupInProgress,
			Reason:    "stale",
			Source:    "test",
			UpdatedAt: time.Now().UTC().Add(-2 * time.Hour),
		}); err != nil {
			t.Fatalf("seed stale startup intent: %v", err)
		}
		restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
			if path == cfg.State.IntentPath && in.State == state.IntentNormal &&
				strings.Contains(strings.ToLower(in.Reason), "auto-clear stale startup intent") {
				return errors.New("forced intent clear failure")
			}
			return state.TestHookWriteIntentDefault(path, in)
		})
		t.Cleanup(restoreWrite)
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "clear-stale-startup"})
		if err == nil || !strings.Contains(err.Error(), "clear stale startup intent") {
			t.Fatalf("expected stale startup clear failure, got %v", err)
		}
	})

	t.Run("startup-clear-stale-shutdown-intent-write-fails", func(t *testing.T) {
		cfg := lifecycleFastConfig(t)
		if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
			State:     state.IntentShuttingDown,
			Reason:    "stale-shutdown",
			Source:    "test",
			UpdatedAt: time.Now().UTC().Add(-2 * time.Hour),
		}); err != nil {
			t.Fatalf("seed stale shutdown intent: %v", err)
		}
		restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
			if path == cfg.State.IntentPath && in.State == state.IntentNormal &&
				strings.Contains(strings.ToLower(in.Reason), "auto-clear stale shutdown intent") {
				return errors.New("forced intent clear failure")
			}
			return state.TestHookWriteIntentDefault(path, in)
		})
		t.Cleanup(restoreWrite)
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "clear-stale-shutdown"})
		if err == nil || !strings.Contains(err.Error(), "clear stale shutdown intent") {
			t.Fatalf("expected stale shutdown clear failure, got %v", err)
		}
	})

	t.Run("startup-cooldown-reread-intent-error", func(t *testing.T) {
		cfg := lifecycleFastConfig(t)
		cfg.Startup.ShutdownCooldownSeconds = 1
		if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
			State:     state.IntentShutdownComplete,
			Reason:    "recent",
			Source:    "test",
			UpdatedAt: time.Now().UTC(),
		}); err != nil {
			t.Fatalf("seed cooldown intent: %v", err)
		}
		// Replace the intent file with a directory mid-cooldown so the
		// post-cooldown re-read fails with a filesystem error.
		go func(intentPath string) {
			time.Sleep(150 * time.Millisecond)
			_ = os.Remove(intentPath)
			_ = os.Mkdir(intentPath, 0o755)
		}(cfg.State.IntentPath)
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-reread"})
		if err == nil || !strings.Contains(err.Error(), "re-read startup intent after cooldown wait") {
			t.Fatalf("expected cooldown reread failure, got %v", err)
		}
	})
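	// Same mid-cooldown timing trick as above, except the goroutine flips the
	// intent to an active shutdown instead of making the file unreadable.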
	t.Run("startup-cooldown-shutdown-became-active", func(t *testing.T) {
		cfg := lifecycleFastConfig(t)
		cfg.Startup.ShutdownCooldownSeconds = 1
		if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
			State:     state.IntentShutdownComplete,
			Reason:    "recent",
			Source:    "test",
			UpdatedAt: time.Now().UTC(),
		}); err != nil {
			t.Fatalf("seed cooldown intent: %v", err)
		}
		go func(intentPath string) {
			time.Sleep(150 * time.Millisecond)
			_ = state.WriteIntent(intentPath, state.Intent{
				State:     state.IntentShuttingDown,
				Reason:    "peer-shutdown",
				Source:    "test",
				UpdatedAt: time.Now().UTC(),
			})
		}(cfg.State.IntentPath)
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-peer-active"})
		if err == nil || !strings.Contains(err.Error(), "shutdown intent became active during cooldown wait") {
			t.Fatalf("expected cooldown active-shutdown failure, got %v", err)
		}
	})

	t.Run("startup-set-intent-write-fails", func(t *testing.T) {
		cfg := lifecycleFastConfig(t)
		if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
			State:     state.IntentNormal,
			Reason:    "seed",
			Source:    "test",
			UpdatedAt: time.Now().UTC(),
		}); err != nil {
			t.Fatalf("seed normal intent: %v", err)
		}
		restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
			if path == cfg.State.IntentPath && in.State == state.IntentStartupInProgress {
				return errors.New("forced startup intent write failure")
			}
			return state.TestHookWriteIntentDefault(path, in)
		})
		t.Cleanup(restoreWrite)
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "intent-write-fail"})
		if err == nil || !strings.Contains(err.Error(), "set startup intent") {
			t.Fatalf("expected startup intent write failure, got %v", err)
		}
	})

	t.Run("startup-timesync-error-propagates", func(t *testing.T) {
		cfg := lifecycleFastConfig(t)
		cfg.Startup.RequireTimeSync = true
		cfg.Startup.TimeSyncWaitSeconds = 1
		cfg.Startup.TimeSyncPollSeconds = 1
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "timesync-fail"})
		if err == nil || !strings.Contains(err.Error(), "time sync") {
			t.Fatalf("expected time sync failure, got %v", err)
		}
	})
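	// The next subtest cancels its context before calling Startup, so the
	// datastore preflight is expected to surface context.Canceled.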
	t.Run("startup-datastore-preflight-cancel", func(t *testing.T) {
		cfg := lifecycleFastConfig(t)
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(command, "systemctl cat k3s") {
				return "ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://127.0.0.1:1/k3s", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		ctx, cancel := context.WithCancel(context.Background())
		cancel()
		err := orch.Startup(ctx, cluster.StartupOptions{Reason: "datastore-cancel"})
		if !errors.Is(err, context.Canceled) {
			t.Fatalf("expected canceled datastore preflight, got %v", err)
		}
	})

	t.Run("startup-auto-etcd-restore-hard-failure", func(t *testing.T) {
		cfg := lifecycleFastConfig(t)
		cfg.Startup.AutoEtcdRestoreOnAPIFailure = true
		cfg.Startup.EtcdRestoreControlPlane = "titan-db"
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
				return "", errors.New("api down")
			case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
				return "ExecStart=/usr/local/bin/k3s server", nil
			case name == "ssh" && strings.Contains(command, "etcd-snapshot ls"):
				return "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown", nil
			case name == "ssh" && strings.Contains(command, "stat -c %s"):
				return "2097152", nil
			case name == "ssh" && strings.Contains(command, "sha256sum"):
				return "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", nil
			case name == "ssh" && strings.Contains(command, "server --cluster-reset"):
				return "", errors.New("cluster reset failed")
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "restore-hard-fail"})
		if err == nil || !strings.Contains(err.Error(), "automatic etcd restore failed") {
			t.Fatalf("expected automatic etcd restore failure, got %v", err)
		}
	})
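	// The next subtest binds a real loopback listener so the advertised
	// datastore endpoint is reachable; the restore path is then skipped as
	// not applicable, leaving the original API-down error to surface.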
	t.Run("startup-auto-etcd-restore-not-applicable-then-api-still-down", func(t *testing.T) {
		cfg := lifecycleFastConfig(t)
		cfg.Startup.AutoEtcdRestoreOnAPIFailure = true
		cfg.Startup.EtcdRestoreControlPlane = "titan-db"
		listener, err := net.Listen("tcp", "127.0.0.1:0")
		if err != nil {
			t.Fatalf("open local datastore listener: %v", err)
		}
		defer listener.Close()
		address := listener.Addr().String()
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
				return "", errors.New("api still down")
			case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
				return "ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://" + address + "/k3s", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		err = orch.Startup(context.Background(), cluster.StartupOptions{Reason: "restore-not-applicable"})
		if err == nil || !strings.Contains(err.Error(), "after automatic etcd restore") {
			t.Fatalf("expected api failure after not-applicable restore path, got %v", err)
		}
	})

	t.Run("startup-required-node-labels-error", func(t *testing.T) {
		cfg := lifecycleFastConfig(t)
		cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
			"titan-db": {"topology.kubernetes.io/zone": "lab-a"},
		}
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "label node titan-db --overwrite") {
				return "", errors.New("label denied")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "label-fail"})
		if err == nil || !strings.Contains(err.Error(), "ensure required node labels") {
			t.Fatalf("expected required-node-label failure, got %v", err)
		}
	})

	t.Run("startup-worker-discovery-error", func(t *testing.T) {
		cfg := lifecycleFastConfig(t)
		cfg.Workers = nil
		cfg.SSHManagedNodes = []string{"titan-db"}
		cfg.SSHNodeHosts = map[string]string{
			"titan-db": "titan-db",
		}
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			if name == "kubectl" && len(args) >= 2 && args[0] == "get" && args[1] == "nodes" {
				return "", errors.New("nodes denied")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "worker-discovery-fail"})
		if err == nil || !strings.Contains(err.Error(), "discover workers") {
			t.Fatalf("expected worker-discovery failure, got %v", err)
		}
	})

	t.Run("startup-storage-readiness-error", func(t *testing.T) {
		cfg := lifecycleFastConfig(t)
		cfg.Startup.RequireStorageReady = true
		cfg.Startup.StorageReadyWaitSeconds = 1
		cfg.Startup.StorageReadyPollSeconds = 1
		cfg.Startup.StorageMinReadyNodes = 1
		// "bad-entry" is intentionally malformed so PVC validation fails.
		cfg.Startup.StorageCriticalPVCs = []string{"bad-entry"}
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "nodes.longhorn.io") {
				return "a:True:True\n", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "storage-fail"})
		if err == nil || !strings.Contains(err.Error(), "invalid storage_critical_pvcs entry") {
			t.Fatalf("expected storage readiness failure, got %v", err)
		}
	})
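	// The final two subtests each deny a single kubectl call (a deployment
	// scale, then the Flux kustomization list) while delegating everything
	// else, pinning error propagation for those individual steps.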
	t.Run("startup-critical-workload-scale-error", func(t *testing.T) {
		cfg := lifecycleFastConfig(t)
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "scale deployment source-controller") {
				return "", errors.New("scale denied")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "critical-scale-fail"})
		if err == nil || !strings.Contains(err.Error(), "scale flux-system/deployment/source-controller") {
			t.Fatalf("expected critical-workload scale failure, got %v", err)
		}
	})

	t.Run("startup-flux-resume-failure", func(t *testing.T) {
		cfg := lifecycleFastConfig(t)
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "-n flux-system get kustomizations.kustomize.toolkit.fluxcd.io") &&
				strings.Contains(command, "jsonpath={range .items[*]}{.metadata.name}") {
				return "", errors.New("kustomization list denied")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "resume-fail"})
		if err == nil || !strings.Contains(err.Error(), "kustomization list denied") {
			t.Fatalf("expected flux-resume failure, got %v", err)
		}
	})
}