package orchestrator

import (
	"context"
	"errors"
	"io"
	"log"
	"net"
	"os"
	"strconv"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/execx"
	"scm.bstein.dev/bstein/ananke/internal/state"
)

// newLifecycleSaturationOrchestrator builds an orchestrator wired with a
// deterministic command override for lifecycle branch tests.
// Signature: newLifecycleSaturationOrchestrator(t *testing.T, cfg config.Config, run commandOverride) *cluster.Orchestrator.
// Why: lifecycle branch saturation needs deterministic command behavior while
// preserving real intent/lock/file semantics.
func newLifecycleSaturationOrchestrator(
	t *testing.T,
	cfg config.Config,
	run func(context.Context, time.Duration, string, ...string) (string, error),
) *cluster.Orchestrator {
	t.Helper()
	// Default to the canned dispatcher so callers only supply a custom
	// override when a subtest needs to force a specific failure.
	if run == nil {
		run = lifecycleDispatcher(&commandRecorder{})
	}
	orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
	orch.SetCommandOverrides(run, run)
	return orch
}

// TestLifecycleStartupBranchSaturation drives startup through its main
// error/safety branches so lifecycle coverage reflects realistic drill
// failure modes.
func TestLifecycleStartupBranchSaturation(t *testing.T) {
	t.Run("read-intent-error-branch", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
			State:     state.IntentNormal,
			Reason:    "seed",
			Source:    "test",
			UpdatedAt: time.Now().UTC(),
		}); err != nil {
			t.Fatalf("seed intent: %v", err)
		}
		// Replace intent file with directory so ReadIntent fails.
		if err := osRemove(cfg.State.IntentPath); err != nil {
			t.Fatalf("remove intent file: %v", err)
		}
		if err := osMkdir(cfg.State.IntentPath); err != nil {
			t.Fatalf("make intent dir: %v", err)
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, nil)
		if err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "intent-read-error"}); err == nil {
			t.Fatalf("expected startup to fail when intent path is unreadable")
		}
	})

	t.Run("fresh-shutdown-intent-blocks", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
			State:     state.IntentShuttingDown,
			Reason:    "active-shutdown",
			Source:    "test",
			UpdatedAt: time.Now().UTC(),
		}); err != nil {
			t.Fatalf("write shutdown intent: %v", err)
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, nil)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "blocked-by-shutdown"})
		if err == nil || !strings.Contains(err.Error(), "startup blocked: shutdown intent is active") {
			t.Fatalf("expected active shutdown intent block, got %v", err)
		}
	})

	t.Run("cooldown-cancel-branch", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.ShutdownCooldownSeconds = 20
		if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
			State:     state.IntentShutdownComplete,
			Reason:    "just-finished",
			Source:    "test",
			UpdatedAt: time.Now().UTC(),
		}); err != nil {
			t.Fatalf("write cooldown intent: %v", err)
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, nil)
		ctx, cancel := context.WithCancel(context.Background())
		// Cancel shortly after startup begins so the 20s cooldown wait is
		// interrupted rather than completed.
		go func() {
			time.Sleep(20 * time.Millisecond)
			cancel()
		}()
		err := orch.Startup(ctx, cluster.StartupOptions{Reason: "cooldown-cancel"})
		if err == nil || !strings.Contains(err.Error(), "startup canceled while waiting for shutdown cooldown") {
			t.Fatalf("expected cooldown cancel branch, got %v", err)
		}
	})

	t.Run("api-failure-without-restore", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.AutoEtcdRestoreOnAPIFailure = false
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "version --request-timeout=5s") {
				return "", errors.New("apiserver down")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, run)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "api-fail-no-restore"})
		if err == nil || !strings.Contains(err.Error(), "kubernetes API did not become reachable") {
			t.Fatalf("expected api wait failure, got %v", err)
		}
	})

	t.Run("api-failure-restore-not-applicable-retries", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.AutoEtcdRestoreOnAPIFailure = true
		cfg.Startup.EtcdRestoreControlPlane = "titan-db"
		// A live local listener stands in for the external datastore so the
		// restore-applicability probe sees a reachable endpoint.
		l, err := net.Listen("tcp", "127.0.0.1:0")
		if err != nil {
			t.Fatalf("open local datastore listener: %v", err)
		}
		defer l.Close()
		port := l.Addr().(*net.TCPAddr).Port
		attempt := 0
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
				// Fail the first probe only, so startup exercises the retry path.
				attempt++
				if attempt <= 1 {
					return "", errors.New("apiserver down")
				}
				return "v1.31.0", nil
			case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
				// External datastore => etcd restore is not applicable.
				return "ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://127.0.0.1:" + strconv.Itoa(port) + "/k3s", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, run)
		if err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "api-restore-not-applicable"}); err != nil {
			t.Fatalf("expected startup success after retry, got %v", err)
		}
	})

	t.Run("bootstrap-required-and-cache-missing-fails", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.IACRepoPath = t.TempDir()
		cfg.LocalBootstrapPaths = []string{"services/bootstrap"}
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "jsonpath={.status.conditions[?(@.type==\"Ready\")].status}"):
				return "", errors.New("flux source unavailable")
			case name == "kubectl" && strings.Contains(command, " apply -k "):
				return "", errors.New("apply failed")
			case name == "sh" && strings.Contains(command, "kubectl kustomize"):
				return "", errors.New("fallback failed")
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, run)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "bootstrap-required"})
		if err == nil || !strings.Contains(err.Error(), "local bootstrap apply failed") {
			t.Fatalf("expected bootstrap failure, got %v", err)
		}
	})
}

// TestLifecycleEtcdRestoreAndShutdownBranchSaturation covers restore/shutdown
// branch paths that are difficult to hit from a single happy-path drill.
func TestLifecycleEtcdRestoreAndShutdownBranchSaturation(t *testing.T) { t.Run("etcd-restore-input-validation", func(t *testing.T) { cfg := lifecycleConfig(t) cfg.ControlPlanes = nil orch := newLifecycleSaturationOrchestrator(t, cfg, nil) if err := orch.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{}); err == nil { t.Fatalf("expected restore error with no control planes") } }) t.Run("etcd-restore-unmanaged-node", func(t *testing.T) { cfg := lifecycleConfig(t) cfg.SSHManagedNodes = []string{"titan-23"} orch := newLifecycleSaturationOrchestrator(t, cfg, nil) if err := orch.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{ControlPlane: "titan-db"}); err == nil { t.Fatalf("expected unmanaged control plane restore error") } }) t.Run("etcd-restore-command-failure", func(t *testing.T) { cfg := lifecycleConfig(t) run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") switch { case name == "ssh" && strings.Contains(command, "systemctl cat k3s"): return "ExecStart=/usr/local/bin/k3s server", nil case name == "ssh" && strings.Contains(command, "etcd-snapshot ls"): return "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown", nil case name == "ssh" && strings.Contains(command, "stat -c %s"): return "2097152", nil case name == "ssh" && strings.Contains(command, "sha256sum"): return "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", nil case name == "ssh" && strings.Contains(command, "server --cluster-reset"): return "", errors.New("cluster reset failed") default: return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
} } orch := newLifecycleSaturationOrchestrator(t, cfg, run) err := orch.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{ControlPlane: "titan-db"}) if err == nil || !strings.Contains(err.Error(), "etcd restore command failed") { t.Fatalf("expected restore command failure branch, got %v", err) } }) t.Run("shutdown-invalid-mode", func(t *testing.T) { cfg := lifecycleConfig(t) orch := newLifecycleSaturationOrchestrator(t, cfg, nil) if err := orch.Shutdown(context.Background(), cluster.ShutdownOptions{Reason: "bad-mode", Mode: "invalid"}); err == nil { t.Fatalf("expected shutdown mode validation error") } }) } // osRemove runs one orchestration or CLI step. // Signature: osRemove(path string) error. // Why: keeps error handling explicit in lifecycle branch tests without repeated ignore logic. func osRemove(path string) error { return os.Remove(path) } // osMkdir runs one orchestration or CLI step. // Signature: osMkdir(path string) error. // Why: keeps branch setup concise in lifecycle branch tests. func osMkdir(path string) error { return os.Mkdir(path, 0o755) }