package orchestrator

import (
	"context"
	"errors"
	"io"
	"log"
	"net"
	"os"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/execx"
	"scm.bstein.dev/bstein/ananke/internal/state"
)

// newLifecycleMatrixOrchestrator builds a cluster.Orchestrator wired for the
// part11 lifecycle matrix: callers control the runner's dry-run flag, the
// kubeconfig passed to execx, and the (optionally nil) command overrides.
// Nil overrides fall back to a shared lifecycleDispatcher so every exec path
// still resolves to a recorded command.
func newLifecycleMatrixOrchestrator(
	t *testing.T,
	cfg config.Config,
	dryRun bool,
	run func(context.Context, time.Duration, string, ...string) (string, error),
	runSensitive func(context.Context, time.Duration, string, ...string) (string, error),
	kubeconfig string,
) *cluster.Orchestrator {
	t.Helper()
	// The orchestrator persists run history under cfg.State.Dir; create it
	// up front so state.New never observes a missing directory.
	if mkErr := os.MkdirAll(cfg.State.Dir, 0o755); mkErr != nil {
		t.Fatalf("ensure state dir: %v", mkErr)
	}
	cmdRunner := &execx.Runner{DryRun: dryRun, Kubeconfig: kubeconfig}
	orchestrated := cluster.New(cfg, cmdRunner, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
	// Backfill any nil override with a single shared dispatcher so both
	// hooks observe the same recorder when the caller supplies neither.
	if run == nil || runSensitive == nil {
		fallback := lifecycleDispatcher(&commandRecorder{})
		if run == nil {
			run = fallback
		}
		if runSensitive == nil {
			runSensitive = fallback
		}
	}
	orchestrated.SetCommandOverrides(run, runSensitive)
	return orchestrated
}

// TestHookGapMatrixPart11RemainingClosure runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart11RemainingClosure(t *testing.T).
// Why: closes final branch gaps for lifecycle + remaining near-threshold
// orchestrator files so per-file coverage reaches the enforced 95% target.
func TestHookGapMatrixPart11RemainingClosure(t *testing.T) { t.Run("critical-vault-final-closures", func(t *testing.T) { t.Run("ensure-critical-cleanup-error-and-cleanup-branches", func(t *testing.T) { cfg := lifecycleConfig(t) run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") switch { case name == "kubectl" && strings.Contains(command, "get pods -o custom-columns"): return "", errors.New("pods query failed") default: return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) } } orch := newLifecycleMatrixOrchestrator(t, cfg, false, run, run, "") if err := orch.TestHookEnsureCriticalStartupWorkloads(context.Background()); err == nil || !strings.Contains(err.Error(), "cleanup stale pods") { t.Fatalf("expected ensureCriticalStartupWorkloads cleanup error branch, got %v", err) } dry := newLifecycleMatrixOrchestrator(t, cfg, true, nil, nil, "") if err := dry.TestHookCleanupStaleCriticalWorkloadPods(context.Background(), "vault", "statefulset", "vault"); err != nil { t.Fatalf("expected cleanup stale pod dry-run branch, got %v", err) } runCleanup := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") if name == "kubectl" && strings.Contains(command, "get pods -o custom-columns") { return strings.Join([]string{ "badline", "vault-0 Unknown Deployment vault", "otherpod Unknown StatefulSet vault", }, "\n"), nil } return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
} orchCleanup := newLifecycleMatrixOrchestrator(t, cfg, false, runCleanup, runCleanup, "") if err := orchCleanup.TestHookCleanupStaleCriticalWorkloadPods(context.Background(), "vault", "statefulset", "vault"); err != nil { t.Fatalf("expected cleanup stale pod parse/owner/prefix branches, got %v", err) } }) t.Run("wait-vault-ready-error-tracking-and-ensure-unseal-branches", func(t *testing.T) { cfg := lifecycleConfig(t) run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") switch { case name == "kubectl" && strings.Contains(command, "jsonpath={.status.readyReplicas}"): return "", errors.New("ready query failed") case name == "kubectl" && strings.Contains(command, "get pod vault-0"): return "Running", nil case name == "kubectl" && strings.Contains(command, "vault status -format=json"): return `{"sealed":false}`, nil default: return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
} } orch := newLifecycleMatrixOrchestrator(t, cfg, false, run, run, "") ctx, cancel := context.WithCancel(context.Background()) cancel() if err := orch.TestHookWaitVaultReady(ctx, "vault", "statefulset", "vault"); !errors.Is(err, context.Canceled) { t.Fatalf("expected canceled vault wait branch with error tracking, got %v", err) } dry := newLifecycleMatrixOrchestrator(t, cfg, true, nil, nil, "") if err := dry.TestHookEnsureVaultUnsealed(context.Background()); err != nil { t.Fatalf("expected ensureVaultUnsealed dry-run fast path, got %v", err) } runSealedErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") switch { case name == "kubectl" && strings.Contains(command, "get pod vault-0"): return "Running", nil case name == "kubectl" && strings.Contains(command, "vault status -format=json"): return "", errors.New("vault status query failed") default: return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
} } orchSealedErr := newLifecycleMatrixOrchestrator(t, cfg, false, runSealedErr, runSealedErr, "") if err := orchSealedErr.TestHookEnsureVaultUnsealed(context.Background()); err == nil || !strings.Contains(err.Error(), "vault status check failed") { t.Fatalf("expected ensureVaultUnsealed sealed-check error branch, got %v", err) } }) t.Run("parse-vault-sealed-json-unmarshal-error", func(t *testing.T) { if _, err := cluster.TestHookParseVaultSealed(`{"sealed":tru}`); err == nil { t.Fatalf("expected parseVaultSealed json-unmarshal error branch") } }) }) t.Run("ingress-service-storage-timesync-final-closures", func(t *testing.T) { t.Run("startup-convergence-workload-and-stability-failure-branches", func(t *testing.T) { cfgWorkload := lifecycleConfig(t) cfgWorkload.Startup.RequireIngressChecklist = false cfgWorkload.Startup.RequireServiceChecklist = false cfgWorkload.Startup.RequireCriticalServiceEndpoints = false cfgWorkload.Startup.RequireFluxHealth = false cfgWorkload.Startup.RequireWorkloadConvergence = true cfgWorkload.Startup.ServiceChecklistStabilitySec = 0 runWorkload := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") if name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json") { return "", errors.New("controllers query failed") } return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
} orchWorkload := newLifecycleMatrixOrchestrator(t, cfgWorkload, false, runWorkload, runWorkload, "") if err := orchWorkload.TestHookWaitForStartupConvergence(context.Background()); err == nil || !strings.Contains(err.Error(), "query controllers") { t.Fatalf("expected startup convergence workload failure branch, got %v", err) } cfgStability := lifecycleConfig(t) cfgStability.Startup.RequireIngressChecklist = false cfgStability.Startup.RequireServiceChecklist = false cfgStability.Startup.RequireCriticalServiceEndpoints = false cfgStability.Startup.RequireFluxHealth = false cfgStability.Startup.RequireWorkloadConvergence = false cfgStability.Startup.ServiceChecklistStabilitySec = 1 cfgStability.Startup.ServiceChecklistPollSeconds = 1 runStability := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") if name == "kubectl" && strings.Contains(command, "get pods -A -o json") { return "", errors.New("pods query failed") } return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) } orchStability := newLifecycleMatrixOrchestrator(t, cfgStability, false, runStability, runStability, "") if err := orchStability.TestHookWaitForStartupConvergence(context.Background()); err == nil || !strings.Contains(err.Error(), "stability window failed") { t.Fatalf("expected startup convergence stability failure branch, got %v", err) } }) t.Run("ingress-host-discovery-and-autoheal-early-returns", func(t *testing.T) { cfg := lifecycleConfig(t) runHostsErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") if name == "kubectl" && strings.Contains(command, "get ingress -A -o json") { return "", errors.New("ingress list failed") } return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
} orchHostsErr := newLifecycleMatrixOrchestrator(t, cfg, false, runHostsErr, runHostsErr, "") if _, err := orchHostsErr.TestHookDiscoverIngressHosts(context.Background()); err == nil || !strings.Contains(err.Error(), "query ingresses") { t.Fatalf("expected discoverIngressHosts query error branch, got %v", err) } if _, detail := orchHostsErr.TestHookIngressChecklistReady(context.Background()); !strings.Contains(detail, "query ingresses") { t.Fatalf("expected ingress checklist to surface discovery error detail, got %q", detail) } runNs := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") if name == "kubectl" && strings.Contains(command, "get ingress -A -o json") { return `{"items":[{"metadata":{"namespace":""},"spec":{"rules":[{"host":""},{"host":"logs.bstein.dev"}]}}]}`, nil } return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) } orchNs := newLifecycleMatrixOrchestrator(t, cfg, false, runNs, runNs, "") namespaces, err := orchNs.TestHookDiscoverIngressNamespacesForHost(context.Background(), "logs.bstein.dev") if err != nil || len(namespaces) != 0 { t.Fatalf("expected namespace-empty/rule-empty skip branches, namespaces=%v err=%v", namespaces, err) } dry := newLifecycleMatrixOrchestrator(t, cfg, true, nil, nil, "") now := time.Now().UTC().Add(-time.Hour) dry.TestHookMaybeAutoHealIngressHostBackends(context.Background(), &now, "logs.bstein.dev: 502") orchEarly := newLifecycleMatrixOrchestrator(t, cfg, false, nil, nil, "") orchEarly.TestHookMaybeAutoHealIngressHostBackends(context.Background(), &now, "no-host-here") if host := orchEarly.TestHookChecklistFailureHost("https://logs.bstein.dev/login"); host != "" { t.Fatalf("expected checklistFailureHost colon-split edge branch, got %q", host) } }) t.Run("service-stability-error-branches", func(t *testing.T) { cfg := lifecycleConfig(t) runWorkloadsErr := func(ctx context.Context, timeout time.Duration, name 
string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") switch { case name == "kubectl" && strings.Contains(command, "get ingress -A -o json"): return `{"items":[{"metadata":{"namespace":"monitoring"},"spec":{"rules":[{"host":"logs.bstein.dev"}]}}]}`, nil case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"): return "", errors.New("workload list failed") } return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) } orchWorkloadsErr := newLifecycleMatrixOrchestrator(t, cfg, false, runWorkloadsErr, runWorkloadsErr, "") if _, err := orchWorkloadsErr.TestHookHealIngressHostBackendReplicas(context.Background(), "logs.bstein.dev"); err == nil || !strings.Contains(err.Error(), "query workloads") { t.Fatalf("expected healIngressHostBackendReplicas query error branch, got %v", err) } runDecodeErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") switch { case name == "kubectl" && strings.Contains(command, "get ingress -A -o json"): return `{"items":[{"metadata":{"namespace":"monitoring"},"spec":{"rules":[{"host":"logs.bstein.dev"}]}}]}`, nil case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"): return "{not-json", nil default: return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) } } orchDecodeErr := newLifecycleMatrixOrchestrator(t, cfg, false, runDecodeErr, runDecodeErr, "") if _, err := orchDecodeErr.TestHookHealIngressHostBackendReplicas(context.Background(), "logs.bstein.dev"); err == nil || !strings.Contains(err.Error(), "decode workloads") { t.Fatalf("expected healIngressHostBackendReplicas decode error branch, got %v", err) } runBodyErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
} orchBodyErr := newLifecycleMatrixOrchestrator(t, cfg, false, runBodyErr, runBodyErr, "") ln, err := net.Listen("tcp", "127.0.0.1:0") if err != nil { t.Fatalf("open probe listener: %v", err) } defer ln.Close() go func() { conn, acceptErr := ln.Accept() if acceptErr == nil { _, _ = conn.Write([]byte("HTTP/1.1 200 OK\r\nContent-Length: 100\r\n\r\nshort")) _ = conn.Close() } }() _, _, probeErr := orchBodyErr.TestHookHTTPChecklistProbe(context.Background(), config.ServiceChecklistCheck{ URL: "http://" + ln.Addr().String() + "/health", }) if probeErr == nil || (!strings.Contains(probeErr.Error(), "read response body") && !strings.Contains(probeErr.Error(), "request failed")) { t.Fatalf("expected checklist probe failure branch, got %v", probeErr) } cfgStability := lifecycleConfig(t) cfgStability.Startup.RequireFluxHealth = false cfgStability.Startup.RequireWorkloadConvergence = true cfgStability.Startup.RequireServiceChecklist = false cfgStability.Startup.RequireIngressChecklist = false runStability := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") if name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json") { return "", errors.New("controllers failed") } return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
} orchStability := newLifecycleMatrixOrchestrator(t, cfgStability, false, runStability, runStability, "") if err := orchStability.TestHookStartupStabilityHealthy(context.Background()); err == nil || !strings.Contains(err.Error(), "workload check error") { t.Fatalf("expected startupStabilityHealthy workload-error branch, got %v", err) } cfgWindow := lifecycleConfig(t) cfgWindow.Startup.ServiceChecklistStabilitySec = 1 cfgWindow.Startup.ServiceChecklistPollSeconds = 1 cfgWindow.Startup.RequireFluxHealth = false cfgWindow.Startup.RequireWorkloadConvergence = false cfgWindow.Startup.RequireServiceChecklist = false cfgWindow.Startup.RequireIngressChecklist = false orchWindow := newLifecycleMatrixOrchestrator(t, cfgWindow, false, nil, nil, "") ctx, cancel := context.WithCancel(context.Background()) cancel() if err := orchWindow.TestHookWaitForStabilityWindow(ctx); !errors.Is(err, context.Canceled) { t.Fatalf("expected waitForStabilityWindow ctx-canceled branch, got %v", err) } }) t.Run("storage-and-timesync-small-branches", func(t *testing.T) { cfgStorage := lifecycleConfig(t) cfgStorage.Startup.StorageMinReadyNodes = 1 orchStorageDry := newLifecycleMatrixOrchestrator(t, cfgStorage, true, nil, nil, "") if err := orchStorageDry.TestHookWaitForStorageReady(context.Background()); err != nil { t.Fatalf("expected waitForStorageReady dry-run branch, got %v", err) } cfgStorage.Startup.StorageCriticalPVCs = []string{" "} runStorage := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") if name == "kubectl" && strings.Contains(command, "get nodes.longhorn.io") { return "malformed\nnode-a:True:True\n", nil } return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
} orchStorage := newLifecycleMatrixOrchestrator(t, cfgStorage, false, runStorage, runStorage, "") ready, _, err := orchStorage.TestHookStorageReady(context.Background()) if err != nil || !ready { t.Fatalf("expected storage malformed-line and empty-pvc-entry branches, ready=%v err=%v", ready, err) } cfgSync := lifecycleConfig(t) cfgSync.Startup.TimeSyncMode = "quorum" cfgSync.Startup.TimeSyncQuorum = 99 cfgSync.SSHManagedNodes = []string{"titan-db"} runSync := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") switch { case name == "sh" && strings.Contains(command, "timedatectl show -p NTPSynchronized"): return "yes", nil case name == "ssh" && strings.Contains(command, "timedatectl show -p NTPSynchronized"): return "yes", nil default: return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) } } orchSync := newLifecycleMatrixOrchestrator(t, cfgSync, false, runSync, runSync, "") if err := orchSync.TestHookWaitForTimeSync(context.Background(), []string{"", "titan-db", "unmanaged"}); err != nil { t.Fatalf("expected timesync quorum clamp/success branches, got %v", err) } cfgSyncBad := lifecycleConfig(t) cfgSyncBad.SSHPort = 22 cfgSyncBad.SSHUser = "" cfgSyncBad.ControlPlanes = []string{"", "cp1"} cfgSyncBad.Workers = []string{"", "wk1"} cfgSyncBad.SSHManagedNodes = []string{"cp1"} cfgSyncBad.SSHNodeHosts = map[string]string{ "cp1": "bad host", "wk1": "wk1", } orchSyncBad := newLifecycleMatrixOrchestrator(t, cfgSyncBad, false, nil, nil, "") if err := orchSyncBad.TestHookValidateNodeInventory(); err == nil || !strings.Contains(err.Error(), "node inventory preflight failed") { t.Fatalf("expected validateNodeInventory host/user branches, got %v", err) } orchSyncDry := newLifecycleMatrixOrchestrator(t, lifecycleConfig(t), true, nil, nil, "") if err := orchSyncDry.TestHookWaitForTimeSync(context.Background(), []string{"titan-db"}); err != nil { t.Fatalf("expected 
waitForTimeSync dry-run branch, got %v", err) } }) }) t.Run("poststart-scaling-and-lifecycle-final-closures", func(t *testing.T) { t.Run("poststart-ssh-and-sensitive-branches", func(t *testing.T) { cfg := lifecycleConfig(t) cfg.SSHManagedNodes = nil cfg.SSHNodeUsers = map[string]string{"titan-db": "special"} cfg.SSHJumpHost = "jump-a" cfg.SSHJumpUser = "jumper" cfg.SSHNodeHosts["jump-a"] = "jump.example.internal" attempt := 0 run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { if name == "ssh" { attempt++ if attempt <= 2 { return "REMOTE HOST IDENTIFICATION HAS CHANGED!", errors.New("host key mismatch") } return "", errors.New("still failing") } if name == "curl" { return "200", nil } return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) } orch := newLifecycleMatrixOrchestrator(t, cfg, false, run, run, "/tmp/fake-kubeconfig") if _, err := orch.TestHookSSHWithTimeout(context.Background(), "titan-db", "echo ok", 2*time.Second); err == nil { t.Fatalf("expected sshWithTimeout retry-failure branch") } if !orch.TestHookSSHManaged("any-node") { t.Fatalf("expected sshManaged empty-allowlist branch") } if _, err := orch.TestHookRunSensitive(context.Background(), 2*time.Second, "sh", "-lc", "echo ok"); err != nil { t.Fatalf("expected runSensitive kubeconfig-env branch, got %v", err) } }) t.Run("scaling-final-branches", func(t *testing.T) { cfg := lifecycleConfig(t) cfg.Workers = nil runEffective := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { if name == "kubectl" && len(args) >= 2 && args[0] == "get" && args[1] == "nodes" { return "wk1 ", nil } return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
} orchEffective := newLifecycleMatrixOrchestrator(t, cfg, false, runEffective, runEffective, "") workers, err := orchEffective.TestHookEffectiveWorkers(context.Background()) if err != nil || len(workers) != 1 || workers[0] != "wk1" { t.Fatalf("expected effectiveWorkers discover-success branch, workers=%v err=%v", workers, err) } runNoWorkers := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { if name == "kubectl" && len(args) >= 2 && args[0] == "get" && args[1] == "nodes" { return "cp1 true ", nil } return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) } orchNoWorkers := newLifecycleMatrixOrchestrator(t, cfg, false, runNoWorkers, runNoWorkers, "") if _, err := orchNoWorkers.TestHookDiscoverWorkers(context.Background()); err == nil || !strings.Contains(err.Error(), "no workers discovered") { t.Fatalf("expected discoverWorkers no-workers branch, got %v", err) } runPatch := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") switch { case name == "kubectl" && strings.Contains(command, "-n flux-system get kustomizations.kustomize.toolkit.fluxcd.io"): return "apps\n", nil case name == "kubectl" && strings.Contains(command, "get helmreleases.helm.toolkit.fluxcd.io -A"): return "", errors.New("helmrelease list failed") default: return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
} } orchPatch := newLifecycleMatrixOrchestrator(t, cfg, false, runPatch, runPatch, "") if err := orchPatch.TestHookPatchFluxSuspendAll(context.Background(), true); err == nil || !strings.Contains(err.Error(), "helmrelease list failed") { t.Fatalf("expected patchFluxSuspendAll helmrelease-query error branch, got %v", err) } cfgScale := lifecycleConfig(t) cfgScale.Shutdown.ScaleParallelism = 0 runScale := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") switch { case name == "kubectl" && strings.Contains(command, "get deployment -A -o jsonpath="): return "monitoring\tgrafana\t1\nbad-line\n", nil case name == "kubectl" && strings.Contains(command, "get statefulset -A -o jsonpath="): return "", errors.New("statefulset query failed") default: return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) } } orchScale := newLifecycleMatrixOrchestrator(t, cfgScale, false, runScale, runScale, "") if _, err := orchScale.TestHookListScalableWorkloads(context.Background()); err == nil || !strings.Contains(err.Error(), "collect statefulsets") { t.Fatalf("expected listScalableWorkloads malformed/statefulset-error branches, got %v", err) } cfgRestore := lifecycleConfig(t) cfgRestore.Shutdown.ScaleParallelism = 0 if err := os.MkdirAll(cfgRestore.State.Dir, 0o755); err != nil { t.Fatalf("mkdir restore state dir: %v", err) } snapshotPath := filepath.Join(cfgRestore.State.Dir, "scaled-workloads.json") if err := os.WriteFile(snapshotPath, []byte(`{"generated_at":"2026-01-01T00:00:00Z","entries":[{"namespace":"monitoring","kind":"deployment","name":"grafana","replicas":1}]}`), 0o644); err != nil { t.Fatalf("write restore snapshot: %v", err) } runRestore := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") if name == "kubectl" && strings.Contains(command, "scale deployment grafana 
--replicas=1") { return "", errors.New("scale restore failed") } return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) } orchRestore := newLifecycleMatrixOrchestrator(t, cfgRestore, false, runRestore, runRestore, "") if err := orchRestore.TestHookRestoreScaledApps(context.Background()); err == nil || !strings.Contains(err.Error(), "scaling had 1 errors") { t.Fatalf("expected restoreScaledApps scale error branch, got %v", err) } orchRestoreDry := newLifecycleMatrixOrchestrator(t, cfgRestore, true, runRestore, runRestore, "") if err := orchRestoreDry.TestHookRestoreScaledApps(context.Background()); err != nil { t.Fatalf("expected restoreScaledApps dry-run branch, got %v", err) } }) t.Run("lifecycle-startup-etcd-shutdown-final-branches", func(t *testing.T) { t.Run("startup-cooldown-and-poststart-failure-paths", func(t *testing.T) { cfg := lifecycleFastConfig(t) cfg.Startup.ShutdownCooldownSeconds = 300 cfg.Startup.RequirePostStartProbes = true cfg.Startup.PostStartProbes = []string{"https://logs.bstein.dev/health"} if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{ State: state.IntentShutdownComplete, Reason: "recent-shutdown", Source: "test", UpdatedAt: time.Now().UTC(), }); err != nil { t.Fatalf("seed cooldown intent: %v", err) } orchCooldown := newLifecycleMatrixOrchestrator(t, cfg, false, nil, nil, "") ctxCooldown, cancelCooldown := context.WithCancel(context.Background()) cancelCooldown() if err := orchCooldown.Startup(ctxCooldown, cluster.StartupOptions{Reason: "cooldown"}); err == nil || !strings.Contains(err.Error(), "startup canceled while waiting for shutdown cooldown") { t.Fatalf("expected startup cooldown cancel branch, got %v", err) } cfgPost := lifecycleFastConfig(t) cfgPost.Startup.RequirePostStartProbes = true cfgPost.Startup.PostStartProbes = []string{"https://logs.bstein.dev/health"} cfgPost.Startup.PostStartProbeWaitSeconds = 1 cfgPost.Startup.PostStartProbePollSeconds = 1 cfgPost.Startup.APIWaitSeconds = 1 
cfgPost.Startup.APIPollSeconds = 5 runPost := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { if name == "curl" { return "", errors.New("probe failed") } return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) } orchPost := newLifecycleMatrixOrchestrator(t, cfgPost, false, runPost, runPost, "") if err := orchPost.Startup(context.Background(), cluster.StartupOptions{Reason: "post-start"}); err == nil || !strings.Contains(err.Error(), "post-start probes") { t.Fatalf("expected startup post-start probe failure branch, got %v", err) } }) t.Run("startup-bootstrap-and-flux-branch-error-paths", func(t *testing.T) { cfg := lifecycleFastConfig(t) repo := t.TempDir() cfg.IACRepoPath = repo cfg.LocalBootstrapPaths = []string{"services/bootstrap"} if err := os.MkdirAll(filepath.Join(repo, ".git"), 0o755); err != nil { t.Fatalf("mkdir .git: %v", err) } if err := os.MkdirAll(filepath.Join(repo, "services", "bootstrap"), 0o755); err != nil { t.Fatalf("mkdir bootstrap dir: %v", err) } readyCalls := 0 run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") switch { case name == "kubectl" && strings.Contains(command, "jsonpath={.spec.ref.branch}"): return "", errors.New("branch read failed") case name == "kubectl" && strings.Contains(command, "jsonpath={.status.conditions[?(@.type==\"Ready\")].status}"): readyCalls++ if readyCalls == 1 { return "", nil } return "True", nil case name == "git": return "", nil case name == "sh" && strings.Contains(command, "kubectl kustomize"): return "apiVersion: v1\nkind: Namespace\nmetadata:\n name: bootstrap\n", nil default: return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
} } orch := newLifecycleMatrixOrchestrator(t, cfg, false, run, run, "") if err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "flux-branch-error"}); err == nil || !strings.Contains(err.Error(), "read flux source branch") { t.Fatalf("expected startup ensureFluxBranch error branch, got %v", err) } }) t.Run("startup-intent-clear-and-auto-etcd-flux-patch-branches", func(t *testing.T) { t.Run("clear-stale-startup-intent-success-path", func(t *testing.T) { cfg := lifecycleFastConfig(t) if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{ State: state.IntentStartupInProgress, Reason: "stale-startup", Source: "test", UpdatedAt: time.Now().UTC(), }); err != nil { t.Fatalf("seed stale startup intent: %v", err) } run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") if name == "kubectl" && strings.Contains(command, "jsonpath={.spec.url}") { return "", errors.New("flux url read failed") } return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
} orch := newLifecycleMatrixOrchestrator(t, cfg, false, run, run, "") if err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "stale-startup-clear"}); err == nil || !strings.Contains(err.Error(), "read flux source url") { t.Fatalf("expected startup flux-url guard error after stale startup clear, got %v", err) } }) t.Run("clear-stale-shutdown-intent-success-path", func(t *testing.T) { cfg := lifecycleFastConfig(t) if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{ State: state.IntentShuttingDown, Reason: "stale-shutdown", Source: "test", UpdatedAt: time.Now().UTC().Add(-2 * time.Hour), }); err != nil { t.Fatalf("seed stale shutdown intent: %v", err) } run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") if name == "kubectl" && strings.Contains(command, "jsonpath={.spec.url}") { return "", errors.New("flux url read failed") } return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
} orch := newLifecycleMatrixOrchestrator(t, cfg, false, run, run, "") if err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "stale-shutdown-clear"}); err == nil || !strings.Contains(err.Error(), "read flux source url") { t.Fatalf("expected startup flux-url guard error after stale shutdown clear, got %v", err) } }) t.Run("auto-etcd-default-control-plane-selection", func(t *testing.T) { cfg := lifecycleFastConfig(t) cfg.Startup.AutoEtcdRestoreOnAPIFailure = true apiVersionCalls := 0 run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") switch { case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"): apiVersionCalls++ if apiVersionCalls == 1 { return "", errors.New("api down") } return "v1.31.0", nil case name == "ssh" && strings.Contains(command, "systemctl cat k3s"): return "", errors.New("unit read failed") case name == "kubectl" && strings.Contains(command, "jsonpath={.spec.url}"): return "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git", nil case name == "kubectl" && strings.Contains(command, "jsonpath={.spec.ref.branch}"): return "main", nil default: return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...) 
} // closes the stubbed runner's command switch (opened before this excerpt)
		}
		// Startup with the failing stub must surface the automatic etcd
		// restore failure rather than completing.
		orch := newLifecycleMatrixOrchestrator(t, cfg, false, run, run, "")
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "auto-etcd-default-cp"})
		if err == nil || !strings.Contains(err.Error(), "automatic etcd restore failed") {
			t.Fatalf("expected startup auto-etcd restore failure path, got %v", err)
		}
	})

	t.Run("ensure-flux-branch-patch-error", func(t *testing.T) {
		cfg := lifecycleFastConfig(t)
		// Stub runner: answer the flux-source inspection queries (version,
		// spec.url, spec.ref.branch) successfully, then fail the
		// gitrepository patch so the branch-patch error branch is exercised.
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
				return "v1.31.0", nil
			case name == "kubectl" && strings.Contains(command, "jsonpath={.spec.url}"):
				return "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git", nil
			case name == "kubectl" && strings.Contains(command, "jsonpath={.spec.ref.branch}"):
				return "feature/sso", nil
			case name == "kubectl" && strings.Contains(command, "patch gitrepository flux-system"):
				return "", errors.New("branch patch failed")
			default:
				// Everything else is handled by the shared lifecycle dispatcher.
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch := newLifecycleMatrixOrchestrator(t, cfg, false, run, run, "")
		err := orch.Startup(context.Background(), cluster.StartupOptions{
			Reason:          "ensure-flux-branch-patch",
			ForceFluxBranch: "main",
		})
		if err == nil || !strings.Contains(err.Error(), "set flux source branch") {
			t.Fatalf("expected startup ensureFluxBranch patch error path, got %v", err)
		}
	})
	}) // closes a subtest group opened before this excerpt

	t.Run("etcd-restore-and-shutdown-error-paths", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// Multi-node inventory so restore/shutdown touch several SSH hosts.
		cfg.ControlPlanes = []string{"titan-db", "titan-24"}
		cfg.SSHManagedNodes = []string{"titan-db", "titan-24", "titan-23"}
		cfg.SSHNodeHosts["titan-24"] = "titan-24"
		cfg.SSHNodeHosts["titan-23"] = "titan-23"

		// Branch 1: reading the k3s unit file fails, so the datastore-mode
		// inspection step of EtcdRestore must error out.
		runEtcdModeErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(command, "systemctl cat k3s") {
				return "", errors.New("unit read failed")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchEtcdModeErr := newLifecycleMatrixOrchestrator(t, cfg, false, runEtcdModeErr, runEtcdModeErr, "")
		if err := orchEtcdModeErr.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{ControlPlane: "titan-db"}); err == nil || !strings.Contains(err.Error(), "inspect k3s service on titan-db for datastore mode") {
			t.Fatalf("expected EtcdRestore datastore-mode error branch, got %v", err)
		}

		// Branch 2: datastore mode resolves, but listing snapshots fails, so
		// resolving the latest snapshot must error.
		runLatestErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
				return "ExecStart=/usr/local/bin/k3s server", nil
			case name == "ssh" && strings.Contains(command, "etcd-snapshot ls"):
				return "", errors.New("snapshot list failed")
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchLatestErr := newLifecycleMatrixOrchestrator(t, cfg, false, runLatestErr, runLatestErr, "")
		if err := orchLatestErr.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{ControlPlane: "titan-db"}); err == nil || !strings.Contains(err.Error(), "resolve latest etcd snapshot") {
			t.Fatalf("expected EtcdRestore latest-snapshot error branch, got %v", err)
		}

		// Branch 3: the snapshot exists but `stat -c %s` reports 1 byte, so
		// snapshot verification must reject it as too small.
		runVerifyErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
				return "ExecStart=/usr/local/bin/k3s server", nil
			case name == "ssh" && strings.Contains(command, "stat -c %s"):
				return "1", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchVerifyErr := newLifecycleMatrixOrchestrator(t, cfg, false, runVerifyErr, runVerifyErr, "")
		if err := orchVerifyErr.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{
			ControlPlane: "titan-db",
			SnapshotPath: "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown",
		}); err == nil || !strings.Contains(err.Error(), "snapshot too small") {
			t.Fatalf("expected EtcdRestore verify-snapshot error branch, got %v", err)
		}

		// Branch 4: snapshot verification and cluster-reset succeed, but
		// restarting k3s on the restore control plane fails.
		runStartErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
				return "ExecStart=/usr/local/bin/k3s server", nil
			case name == "ssh" && strings.Contains(command, "stat -c %s"):
				// 2 MiB: large enough to pass the size check.
				return "2097152", nil
			case name == "ssh" && strings.Contains(command, "sha256sum"):
				return "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", nil
			case name == "ssh" && strings.Contains(command, "etcd-snapshot ls"):
				return "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown", nil
			case name == "ssh" && strings.Contains(command, "server --cluster-reset"):
				return "", nil
			case name == "ssh" && strings.Contains(command, "sudo systemctl start k3s || true") && strings.Contains(command, "atlas@titan-db"):
				return "", errors.New("start failed")
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchStartErr := newLifecycleMatrixOrchestrator(t, cfg, false, runStartErr, runStartErr, "")
		if err := orchStartErr.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{
			ControlPlane: "titan-db",
			SnapshotPath: "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown",
		}); err == nil || !strings.Contains(err.Error(), "failed to start k3s on restore control plane") {
			t.Fatalf("expected EtcdRestore restore-node start error branch, got %v", err)
		}

		// Shutdown branch: LockPath pointing at an existing directory makes
		// lock acquisition fail, so Shutdown must error immediately.
		cfgShutdownLock := lifecycleConfig(t)
		cfgShutdownLock.State.LockPath = t.TempDir()
		orchShutdownLock := newLifecycleMatrixOrchestrator(t, cfgShutdownLock, false, nil, nil, "")
		if err := orchShutdownLock.Shutdown(context.Background(), cluster.ShutdownOptions{Reason: "lock"}); err == nil {
			t.Fatalf("expected shutdown lock acquire failure branch")
		}

		// Shutdown branch: with no configured workers, Shutdown discovers
		// them via `kubectl get nodes`; that query is forced to fail.
		cfgShutdownInventory := lifecycleConfig(t)
		cfgShutdownInventory.Workers = nil
		cfgShutdownInventory.SSHManagedNodes = []string{"titan-db"}
		cfgShutdownInventory.SSHNodeHosts = map[string]string{
			"titan-db": "titan-db",
		}
		runWorkersErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			if name == "kubectl" && len(args) >= 2 && args[0] == "get" && args[1] == "nodes" {
				return "", errors.New("discover workers failed")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchWorkersErr := newLifecycleMatrixOrchestrator(t, cfgShutdownInventory, false, runWorkersErr, runWorkersErr, "")
		if err := orchWorkersErr.Shutdown(context.Background(), cluster.ShutdownOptions{Reason: "workers"}); err == nil || !strings.Contains(err.Error(), "discover workers") {
			t.Fatalf("expected shutdown effectiveWorkers error branch, got %v", err)
		}
	})
	// The remaining closers balance t.Run scopes opened before this excerpt.
	})
	})
}