// Drill coverage for lifecycle orchestrator startup paths.

package orchestrator
import (
"context"
"encoding/base64"
"fmt"
"io"
"log"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/cluster"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/execx"
"scm.bstein.dev/bstein/ananke/internal/state"
)
// newLifecycleOrchestratorWithConfig runs one orchestration or CLI step.
// Signature: newLifecycleOrchestratorWithConfig(t *testing.T, cfg config.Config, run commandOverride, runSensitive commandOverride) (*cluster.Orchestrator, *commandRecorder).
// Why: centralizes top-level orchestrator construction so branch-heavy startup tests stay focused on intent.
func newLifecycleOrchestratorWithConfig(
	t *testing.T,
	cfg config.Config,
	run func(context.Context, time.Duration, string, ...string) (string, error),
	runSensitive func(context.Context, time.Duration, string, ...string) (string, error),
) (*cluster.Orchestrator, *commandRecorder) {
	t.Helper()
	// The orchestrator persists run history under the state dir; make sure it exists.
	if err := os.MkdirAll(cfg.State.Dir, 0o755); err != nil {
		t.Fatalf("ensure state dir: %v", err)
	}
	recorder := &commandRecorder{}
	// Any override the caller left nil falls back to the shared recording dispatcher;
	// the dispatcher is only built when at least one slot actually needs it.
	switch {
	case run == nil && runSensitive == nil:
		shared := lifecycleDispatcher(recorder)
		run, runSensitive = shared, shared
	case run == nil:
		run = lifecycleDispatcher(recorder)
	case runSensitive == nil:
		runSensitive = lifecycleDispatcher(recorder)
	}
	orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
	orch.SetCommandOverrides(run, runSensitive)
	return orch, recorder
}
// TestLifecycleStartupBootstrapFallbackWithPeerGuard runs one orchestration or CLI step.
// Signature: TestLifecycleStartupBootstrapFallbackWithPeerGuard(t *testing.T).
// Why: covers peer intent guard, access reconciliation, local bootstrap fallback, and cache apply branches.
func TestLifecycleStartupBootstrapFallbackWithPeerGuard(t *testing.T) {
	cfg := lifecycleConfig(t)
	repo := t.TempDir()
	cfg.IACRepoPath = repo
	cfg.LocalBootstrapPaths = []string{"services/bootstrap"}
	cfg.Startup.ReconcileAccessOnBoot = true
	cfg.Startup.RequireNodeSSHAuth = true
	cfg.Startup.NodeSSHAuthWaitSeconds = 2
	cfg.Startup.NodeSSHAuthPollSeconds = 1
	cfg.Startup.RequireCriticalServiceEndpoints = false
	cfg.Startup.RequireFluxHealth = false
	cfg.Startup.RequireWorkloadConvergence = false
	cfg.Coordination.Role = "coordinator"
	cfg.Coordination.PeerHosts = []string{"titan-24"}
	cfg.SSHManagedNodes = append(cfg.SSHManagedNodes, "titan-24")
	cfg.SSHNodeHosts["titan-24"] = "titan-24"
	// The repo must look like a git checkout containing the configured bootstrap path.
	if err := os.MkdirAll(filepath.Join(repo, ".git"), 0o755); err != nil {
		t.Fatalf("mkdir .git: %v", err)
	}
	if err := os.MkdirAll(filepath.Join(repo, "services", "bootstrap"), 0o755); err != nil {
		t.Fatalf("mkdir bootstrap path: %v", err)
	}
	recorder := &commandRecorder{}
	base := lifecycleDispatcher(recorder)
	fluxReadyChecks := 0
	cacheApplied := false
	// run answers the peer intent probe with an active-bootstrap marker, fails flux
	// readiness twice to drive the local bootstrap fallback, fails `apply -k`, and
	// lets the bootstrap-cache apply succeed so the cache branch is exercised.
	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		switch {
		case name == "ssh" && strings.Contains(command, "ananke intent --config /etc/ananke/ananke.yaml"):
			recorder.record(name, args)
			now := time.Now().UTC().Format(time.RFC3339)
			return "__ANANKE_BOOTSTRAP_ACTIVE__\nintent=startup_in_progress reason=\"manual-startup\" source=peer updated_at=" + now + "\n", nil
		case name == "ssh" && strings.Contains(command, "/usr/bin/systemctl --version"):
			recorder.record(name, args)
			return "systemd 252", nil
		case name == "kubectl" &&
			strings.Contains(command, "get gitrepository flux-system") &&
			strings.Contains(command, "jsonpath={.status.conditions[?(@.type==\"Ready\")].status}"):
			recorder.record(name, args)
			fluxReadyChecks++
			if fluxReadyChecks <= 2 {
				return "", fmt.Errorf("flux source not ready")
			}
			return "True", nil
		case name == "kubectl" && strings.Contains(command, " apply -k "):
			recorder.record(name, args)
			return "", fmt.Errorf("apply -k failed")
		case name == "kubectl" && strings.Contains(command, " apply -f ") && strings.Contains(command, "bootstrap-cache"):
			recorder.record(name, args)
			cacheApplied = true
			return "", nil
		default:
			return base(ctx, timeout, name, args...)
		}
	}
	// runSensitive stubs the git sync sequence and lets the kustomize render succeed
	// while the rendered-pipe apply fails, forcing the cache apply fallback above.
	runSensitive := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		switch {
		case name == "git" && strings.Contains(command, "status --porcelain"):
			recorder.record(name, args)
			return "", nil
		case name == "git" && strings.Contains(command, "fetch origin --prune"):
			recorder.record(name, args)
			return "", nil
		case name == "git" && strings.Contains(command, "checkout main"):
			recorder.record(name, args)
			return "", nil
		case name == "git" && strings.Contains(command, "reset --hard origin/main"):
			recorder.record(name, args)
			return "", nil
		case name == "sh" && strings.Contains(command, "kubectl kustomize"):
			recorder.record(name, args)
			if strings.Contains(command, "| kubectl apply -f -") {
				return "", fmt.Errorf("fallback kustomize pipe failed")
			}
			return "apiVersion: v1\nkind: Namespace\nmetadata:\n name: bootstrap\n", nil
		default:
			return base(ctx, timeout, name, args...)
		}
	}
	// Build via the shared constructor for consistency with the other drills; this
	// test keeps its own recorder, so the helper-provided one is discarded.
	orch, _ := newLifecycleOrchestratorWithConfig(t, cfg, run, runSensitive)
	if err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "bootstrap-fallback"}); err != nil {
		t.Fatalf("startup failed: %v", err)
	}
	if !cacheApplied {
		t.Fatalf("expected bootstrap cache apply fallback to run")
	}
	if !recorder.contains("ananke intent --config /etc/ananke/ananke.yaml") {
		t.Fatalf("expected peer intent guard command in call log")
	}
}
// TestLifecycleStartupCoversTimeSyncStorageAndServiceChecklist runs one orchestration or CLI step.
// Signature: TestLifecycleStartupCoversTimeSyncStorageAndServiceChecklist(t *testing.T).
// Why: covers time-sync quorum path, storage readiness checks, and service checklist success path.
func TestLifecycleStartupCoversTimeSyncStorageAndServiceChecklist(t *testing.T) {
	cfg := lifecycleConfig(t)
	cfg.ControlPlanes = []string{"titan-db", "titan-23"}
	cfg.Workers = []string{"titan-24"}
	cfg.SSHManagedNodes = []string{"titan-db", "titan-23", "titan-24"}
	cfg.SSHNodeHosts["titan-23"] = "titan-23"
	cfg.SSHNodeHosts["titan-24"] = "titan-24"
	cfg.Startup.RequireTimeSync = true
	cfg.Startup.TimeSyncMode = "quorum"
	cfg.Startup.TimeSyncQuorum = 1
	cfg.Startup.TimeSyncWaitSeconds = 2
	cfg.Startup.TimeSyncPollSeconds = 1
	cfg.Startup.RequireStorageReady = true
	cfg.Startup.StorageReadyWaitSeconds = 2
	cfg.Startup.StorageReadyPollSeconds = 1
	cfg.Startup.StorageMinReadyNodes = 2
	cfg.Startup.StorageCriticalPVCs = []string{"monitoring/grafana-data"}
	cfg.Startup.RequireServiceChecklist = true
	cfg.Startup.ServiceChecklistWaitSeconds = 2
	cfg.Startup.ServiceChecklistPollSeconds = 1
	cfg.Startup.ServiceChecklistStabilitySec = 0
	cfg.Startup.RequireCriticalServiceEndpoints = false
	cfg.Startup.RequireFluxHealth = false
	cfg.Startup.RequireWorkloadConvergence = false
	// Local HTTP endpoint backs the single service-checklist entry below.
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
		_, _ = w.Write([]byte(`{"status":"ok"}`))
	}))
	defer server.Close()
	cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
		{
			Name:             "status",
			URL:              server.URL,
			AcceptedStatuses: []int{200},
			BodyContains:     `"status":"ok"`,
			TimeoutSeconds:   5,
		},
	}
	recorder := &commandRecorder{}
	base := lifecycleDispatcher(recorder)
	// run reports local NTP sync, remote sync on titan-db only (enough for quorum=1),
	// two ready longhorn nodes, and the critical PVC bound.
	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		switch {
		case name == "sh" && strings.Contains(command, "timedatectl show -p NTPSynchronized --value"):
			recorder.record(name, args)
			return "yes", nil
		case name == "ssh" && strings.Contains(command, "timedatectl show -p NTPSynchronized --value"):
			recorder.record(name, args)
			if strings.Contains(command, "titan-db") {
				return "yes", nil
			}
			return "no", nil
		case name == "kubectl" && strings.Contains(command, "nodes.longhorn.io"):
			recorder.record(name, args)
			return "lh-a:True:True\nlh-b:True:True\n", nil
		case name == "kubectl" && strings.Contains(command, "-n monitoring get pvc grafana-data"):
			recorder.record(name, args)
			return "Bound", nil
		default:
			return base(ctx, timeout, name, args...)
		}
	}
	// Build via the shared constructor for consistency with the other drills; this
	// test keeps its own recorder, so the helper-provided one is discarded.
	orch, _ := newLifecycleOrchestratorWithConfig(t, cfg, run, run)
	if err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "timesync-storage-service"}); err != nil {
		t.Fatalf("startup failed: %v", err)
	}
	if !recorder.contains("nodes.longhorn.io") {
		t.Fatalf("expected longhorn readiness query in call log")
	}
	if !recorder.contains("timedatectl show -p NTPSynchronized") {
		t.Fatalf("expected timedatectl checks in call log")
	}
}
// TestLifecycleStartupFailsOnNodeSSHAuthDenied runs one orchestration or CLI step.
// Signature: TestLifecycleStartupFailsOnNodeSSHAuthDenied(t *testing.T).
// Why: verifies startup fails fast when SSH auth is denied for managed nodes.
func TestLifecycleStartupFailsOnNodeSSHAuthDenied(t *testing.T) {
	cfg := lifecycleConfig(t)
	cfg.Startup.RequireNodeSSHAuth = true
	cfg.Startup.NodeSSHAuthWaitSeconds = 1
	cfg.Startup.NodeSSHAuthPollSeconds = 1
	cfg.Startup.RequireCriticalServiceEndpoints = false
	cfg.Startup.RequireFluxHealth = false
	cfg.Startup.RequireWorkloadConvergence = false
	recorder := &commandRecorder{}
	dispatch := lifecycleDispatcher(recorder)
	// Deny the SSH auth probe for titan-23; every other command flows to the dispatcher.
	override := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		joined := name + " " + strings.Join(args, " ")
		isAuthProbe := name == "ssh" &&
			strings.Contains(joined, "__ANANKE_SSH_AUTH_OK__") &&
			strings.Contains(joined, "titan-23")
		if !isAuthProbe {
			return dispatch(ctx, timeout, name, args...)
		}
		recorder.record(name, args)
		return "Permission denied (publickey)", fmt.Errorf("permission denied")
	}
	orch, _ := newLifecycleOrchestratorWithConfig(t, cfg, override, override)
	err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "ssh-auth-denied"})
	if err == nil {
		t.Fatalf("expected startup to fail when ssh auth is denied")
	}
	if !strings.Contains(err.Error(), "ssh auth gate failed") {
		t.Fatalf("unexpected startup error: %v", err)
	}
}
// TestLifecycleStartupVaultAutoUnsealPath runs one orchestration or CLI step.
// Signature: TestLifecycleStartupVaultAutoUnsealPath(t *testing.T).
// Why: covers vault sealed detection, unseal key retrieval, and auto-unseal retry success.
func TestLifecycleStartupVaultAutoUnsealPath(t *testing.T) {
	cfg := lifecycleConfig(t)
	cfg.Startup.RequireCriticalServiceEndpoints = false
	cfg.Startup.RequireFluxHealth = false
	cfg.Startup.RequireWorkloadConvergence = false
	recorder := &commandRecorder{}
	base := lifecycleDispatcher(recorder)
	vaultReadyChecks := 0
	vaultSealedChecks := 0
	// run reports the vault pod Running, the statefulset unready twice before ready,
	// sealed once before unsealed, and serves the unseal key from the vault-init secret.
	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		switch {
		case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
			recorder.record(name, args)
			return "Running", nil
		case name == "kubectl" && strings.Contains(command, "-n vault get statefulset vault -o jsonpath={.status.readyReplicas}"):
			recorder.record(name, args)
			vaultReadyChecks++
			if vaultReadyChecks <= 2 {
				return "0", nil
			}
			return "1", nil
		case name == "kubectl" && strings.Contains(command, "vault status -format=json"):
			recorder.record(name, args)
			vaultSealedChecks++
			if vaultSealedChecks <= 1 {
				return `{"sealed":true,"initialized":true}`, nil
			}
			return `{"sealed":false,"initialized":true}`, nil
		case name == "kubectl" && strings.Contains(command, "-n vault get secret vault-init -o jsonpath={.data.unseal_key_b64}"):
			recorder.record(name, args)
			return base64.StdEncoding.EncodeToString([]byte("unseal-key")), nil
		default:
			return base(ctx, timeout, name, args...)
		}
	}
	// runSensitive accepts the unseal command and defers everything else to run.
	runSensitive := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		if name == "kubectl" && strings.Contains(command, "vault operator unseal unseal-key") {
			recorder.record(name, args)
			return "ok", nil
		}
		return run(ctx, timeout, name, args...)
	}
	// Build via the shared constructor for consistency with the other drills; this
	// test keeps its own recorder, so the helper-provided one is discarded.
	orch, _ := newLifecycleOrchestratorWithConfig(t, cfg, run, runSensitive)
	if err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "vault-unseal"}); err != nil {
		t.Fatalf("startup failed: %v", err)
	}
	if !recorder.contains("vault operator unseal unseal-key") {
		t.Fatalf("expected vault unseal command in call log")
	}
}
// TestLifecycleStartupServiceChecklistAutoHealsIngressBackends runs one orchestration or CLI step.
// Signature: TestLifecycleStartupServiceChecklistAutoHealsIngressBackends(t *testing.T).
// Why: covers service-checklist failure recovery by auto-healing ingress namespace backend replicas.
func TestLifecycleStartupServiceChecklistAutoHealsIngressBackends(t *testing.T) {
	cfg := lifecycleConfig(t)
	cfg.Startup.RequireCriticalServiceEndpoints = false
	cfg.Startup.RequireFluxHealth = false
	cfg.Startup.RequireWorkloadConvergence = false
	cfg.Startup.RequireServiceChecklist = true
	cfg.Startup.ServiceChecklistWaitSeconds = 3
	cfg.Startup.ServiceChecklistPollSeconds = 1
	cfg.Startup.ServiceChecklistStabilitySec = 0
	requests := 0
	// First poll reports the database still booting so the checklist fails once
	// and the auto-heal branch runs; subsequent polls succeed.
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		requests++
		w.WriteHeader(http.StatusOK)
		if requests <= 1 {
			_, _ = w.Write([]byte(`{"database":"booting"}`))
			return
		}
		_, _ = w.Write([]byte(`{"database":"ok"}`))
	}))
	defer server.Close()
	cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
		{
			Name:             "grafana-check",
			URL:              server.URL,
			AcceptedStatuses: []int{200},
			BodyContains:     `"database":"ok"`,
			TimeoutSeconds:   5,
		},
	}
	recorder := &commandRecorder{}
	base := lifecycleDispatcher(recorder)
	// run exposes one ingress backed by a zero-replica deployment and accepts the
	// scale-up and rollout-status commands the auto-heal branch issues.
	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		switch {
		case name == "kubectl" && strings.Contains(command, "get ingress -A -o json"):
			recorder.record(name, args)
			return `{"items":[{"metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"rules":[{"host":"127.0.0.1"}]}}]}`, nil
		case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
			recorder.record(name, args)
			return `{"items":[{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":0,"template":{"spec":{}}},"status":{"readyReplicas":0}}]}`, nil
		case name == "kubectl" && strings.Contains(command, "-n monitoring scale deployment grafana --replicas=1"):
			recorder.record(name, args)
			return "", nil
		case name == "kubectl" && strings.Contains(command, "rollout status deployment/grafana"):
			recorder.record(name, args)
			return "rolled out", nil
		default:
			return base(ctx, timeout, name, args...)
		}
	}
	// Build via the shared constructor for consistency with the other drills; this
	// test keeps its own recorder, so the helper-provided one is discarded.
	orch, _ := newLifecycleOrchestratorWithConfig(t, cfg, run, run)
	if err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "service-heal"}); err != nil {
		t.Fatalf("startup failed: %v", err)
	}
	if !recorder.contains("scale deployment grafana --replicas=1") {
		t.Fatalf("expected auto-heal scale command in call log")
	}
}
// TestLifecycleStartupWithPostStartProbes runs one orchestration or CLI step.
// Signature: TestLifecycleStartupWithPostStartProbes(t *testing.T).
// Why: covers post-start probe readiness checks in the final startup phase.
func TestLifecycleStartupWithPostStartProbes(t *testing.T) {
	cfg := lifecycleConfig(t)
	cfg.Startup.RequireCriticalServiceEndpoints = false
	cfg.Startup.RequireFluxHealth = false
	cfg.Startup.RequireWorkloadConvergence = false
	cfg.Startup.RequirePostStartProbes = true
	cfg.Startup.PostStartProbeWaitSeconds = 2
	cfg.Startup.PostStartProbePollSeconds = 1
	// Local HTTP endpoint serves as the single post-start probe target.
	probe := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
		_, _ = w.Write([]byte("ok"))
	}))
	defer probe.Close()
	cfg.Startup.PostStartProbes = []string{probe.URL}
	recorder := &commandRecorder{}
	dispatch := lifecycleDispatcher(recorder)
	// Answer curl invocations with a 200; everything else flows to the dispatcher.
	override := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		if name != "curl" {
			return dispatch(ctx, timeout, name, args...)
		}
		recorder.record(name, args)
		return "200", nil
	}
	orch, _ := newLifecycleOrchestratorWithConfig(t, cfg, override, override)
	if err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "post-start-probes"}); err != nil {
		t.Fatalf("startup failed: %v", err)
	}
}
// TestLifecycleStartupForceFluxBranchPatchesSource runs one orchestration or CLI step.
// Signature: TestLifecycleStartupForceFluxBranchPatchesSource(t *testing.T).
// Why: covers the intentional branch-patch path used during controlled startup override scenarios.
func TestLifecycleStartupForceFluxBranchPatchesSource(t *testing.T) {
	cfg := lifecycleConfig(t)
	cfg.Startup.RequireCriticalServiceEndpoints = false
	cfg.Startup.RequireFluxHealth = false
	cfg.Startup.RequireWorkloadConvergence = false
	recorder := &commandRecorder{}
	base := lifecycleDispatcher(recorder)
	patched := false
	// run records the GitRepository merge patch so the test can assert the
	// forced-branch override actually reached kubectl.
	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		if name == "kubectl" && strings.Contains(command, "patch gitrepository flux-system --type=merge") {
			recorder.record(name, args)
			patched = true
			return "", nil
		}
		return base(ctx, timeout, name, args...)
	}
	// Build via the shared constructor for consistency with the other drills; this
	// test keeps its own recorder, so the helper-provided one is discarded.
	orch, _ := newLifecycleOrchestratorWithConfig(t, cfg, run, run)
	if err := orch.Startup(context.Background(), cluster.StartupOptions{
		Reason:          "force-branch",
		ForceFluxBranch: "feature/sso",
	}); err != nil {
		t.Fatalf("startup failed: %v", err)
	}
	if !patched {
		t.Fatalf("expected forced branch startup to patch GitRepository branch")
	}
}
// TestLifecycleStartupPeerRecentShutdownBlocks runs one orchestration or CLI step.
// Signature: TestLifecycleStartupPeerRecentShutdownBlocks(t *testing.T).
// Why: validates peer coordination cooldown guard before startup proceeds.
func TestLifecycleStartupPeerRecentShutdownBlocks(t *testing.T) {
	cfg := lifecycleConfig(t)
	cfg.Coordination.PeerHosts = []string{"titan-24"}
	cfg.SSHManagedNodes = append(cfg.SSHManagedNodes, "titan-24")
	cfg.SSHNodeHosts["titan-24"] = "titan-24"
	recorder := &commandRecorder{}
	base := lifecycleDispatcher(recorder)
	// run answers the peer intent probe with a shutdown completed "now", which
	// should trip the cooldown guard and block startup.
	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		if name == "ssh" && strings.Contains(command, "ananke intent --config /etc/ananke/ananke.yaml") {
			recorder.record(name, args)
			now := time.Now().UTC().Format(time.RFC3339)
			return "__ANANKE_BOOTSTRAP_IDLE__\nintent=shutdown_complete reason=\"drill\" source=peer updated_at=" + now + "\n", nil
		}
		return base(ctx, timeout, name, args...)
	}
	// Build via the shared constructor for consistency with the other drills; this
	// test keeps its own recorder, so the helper-provided one is discarded.
	orch, _ := newLifecycleOrchestratorWithConfig(t, cfg, run, run)
	err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "peer-cooldown"})
	if err == nil {
		t.Fatalf("expected startup to be blocked by recent peer shutdown completion")
	}
	if !strings.Contains(err.Error(), "completed shutdown too recently") {
		t.Fatalf("unexpected startup error: %v", err)
	}
}
// TestLifecycleStartupFailsWhenAllBootstrapPathsFail runs one orchestration or CLI step.
// Signature: TestLifecycleStartupFailsWhenAllBootstrapPathsFail(t *testing.T).
// Why: covers bootstrap fallback failure reporting when apply, render/apply, and cache paths all fail.
func TestLifecycleStartupFailsWhenAllBootstrapPathsFail(t *testing.T) {
	cfg := lifecycleConfig(t)
	repo := t.TempDir()
	cfg.IACRepoPath = repo
	cfg.LocalBootstrapPaths = []string{"services/bootstrap"}
	cfg.Startup.RequireCriticalServiceEndpoints = false
	cfg.Startup.RequireFluxHealth = false
	cfg.Startup.RequireWorkloadConvergence = false
	// The repo must look like a git checkout containing the configured bootstrap path.
	if err := os.MkdirAll(filepath.Join(repo, ".git"), 0o755); err != nil {
		t.Fatalf("mkdir .git: %v", err)
	}
	if err := os.MkdirAll(filepath.Join(repo, "services", "bootstrap"), 0o755); err != nil {
		t.Fatalf("mkdir bootstrap path: %v", err)
	}
	recorder := &commandRecorder{}
	base := lifecycleDispatcher(recorder)
	fluxReadyChecks := 0
	// run delays flux readiness long enough to trigger the bootstrap fallback and
	// fails `apply -k`; with the render failing below, every path is exhausted.
	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		switch {
		case name == "kubectl" &&
			strings.Contains(command, "get gitrepository flux-system") &&
			strings.Contains(command, "jsonpath={.status.conditions[?(@.type==\"Ready\")].status}"):
			recorder.record(name, args)
			fluxReadyChecks++
			if fluxReadyChecks <= 3 {
				return "", fmt.Errorf("flux source unavailable")
			}
			return "True", nil
		case name == "kubectl" && strings.Contains(command, " apply -k "):
			recorder.record(name, args)
			return "", fmt.Errorf("apply -k failed")
		default:
			return base(ctx, timeout, name, args...)
		}
	}
	// runSensitive reports a clean git tree but fails the kustomize render,
	// closing off the render/apply fallback as well.
	runSensitive := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		if name == "git" && strings.Contains(command, "status --porcelain") {
			recorder.record(name, args)
			return "", nil
		}
		if name == "sh" && strings.Contains(command, "kubectl kustomize") {
			recorder.record(name, args)
			return "", fmt.Errorf("render failed")
		}
		return run(ctx, timeout, name, args...)
	}
	// Build via the shared constructor for consistency with the other drills; this
	// test keeps its own recorder, so the helper-provided one is discarded.
	orch, _ := newLifecycleOrchestratorWithConfig(t, cfg, run, runSensitive)
	err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "bootstrap-fail"})
	if err == nil {
		t.Fatalf("expected startup to fail when all bootstrap paths fail")
	}
	if !strings.Contains(err.Error(), "local bootstrap apply failed for every configured path") {
		t.Fatalf("unexpected startup error: %v", err)
	}
}
// TestLifecycleStartupVaultUsesCachedUnsealKeyFallback runs one orchestration or CLI step.
// Signature: TestLifecycleStartupVaultUsesCachedUnsealKeyFallback(t *testing.T).
// Why: covers vault unseal fallback when secret retrieval fails but cached key exists.
func TestLifecycleStartupVaultUsesCachedUnsealKeyFallback(t *testing.T) {
	cfg := lifecycleConfig(t)
	cfg.Startup.RequireCriticalServiceEndpoints = false
	cfg.Startup.RequireFluxHealth = false
	cfg.Startup.RequireWorkloadConvergence = false
	// Seed the on-disk cached key; the in-cluster secret lookup below fails, so
	// the unseal must fall back to this file.
	if err := os.WriteFile(cfg.Startup.VaultUnsealKeyFile, []byte("cached-unseal-key\n"), 0o600); err != nil {
		t.Fatalf("write cached key: %v", err)
	}
	recorder := &commandRecorder{}
	base := lifecycleDispatcher(recorder)
	vaultReadyChecks := 0
	vaultSealedChecks := 0
	// run mirrors the auto-unseal drill except the vault-init secret retrieval fails.
	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		switch {
		case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
			recorder.record(name, args)
			return "Running", nil
		case name == "kubectl" && strings.Contains(command, "-n vault get statefulset vault -o jsonpath={.status.readyReplicas}"):
			recorder.record(name, args)
			vaultReadyChecks++
			if vaultReadyChecks <= 2 {
				return "0", nil
			}
			return "1", nil
		case name == "kubectl" && strings.Contains(command, "vault status -format=json"):
			recorder.record(name, args)
			vaultSealedChecks++
			if vaultSealedChecks <= 1 {
				return `{"sealed":true,"initialized":true}`, nil
			}
			return `{"sealed":false,"initialized":true}`, nil
		case name == "kubectl" && strings.Contains(command, "-n vault get secret vault-init -o jsonpath={.data.unseal_key_b64}"):
			recorder.record(name, args)
			return "", fmt.Errorf("secret not found")
		default:
			return base(ctx, timeout, name, args...)
		}
	}
	// runSensitive accepts an unseal issued with the cached key only.
	runSensitive := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		if name == "kubectl" && strings.Contains(command, "vault operator unseal cached-unseal-key") {
			recorder.record(name, args)
			return "ok", nil
		}
		return run(ctx, timeout, name, args...)
	}
	// Build via the shared constructor for consistency with the other drills; this
	// test keeps its own recorder, so the helper-provided one is discarded.
	orch, _ := newLifecycleOrchestratorWithConfig(t, cfg, run, runSensitive)
	if err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "vault-cached-key"}); err != nil {
		t.Fatalf("startup failed: %v", err)
	}
	if !recorder.contains("vault operator unseal cached-unseal-key") {
		t.Fatalf("expected cached key unseal command in call log")
	}
}
// TestLifecycleStartupIngressChecklistTimeout runs one orchestration or CLI step.
// Signature: TestLifecycleStartupIngressChecklistTimeout(t *testing.T).
// Why: covers ingress checklist timeout and host-derived auto-heal attempt branches.
func TestLifecycleStartupIngressChecklistTimeout(t *testing.T) {
	cfg := lifecycleConfig(t)
	cfg.Startup.RequireIngressChecklist = true
	cfg.Startup.IngressChecklistWaitSeconds = 1
	cfg.Startup.IngressChecklistPollSeconds = 1
	cfg.Startup.IngressChecklistInsecureSkip = true
	cfg.Startup.RequireCriticalServiceEndpoints = false
	cfg.Startup.RequireFluxHealth = false
	cfg.Startup.RequireWorkloadConvergence = false
	recorder := &commandRecorder{}
	base := lifecycleDispatcher(recorder)
	// run returns one ingress host with no matching workloads, so the host-derived
	// auto-heal has nothing to scale and the checklist can never be satisfied.
	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		switch {
		case name == "kubectl" && strings.Contains(command, "get ingress -A -o json"):
			recorder.record(name, args)
			return `{"items":[{"metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"rules":[{"host":"127.0.0.1"}]}}]}`, nil
		case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"):
			recorder.record(name, args)
			return `{"items":[]}`, nil
		default:
			return base(ctx, timeout, name, args...)
		}
	}
	// Build via the shared constructor for consistency with the other drills; this
	// test keeps its own recorder, so the helper-provided one is discarded.
	orch, _ := newLifecycleOrchestratorWithConfig(t, cfg, run, run)
	err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "ingress-timeout"})
	if err == nil {
		t.Fatalf("expected startup to fail when ingress checklist cannot pass")
	}
	if !strings.Contains(err.Error(), "ingress checklist not satisfied") {
		t.Fatalf("unexpected startup error: %v", err)
	}
}
// TestLifecycleStartupRepairsHostKeyAndRetriesSSHAuth runs one orchestration or CLI step.
// Signature: TestLifecycleStartupRepairsHostKeyAndRetriesSSHAuth(t *testing.T).
// Why: covers host-key mismatch retry logic inside SSH auth gate during startup.
func TestLifecycleStartupRepairsHostKeyAndRetriesSSHAuth(t *testing.T) {
	cfg := lifecycleConfig(t)
	cfg.Startup.RequireNodeSSHAuth = true
	cfg.Startup.NodeSSHAuthWaitSeconds = 2
	cfg.Startup.NodeSSHAuthPollSeconds = 1
	cfg.Startup.RequireCriticalServiceEndpoints = false
	cfg.Startup.RequireFluxHealth = false
	cfg.Startup.RequireWorkloadConvergence = false
	recorder := &commandRecorder{}
	base := lifecycleDispatcher(recorder)
	repairAttempts := 0
	// run fails the first titan-23 auth probe with a host-key mismatch and succeeds
	// afterwards, so the gate's repair-and-retry branch is exercised.
	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		if name == "ssh" && strings.Contains(command, "__ANANKE_SSH_AUTH_OK__") && strings.Contains(command, "titan-23") {
			recorder.record(name, args)
			repairAttempts++
			if repairAttempts == 1 {
				return "Host key verification failed", fmt.Errorf("exit status 255")
			}
			return "__ANANKE_SSH_AUTH_OK__", nil
		}
		return base(ctx, timeout, name, args...)
	}
	// Build via the shared constructor for consistency with the other drills; this
	// test keeps its own recorder, so the helper-provided one is discarded.
	orch, _ := newLifecycleOrchestratorWithConfig(t, cfg, run, run)
	if err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "ssh-hostkey-repair"}); err != nil {
		t.Fatalf("startup failed: %v", err)
	}
	if repairAttempts < 2 {
		t.Fatalf("expected ssh auth retry after host key mismatch, attempts=%d", repairAttempts)
	}
}