// ananke/testing/orchestrator/hooks_lifecycle_deep_matrix_test.go

package orchestrator

import (
	"context"
	"errors"
	"net"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/state"
)

// lifecycleFastConfig returns a lifecycle config tuned for fast failure:
// short wait windows and optional startup gates disabled.
// Signature: lifecycleFastConfig(t *testing.T) config.Config.
// Why: lifecycle gap tests intentionally fail early in many branches, so short
// wait windows keep branch coverage runs fast and deterministic.
func lifecycleFastConfig(t *testing.T) config.Config {
	t.Helper()
	cfg := lifecycleConfig(t)
	cfg.Startup.APIWaitSeconds = 1
	cfg.Startup.APIPollSeconds = 1
	cfg.Startup.NodeInventoryReachWaitSeconds = 1
	cfg.Startup.NodeInventoryReachPollSeconds = 1
	cfg.Startup.RequireCriticalServiceEndpoints = false
	cfg.Startup.RequireFluxHealth = false
	cfg.Startup.RequireWorkloadConvergence = false
	cfg.Startup.RequireIngressChecklist = false
	cfg.Startup.RequireServiceChecklist = false
	cfg.Startup.ServiceChecklistStabilitySec = 0
	cfg.Startup.RequirePostStartProbes = false
	return cfg
}

// TestLifecycleDeepFailureMatrix drives Startup through a matrix of injected
// failures, one subtest per lifecycle branch.
// Signature: TestLifecycleDeepFailureMatrix(t *testing.T).
// Why: saturates remaining lifecycle startup/shutdown edge branches that are
// difficult to hit from happy-path drill tests.
func TestLifecycleDeepFailureMatrix(t *testing.T) {
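	// A lock path that is an existing directory cannot be used as a lock file,
	// so Startup should fail immediately.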
t.Run("startup-lock-path-is-directory", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
cfg.State.LockPath = t.TempDir()
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
if err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "lock-dir"}); err == nil {
t.Fatalf("expected lock-path directory failure")
}
})
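	// An SSH port outside the valid range makes node inventory validation fail
	// during preflight.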
t.Run("startup-node-inventory-validation-fails", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
cfg.SSHPort = 70000
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "bad-inventory"})
if err == nil || !strings.Contains(err.Error(), "node inventory preflight failed") {
t.Fatalf("expected node inventory preflight failure, got %v", err)
}
})
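	// The injected runner refuses the ssh reachability probe, tripping the
	// node inventory reachability gate.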
t.Run("startup-node-reachability-fails", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "ssh" && strings.Contains(command, "__ANANKE_NODE_REACHABLE__") {
return "", errors.New("no route to host")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "reachability-fail"})
if err == nil || !strings.Contains(err.Error(), "node inventory reachability gate") {
t.Fatalf("expected reachability gate failure, got %v", err)
}
})
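	// Seed a stale startup intent, then make the auto-clear write fail so
	// Startup surfaces the clear error.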
t.Run("startup-clear-stale-startup-intent-write-fails", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
State: state.IntentStartupInProgress,
Reason: "stale",
Source: "test",
UpdatedAt: time.Now().UTC().Add(-2 * time.Hour),
}); err != nil {
t.Fatalf("seed stale startup intent: %v", err)
}
restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
if path == cfg.State.IntentPath && in.State == state.IntentNormal && strings.Contains(strings.ToLower(in.Reason), "auto-clear stale startup intent") {
return errors.New("forced intent clear failure")
}
return state.TestHookWriteIntentDefault(path, in)
})
t.Cleanup(restoreWrite)
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "clear-stale-startup"})
if err == nil || !strings.Contains(err.Error(), "clear stale startup intent") {
t.Fatalf("expected stale startup clear failure, got %v", err)
}
})
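	// Same pattern for a stale shutdown intent: the auto-clear write is forced
	// to fail.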
t.Run("startup-clear-stale-shutdown-intent-write-fails", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
State: state.IntentShuttingDown,
Reason: "stale-shutdown",
Source: "test",
UpdatedAt: time.Now().UTC().Add(-2 * time.Hour),
}); err != nil {
t.Fatalf("seed stale shutdown intent: %v", err)
}
restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
if path == cfg.State.IntentPath && in.State == state.IntentNormal && strings.Contains(strings.ToLower(in.Reason), "auto-clear stale shutdown intent") {
return errors.New("forced intent clear failure")
}
return state.TestHookWriteIntentDefault(path, in)
})
t.Cleanup(restoreWrite)
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "clear-stale-shutdown"})
if err == nil || !strings.Contains(err.Error(), "clear stale shutdown intent") {
t.Fatalf("expected stale shutdown clear failure, got %v", err)
}
})
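	// First read reports a recent shutdown, triggering the cooldown wait; the
	// re-read after the wait then errors.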
t.Run("startup-cooldown-reread-intent-error", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
cfg.Startup.RequireNodeInventoryReach = false
cfg.Startup.ShutdownCooldownSeconds = 5
reads := 0
restoreRead := state.TestHookSetReadIntentOverride(func(path string) (state.Intent, error) {
if path != cfg.State.IntentPath {
return state.TestHookReadIntentDefault(path)
}
reads++
if reads == 1 {
return state.Intent{
State: state.IntentShutdownComplete,
Reason: "recent",
Source: "test",
UpdatedAt: time.Now().UTC().Add(-4 * time.Second),
}, nil
}
return state.Intent{}, errors.New("forced reread failure")
})
t.Cleanup(restoreRead)
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-reread"})
if err == nil || !strings.Contains(err.Error(), "re-read startup intent after cooldown wait") {
t.Fatalf("expected cooldown reread failure, got %v", err)
}
})
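	// Here the re-read after the cooldown wait reports an active shutdown,
	// which must abort startup.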
t.Run("startup-cooldown-shutdown-became-active", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
cfg.Startup.RequireNodeInventoryReach = false
cfg.Startup.ShutdownCooldownSeconds = 5
reads := 0
restoreRead := state.TestHookSetReadIntentOverride(func(path string) (state.Intent, error) {
if path != cfg.State.IntentPath {
return state.TestHookReadIntentDefault(path)
}
reads++
if reads == 1 {
return state.Intent{
State: state.IntentShutdownComplete,
Reason: "recent",
Source: "test",
UpdatedAt: time.Now().UTC().Add(-4 * time.Second),
}, nil
}
return state.Intent{
State: state.IntentShuttingDown,
Reason: "peer-shutdown",
Source: "test",
UpdatedAt: time.Now().UTC(),
}, nil
})
t.Cleanup(restoreRead)
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-peer-active"})
if err == nil || !strings.Contains(err.Error(), "shutdown intent became active during cooldown wait") {
t.Fatalf("expected cooldown active-shutdown failure, got %v", err)
}
})
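	// The write of the startup-in-progress intent itself fails, so Startup
	// stops at "set startup intent".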
t.Run("startup-set-intent-write-fails", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
State: state.IntentNormal,
Reason: "seed",
Source: "test",
UpdatedAt: time.Now().UTC(),
}); err != nil {
t.Fatalf("seed normal intent: %v", err)
}
restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
if path == cfg.State.IntentPath && in.State == state.IntentStartupInProgress {
return errors.New("forced startup intent write failure")
}
return state.TestHookWriteIntentDefault(path, in)
})
t.Cleanup(restoreWrite)
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "intent-write-fail"})
if err == nil || !strings.Contains(err.Error(), "set startup intent") {
t.Fatalf("expected startup intent write failure, got %v", err)
}
})
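	// Time sync is required with one-second wait/poll windows, so the gate
	// times out quickly and its error propagates.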
t.Run("startup-timesync-error-propagates", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
cfg.Startup.RequireTimeSync = true
cfg.Startup.TimeSyncWaitSeconds = 1
cfg.Startup.TimeSyncPollSeconds = 1
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "timesync-fail"})
if err == nil || !strings.Contains(err.Error(), "time sync") {
t.Fatalf("expected time sync failure, got %v", err)
}
})
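	// The unit file advertises an external datastore; a pre-canceled context
	// makes the datastore preflight return context.Canceled.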
t.Run("startup-datastore-preflight-cancel", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "ssh" && strings.Contains(command, "systemctl cat k3s") {
return "ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://127.0.0.1:1/k3s", nil
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
ctx, cancel := context.WithCancel(context.Background())
cancel()
err := orch.Startup(ctx, cluster.StartupOptions{Reason: "datastore-cancel"})
if !errors.Is(err, context.Canceled) {
t.Fatalf("expected canceled datastore preflight, got %v", err)
}
})
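	// The API is down and a snapshot passes discovery and integrity checks,
	// but the cluster-reset step fails, producing a hard restore failure.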
t.Run("startup-auto-etcd-restore-hard-failure", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
cfg.Startup.AutoEtcdRestoreOnAPIFailure = true
cfg.Startup.EtcdRestoreControlPlane = "titan-db"
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
return "", errors.New("api down")
case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
return "ExecStart=/usr/local/bin/k3s server", nil
case name == "ssh" && strings.Contains(command, "etcd-snapshot ls"):
return "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown", nil
case name == "ssh" && strings.Contains(command, "stat -c %s"):
return "2097152", nil
case name == "ssh" && strings.Contains(command, "sha256sum"):
return "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", nil
case name == "ssh" && strings.Contains(command, "server --cluster-reset"):
return "", errors.New("cluster reset failed")
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "restore-hard-fail"})
if err == nil || !strings.Contains(err.Error(), "automatic etcd restore failed") {
t.Fatalf("expected automatic etcd restore failure, got %v", err)
}
})
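	// The datastore endpoint points at a live local listener, so the restore
	// path is not applicable and the API-still-down error surfaces instead.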
t.Run("startup-auto-etcd-restore-not-applicable-then-api-still-down", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
cfg.Startup.AutoEtcdRestoreOnAPIFailure = true
cfg.Startup.EtcdRestoreControlPlane = "titan-db"
listener, err := net.Listen("tcp", "127.0.0.1:0")
if err != nil {
t.Fatalf("open local datastore listener: %v", err)
}
defer listener.Close()
address := listener.Addr().String()
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
return "", errors.New("api still down")
case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
return "ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://" + address + "/k3s", nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
err = orch.Startup(context.Background(), cluster.StartupOptions{Reason: "restore-not-applicable"})
if err == nil || !strings.Contains(err.Error(), "after automatic etcd restore") {
t.Fatalf("expected api failure after not-applicable restore path, got %v", err)
}
})
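	// kubectl refuses to apply the required node label, failing the
	// ensure-required-node-labels step.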
t.Run("startup-required-node-labels-error", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
"titan-db": {"topology.kubernetes.io/zone": "lab-a"},
}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "label node titan-db --overwrite") {
return "", errors.New("label denied")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "label-fail"})
if err == nil || !strings.Contains(err.Error(), "ensure required node labels") {
t.Fatalf("expected required-node-label failure, got %v", err)
}
})
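	// With no static workers configured, discovery falls back to
	// kubectl get nodes, which is denied here.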
t.Run("startup-worker-discovery-error", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
cfg.Workers = nil
cfg.SSHManagedNodes = []string{"titan-db"}
cfg.SSHNodeHosts = map[string]string{
"titan-db": "titan-db",
}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
if name == "kubectl" && len(args) >= 2 && args[0] == "get" && args[1] == "nodes" {
return "", errors.New("nodes denied")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "worker-discovery-fail"})
if err == nil || !strings.Contains(err.Error(), "discover workers") {
t.Fatalf("expected worker-discovery failure, got %v", err)
}
})
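	// "bad-entry" is not a valid critical-PVC reference, so the storage
	// readiness gate rejects the config even though the nodes report ready.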
t.Run("startup-storage-readiness-error", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
cfg.Startup.RequireStorageReady = true
cfg.Startup.StorageReadyWaitSeconds = 1
cfg.Startup.StorageReadyPollSeconds = 1
cfg.Startup.StorageMinReadyNodes = 1
cfg.Startup.StorageCriticalPVCs = []string{"bad-entry"}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "nodes.longhorn.io") {
return "a:True:True\n", nil
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "storage-fail"})
if err == nil || !strings.Contains(err.Error(), "invalid storage_critical_pvcs entry") {
t.Fatalf("expected storage readiness failure, got %v", err)
}
})
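	// Scaling the flux-system source-controller deployment is denied, failing
	// the critical-workload scale step.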
t.Run("startup-critical-workload-scale-error", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "scale deployment source-controller") {
return "", errors.New("scale denied")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "critical-scale-fail"})
if err == nil || !strings.Contains(err.Error(), "scale flux-system/deployment/source-controller") {
t.Fatalf("expected critical-workload scale failure, got %v", err)
}
})
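	// Listing flux-system kustomizations is denied, so the flux resume step
	// propagates the kubectl error.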
t.Run("startup-flux-resume-failure", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" &&
strings.Contains(command, "-n flux-system get kustomizations.kustomize.toolkit.fluxcd.io") &&
strings.Contains(command, "jsonpath={range .items[*]}{.metadata.name}") {
return "", errors.New("kustomization list denied")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "resume-fail"})
if err == nil || !strings.Contains(err.Error(), "kustomization list denied") {
t.Fatalf("expected flux-resume failure, got %v", err)
}
})
}