package orchestrator

import (
	"context"
	"errors"
	"net"
	"os"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/state"
)

// lifecycleFastConfig returns a lifecycle test config tuned for fast failure paths.
// Signature: lifecycleFastConfig(t *testing.T) config.Config.
// Why: lifecycle gap tests intentionally fail early in many branches, so short
// wait windows keep branch coverage runs fast and deterministic.
func lifecycleFastConfig(t *testing.T) config.Config {
	t.Helper()
	cfg := lifecycleConfig(t)
	cfg.Startup.APIWaitSeconds = 1
	cfg.Startup.APIPollSeconds = 1
	cfg.Startup.NodeInventoryReachWaitSeconds = 1
	cfg.Startup.NodeInventoryReachPollSeconds = 1
	cfg.Startup.RequireCriticalServiceEndpoints = false
	cfg.Startup.RequireFluxHealth = false
	cfg.Startup.RequireWorkloadConvergence = false
	cfg.Startup.RequireIngressChecklist = false
	cfg.Startup.RequireServiceChecklist = false
	cfg.Startup.ServiceChecklistStabilitySec = 0
	cfg.Startup.RequirePostStartProbes = false
	return cfg
}

// TestLifecycleDeepFailureMatrix drives Startup through deep failure branches.
// Signature: TestLifecycleDeepFailureMatrix(t *testing.T).
// Why: saturates remaining lifecycle startup/shutdown edge branches that are
// difficult to hit from happy-path drill tests.
func TestLifecycleDeepFailureMatrix(t *testing.T) {
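	// Using an existing directory as the state lock path must make Startup fail outright.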
t.Run("startup-lock-path-is-directory", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
cfg.State.LockPath = t.TempDir()
|
|
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
|
if err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "lock-dir"}); err == nil {
|
|
t.Fatalf("expected lock-path directory failure")
|
|
}
|
|
})
|
|
|
|
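	// SSHPort 70000 is outside the valid TCP port range, so node inventory
	// preflight is expected to reject the config.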
t.Run("startup-node-inventory-validation-fails", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
cfg.SSHPort = 70000
|
|
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "bad-inventory"})
|
|
if err == nil || !strings.Contains(err.Error(), "node inventory preflight failed") {
|
|
t.Fatalf("expected node inventory preflight failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
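	// Every ssh reachability probe (tagged __ANANKE_NODE_REACHABLE__) is forced
	// to fail, so the inventory reachability gate should report an error.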
t.Run("startup-node-reachability-fails", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
if name == "ssh" && strings.Contains(command, "__ANANKE_NODE_REACHABLE__") {
|
|
return "", errors.New("no route to host")
|
|
}
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "reachability-fail"})
|
|
if err == nil || !strings.Contains(err.Error(), "node inventory reachability gate") {
|
|
t.Fatalf("expected reachability gate failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
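	// A two-hour-old startup-in-progress intent is seeded, and the write hook
	// rejects only the auto-clear transition back to normal, so Startup should
	// surface the "clear stale startup intent" failure.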
t.Run("startup-clear-stale-startup-intent-write-fails", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
|
|
State: state.IntentStartupInProgress,
|
|
Reason: "stale",
|
|
Source: "test",
|
|
UpdatedAt: time.Now().UTC().Add(-2 * time.Hour),
|
|
}); err != nil {
|
|
t.Fatalf("seed stale startup intent: %v", err)
|
|
}
|
|
restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
|
|
if path == cfg.State.IntentPath && in.State == state.IntentNormal && strings.Contains(strings.ToLower(in.Reason), "auto-clear stale startup intent") {
|
|
return errors.New("forced intent clear failure")
|
|
}
|
|
return state.TestHookWriteIntentDefault(path, in)
|
|
})
|
|
t.Cleanup(restoreWrite)
|
|
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "clear-stale-startup"})
|
|
if err == nil || !strings.Contains(err.Error(), "clear stale startup intent") {
|
|
t.Fatalf("expected stale startup clear failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
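	// Same pattern as above, but the stale record is a shutting-down intent and
	// the rejected write is its auto-clear back to normal.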
t.Run("startup-clear-stale-shutdown-intent-write-fails", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
|
|
State: state.IntentShuttingDown,
|
|
Reason: "stale-shutdown",
|
|
Source: "test",
|
|
UpdatedAt: time.Now().UTC().Add(-2 * time.Hour),
|
|
}); err != nil {
|
|
t.Fatalf("seed stale shutdown intent: %v", err)
|
|
}
|
|
restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
|
|
if path == cfg.State.IntentPath && in.State == state.IntentNormal && strings.Contains(strings.ToLower(in.Reason), "auto-clear stale shutdown intent") {
|
|
return errors.New("forced intent clear failure")
|
|
}
|
|
return state.TestHookWriteIntentDefault(path, in)
|
|
})
|
|
t.Cleanup(restoreWrite)
|
|
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "clear-stale-shutdown"})
|
|
if err == nil || !strings.Contains(err.Error(), "clear stale shutdown intent") {
|
|
t.Fatalf("expected stale shutdown clear failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
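	// A fresh shutdown-complete intent triggers the cooldown wait; midway
	// through, the intent file is replaced with a directory so the re-read
	// after the wait fails.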
t.Run("startup-cooldown-reread-intent-error", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
cfg.Startup.RequireNodeInventoryReach = false
|
|
cfg.Startup.ShutdownCooldownSeconds = 5
|
|
if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
|
|
State: state.IntentShutdownComplete,
|
|
Reason: "recent",
|
|
Source: "test",
|
|
UpdatedAt: time.Now().UTC(),
|
|
}); err != nil {
|
|
t.Fatalf("seed cooldown intent: %v", err)
|
|
}
|
|
go func(intentPath string) {
|
|
time.Sleep(2 * time.Second)
|
|
_ = os.Remove(intentPath)
|
|
_ = os.Mkdir(intentPath, 0o755)
|
|
}(cfg.State.IntentPath)
|
|
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-reread"})
|
|
if err == nil || !strings.Contains(err.Error(), "re-read startup intent after cooldown wait") {
|
|
t.Fatalf("expected cooldown reread failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
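	// During the cooldown wait a background writer flips the intent to
	// shutting-down, which Startup must treat as an active shutdown and abort.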
t.Run("startup-cooldown-shutdown-became-active", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
cfg.Startup.RequireNodeInventoryReach = false
|
|
cfg.Startup.ShutdownCooldownSeconds = 5
|
|
if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
|
|
State: state.IntentShutdownComplete,
|
|
Reason: "recent",
|
|
Source: "test",
|
|
UpdatedAt: time.Now().UTC(),
|
|
}); err != nil {
|
|
t.Fatalf("seed cooldown intent: %v", err)
|
|
}
|
|
go func(intentPath string) {
|
|
time.Sleep(2 * time.Second)
|
|
_ = state.WriteIntent(intentPath, state.Intent{
|
|
State: state.IntentShuttingDown,
|
|
Reason: "peer-shutdown",
|
|
Source: "test",
|
|
UpdatedAt: time.Now().UTC(),
|
|
})
|
|
}(cfg.State.IntentPath)
|
|
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-peer-active"})
|
|
if err == nil || !strings.Contains(err.Error(), "shutdown intent became active during cooldown wait") {
|
|
t.Fatalf("expected cooldown active-shutdown failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
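	// The write hook rejects only the transition to startup-in-progress, so the
	// "set startup intent" step itself is what fails.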
t.Run("startup-set-intent-write-fails", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
|
|
State: state.IntentNormal,
|
|
Reason: "seed",
|
|
Source: "test",
|
|
UpdatedAt: time.Now().UTC(),
|
|
}); err != nil {
|
|
t.Fatalf("seed normal intent: %v", err)
|
|
}
|
|
restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
|
|
if path == cfg.State.IntentPath && in.State == state.IntentStartupInProgress {
|
|
return errors.New("forced startup intent write failure")
|
|
}
|
|
return state.TestHookWriteIntentDefault(path, in)
|
|
})
|
|
t.Cleanup(restoreWrite)
|
|
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "intent-write-fail"})
|
|
if err == nil || !strings.Contains(err.Error(), "set startup intent") {
|
|
t.Fatalf("expected startup intent write failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
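	// Enables the time-sync gate with one-second wait/poll windows so its
	// failure propagates out of Startup quickly.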
t.Run("startup-timesync-error-propagates", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
cfg.Startup.RequireTimeSync = true
|
|
cfg.Startup.TimeSyncWaitSeconds = 1
|
|
cfg.Startup.TimeSyncPollSeconds = 1
|
|
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "timesync-fail"})
|
|
if err == nil || !strings.Contains(err.Error(), "time sync") {
|
|
t.Fatalf("expected time sync failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
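	// With the context cancelled up front and the k3s unit advertising an
	// external postgres datastore, Startup is expected to surface
	// context.Canceled from the datastore preflight.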
t.Run("startup-datastore-preflight-cancel", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
if name == "ssh" && strings.Contains(command, "systemctl cat k3s") {
|
|
return "ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://127.0.0.1:1/k3s", nil
|
|
}
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
cancel()
|
|
err := orch.Startup(ctx, cluster.StartupOptions{Reason: "datastore-cancel"})
|
|
if !errors.Is(err, context.Canceled) {
|
|
t.Fatalf("expected canceled datastore preflight, got %v", err)
|
|
}
|
|
})
|
|
|
|
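	// The API probe stays down, a usable etcd snapshot is advertised, and the
	// cluster-reset command is forced to fail, so the automatic restore path
	// must report a hard failure.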
t.Run("startup-auto-etcd-restore-hard-failure", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
cfg.Startup.AutoEtcdRestoreOnAPIFailure = true
|
|
cfg.Startup.EtcdRestoreControlPlane = "titan-db"
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
switch {
|
|
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
|
|
return "", errors.New("api down")
|
|
case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
|
|
return "ExecStart=/usr/local/bin/k3s server", nil
|
|
case name == "ssh" && strings.Contains(command, "etcd-snapshot ls"):
|
|
return "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown", nil
|
|
case name == "ssh" && strings.Contains(command, "stat -c %s"):
|
|
return "2097152", nil
|
|
case name == "ssh" && strings.Contains(command, "sha256sum"):
|
|
return "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", nil
|
|
case name == "ssh" && strings.Contains(command, "server --cluster-reset"):
|
|
return "", errors.New("cluster reset failed")
|
|
default:
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
}
|
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "restore-hard-fail"})
|
|
if err == nil || !strings.Contains(err.Error(), "automatic etcd restore failed") {
|
|
t.Fatalf("expected automatic etcd restore failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
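	// Here k3s reports an external postgres datastore that is actually
	// listening locally, so an etcd restore is not applicable and the original
	// API failure is what Startup reports.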
t.Run("startup-auto-etcd-restore-not-applicable-then-api-still-down", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
cfg.Startup.AutoEtcdRestoreOnAPIFailure = true
|
|
cfg.Startup.EtcdRestoreControlPlane = "titan-db"
|
|
listener, err := net.Listen("tcp", "127.0.0.1:0")
|
|
if err != nil {
|
|
t.Fatalf("open local datastore listener: %v", err)
|
|
}
|
|
defer listener.Close()
|
|
address := listener.Addr().String()
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
switch {
|
|
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
|
|
return "", errors.New("api still down")
|
|
case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
|
|
return "ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://" + address + "/k3s", nil
|
|
default:
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
}
|
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
err = orch.Startup(context.Background(), cluster.StartupOptions{Reason: "restore-not-applicable"})
|
|
if err == nil || !strings.Contains(err.Error(), "after automatic etcd restore") {
|
|
t.Fatalf("expected api failure after not-applicable restore path, got %v", err)
|
|
}
|
|
})
|
|
|
|
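	// The kubectl label command for titan-db is denied, so ensuring required
	// node labels should fail.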
t.Run("startup-required-node-labels-error", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
|
|
"titan-db": {"topology.kubernetes.io/zone": "lab-a"},
|
|
}
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
if name == "kubectl" && strings.Contains(command, "label node titan-db --overwrite") {
|
|
return "", errors.New("label denied")
|
|
}
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "label-fail"})
|
|
if err == nil || !strings.Contains(err.Error(), "ensure required node labels") {
|
|
t.Fatalf("expected required-node-label failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
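	// With no statically configured workers, this test denies kubectl get nodes
	// and expects worker discovery to fail.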
t.Run("startup-worker-discovery-error", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
cfg.Workers = nil
|
|
cfg.SSHManagedNodes = []string{"titan-db"}
|
|
cfg.SSHNodeHosts = map[string]string{
|
|
"titan-db": "titan-db",
|
|
}
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
if name == "kubectl" && len(args) >= 2 && args[0] == "get" && args[1] == "nodes" {
|
|
return "", errors.New("nodes denied")
|
|
}
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "worker-discovery-fail"})
|
|
if err == nil || !strings.Contains(err.Error(), "discover workers") {
|
|
t.Fatalf("expected worker-discovery failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
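	// Longhorn nodes look healthy, but the malformed "bad-entry" PVC reference
	// should be rejected as an invalid storage_critical_pvcs entry.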
t.Run("startup-storage-readiness-error", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
cfg.Startup.RequireStorageReady = true
|
|
cfg.Startup.StorageReadyWaitSeconds = 1
|
|
cfg.Startup.StorageReadyPollSeconds = 1
|
|
cfg.Startup.StorageMinReadyNodes = 1
|
|
cfg.Startup.StorageCriticalPVCs = []string{"bad-entry"}
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
if name == "kubectl" && strings.Contains(command, "nodes.longhorn.io") {
|
|
return "a:True:True\n", nil
|
|
}
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "storage-fail"})
|
|
if err == nil || !strings.Contains(err.Error(), "invalid storage_critical_pvcs entry") {
|
|
t.Fatalf("expected storage readiness failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
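	// Scaling the flux-system source-controller deployment is denied, so the
	// critical-workload scale step should fail.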
t.Run("startup-critical-workload-scale-error", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
if name == "kubectl" && strings.Contains(command, "scale deployment source-controller") {
|
|
return "", errors.New("scale denied")
|
|
}
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "critical-scale-fail"})
|
|
if err == nil || !strings.Contains(err.Error(), "scale flux-system/deployment/source-controller") {
|
|
t.Fatalf("expected critical-workload scale failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
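	// Listing Flux kustomizations is denied, so resuming Flux should surface
	// the underlying kubectl error.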
t.Run("startup-flux-resume-failure", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
if name == "kubectl" &&
|
|
strings.Contains(command, "-n flux-system get kustomizations.kustomize.toolkit.fluxcd.io") &&
|
|
strings.Contains(command, "jsonpath={range .items[*]}{.metadata.name}") {
|
|
return "", errors.New("kustomization list denied")
|
|
}
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "resume-fail"})
|
|
if err == nil || !strings.Contains(err.Error(), "kustomization list denied") {
|
|
t.Fatalf("expected flux-resume failure, got %v", err)
|
|
}
|
|
})
|
|
}
|