package orchestrator
|
||
|
|
|
||
|
|
import (
|
||
|
|
"context"
|
||
|
|
"errors"
|
||
|
|
"io"
|
||
|
|
"log"
|
||
|
|
"net"
|
||
|
|
"os"
|
||
|
|
"strconv"
|
||
|
|
"strings"
|
||
|
|
"testing"
|
||
|
|
"time"
|
||
|
|
|
||
|
|
"scm.bstein.dev/bstein/ananke/internal/cluster"
|
||
|
|
"scm.bstein.dev/bstein/ananke/internal/config"
|
||
|
|
"scm.bstein.dev/bstein/ananke/internal/execx"
|
||
|
|
"scm.bstein.dev/bstein/ananke/internal/state"
|
||
|
|
)
|
||
|
|
|
||
|
|
// newLifecycleSaturationOrchestrator runs one orchestration or CLI step.
|
||
|
|
// Signature: newLifecycleSaturationOrchestrator(t *testing.T, cfg config.Config, run commandOverride) *cluster.Orchestrator.
|
||
|
|
// Why: lifecycle branch saturation needs deterministic command behavior while preserving real intent/lock/file semantics.
|
||
|
|
func newLifecycleSaturationOrchestrator(
|
||
|
|
t *testing.T,
|
||
|
|
cfg config.Config,
|
||
|
|
run func(context.Context, time.Duration, string, ...string) (string, error),
|
||
|
|
) *cluster.Orchestrator {
|
||
|
|
t.Helper()
|
||
|
|
if run == nil {
|
||
|
|
run = lifecycleDispatcher(&commandRecorder{})
|
||
|
|
}
|
||
|
|
orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
|
||
|
|
orch.SetCommandOverrides(run, run)
|
||
|
|
return orch
|
||
|
|
}
|
||
|
|
|
||
|
|
// TestLifecycleStartupBranchSaturation drives Startup through its main
// error/safety branches: unreadable intent state, an active shutdown intent,
// cooldown cancellation, API-probe failure with and without auto-restore, and
// a failed local bootstrap apply.
//
// Why: drives startup through its main error/safety branches so lifecycle coverage
// reflects realistic drill failure modes.
func TestLifecycleStartupBranchSaturation(t *testing.T) {
	t.Run("read-intent-error-branch", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// Seed a valid intent file first so the failure below is caused by the
		// path swap, not by a missing file.
		if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
			State:     state.IntentNormal,
			Reason:    "seed",
			Source:    "test",
			UpdatedAt: time.Now().UTC(),
		}); err != nil {
			t.Fatalf("seed intent: %v", err)
		}
		// Replace intent file with directory so ReadIntent fails.
		if err := osRemove(cfg.State.IntentPath); err != nil {
			t.Fatalf("remove intent file: %v", err)
		}
		if err := osMkdir(cfg.State.IntentPath); err != nil {
			t.Fatalf("make intent dir: %v", err)
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, nil)
		if err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "intent-read-error"}); err == nil {
			t.Fatalf("expected startup to fail when intent path is unreadable")
		}
	})

	t.Run("fresh-shutdown-intent-blocks", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// A freshly written shutting-down intent must block startup outright.
		if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
			State:     state.IntentShuttingDown,
			Reason:    "active-shutdown",
			Source:    "test",
			UpdatedAt: time.Now().UTC(),
		}); err != nil {
			t.Fatalf("write shutdown intent: %v", err)
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, nil)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "blocked-by-shutdown"})
		if err == nil || !strings.Contains(err.Error(), "startup blocked: shutdown intent is active") {
			t.Fatalf("expected active shutdown intent block, got %v", err)
		}
	})

	t.Run("cooldown-cancel-branch", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// 20s cooldown vs. a 20ms cancel: Startup should still be inside the
		// cooldown wait when the context is canceled.
		cfg.Startup.ShutdownCooldownSeconds = 20
		if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
			State:     state.IntentShutdownComplete,
			Reason:    "just-finished",
			Source:    "test",
			UpdatedAt: time.Now().UTC(),
		}); err != nil {
			t.Fatalf("write cooldown intent: %v", err)
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, nil)
		ctx, cancel := context.WithCancel(context.Background())
		go func() {
			time.Sleep(20 * time.Millisecond)
			cancel()
		}()
		err := orch.Startup(ctx, cluster.StartupOptions{Reason: "cooldown-cancel"})
		if err == nil || !strings.Contains(err.Error(), "startup canceled while waiting for shutdown cooldown") {
			t.Fatalf("expected cooldown cancel branch, got %v", err)
		}
	})

	t.Run("api-failure-without-restore", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.AutoEtcdRestoreOnAPIFailure = false
		// Fail only the kubectl API probe; everything else delegates to the
		// default dispatcher so the rest of startup proceeds normally.
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "version --request-timeout=5s") {
				return "", errors.New("apiserver down")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, run)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "api-fail-no-restore"})
		if err == nil || !strings.Contains(err.Error(), "kubernetes API did not become reachable") {
			t.Fatalf("expected api wait failure, got %v", err)
		}
	})

	t.Run("api-failure-restore-not-applicable-retries", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.AutoEtcdRestoreOnAPIFailure = true
		cfg.Startup.EtcdRestoreControlPlane = "titan-db"
		// Real listener so the advertised external datastore endpoint below is
		// actually dialable — presumably the restore-applicability check probes
		// it; confirm against the cluster package.
		l, err := net.Listen("tcp", "127.0.0.1:0")
		if err != nil {
			t.Fatalf("open local datastore listener: %v", err)
		}
		defer l.Close()
		port := l.Addr().(*net.TCPAddr).Port
		attempt := 0
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
				// First probe fails, later probes succeed — exercises the
				// retry-after-restore-not-applicable path.
				attempt++
				if attempt <= 1 {
					return "", errors.New("apiserver down")
				}
				return "v1.31.0", nil
			case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
				// Unit file advertises an external postgres datastore, so an
				// etcd snapshot restore is not applicable to this node.
				return "ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://127.0.0.1:" + strconv.Itoa(port) + "/k3s", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, run)
		if err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "api-restore-not-applicable"}); err != nil {
			t.Fatalf("expected startup success after retry, got %v", err)
		}
	})

	t.Run("bootstrap-required-and-cache-missing-fails", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// Empty repo dir + configured bootstrap paths forces the local
		// bootstrap branch; every apply avenue below is made to fail.
		cfg.IACRepoPath = t.TempDir()
		cfg.LocalBootstrapPaths = []string{"services/bootstrap"}
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "jsonpath={.status.conditions[?(@.type==\"Ready\")].status}"):
				// Flux readiness probe fails, making bootstrap required.
				return "", errors.New("flux source unavailable")
			case name == "kubectl" && strings.Contains(command, " apply -k "):
				return "", errors.New("apply failed")
			case name == "sh" && strings.Contains(command, "kubectl kustomize"):
				return "", errors.New("fallback failed")
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, run)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "bootstrap-required"})
		if err == nil || !strings.Contains(err.Error(), "local bootstrap apply failed") {
			t.Fatalf("expected bootstrap failure, got %v", err)
		}
	})
}
|
||
|
|
|
||
|
|
// TestLifecycleEtcdRestoreAndShutdownBranchSaturation exercises EtcdRestore
// input validation, the unmanaged-node guard, the restore-command failure
// branch, and Shutdown's mode validation.
//
// Why: covers restore/shutdown branch paths that are difficult to hit from a single happy-path drill.
func TestLifecycleEtcdRestoreAndShutdownBranchSaturation(t *testing.T) {
	t.Run("etcd-restore-input-validation", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// No control planes configured and none requested in options.
		cfg.ControlPlanes = nil
		orch := newLifecycleSaturationOrchestrator(t, cfg, nil)
		if err := orch.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{}); err == nil {
			t.Fatalf("expected restore error with no control planes")
		}
	})

	t.Run("etcd-restore-unmanaged-node", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// "titan-db" is deliberately absent from the SSH-managed set, so the
		// restore must be refused before any command runs.
		cfg.SSHManagedNodes = []string{"titan-23"}
		orch := newLifecycleSaturationOrchestrator(t, cfg, nil)
		if err := orch.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{ControlPlane: "titan-db"}); err == nil {
			t.Fatalf("expected unmanaged control plane restore error")
		}
	})

	t.Run("etcd-restore-command-failure", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// Satisfy every preflight SSH probe (unit file, snapshot listing,
		// size, checksum) and fail only the cluster-reset step itself.
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
				// No --datastore-endpoint flag, so an embedded-etcd restore
				// proceeds.
				return "ExecStart=/usr/local/bin/k3s server", nil
			case name == "ssh" && strings.Contains(command, "etcd-snapshot ls"):
				return "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown", nil
			case name == "ssh" && strings.Contains(command, "stat -c %s"):
				// 2 MiB snapshot — presumably above a minimum-size sanity
				// threshold; confirm in cluster package.
				return "2097152", nil
			case name == "ssh" && strings.Contains(command, "sha256sum"):
				// Well-formed 64-hex-char digest so checksum parsing succeeds.
				return "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", nil
			case name == "ssh" && strings.Contains(command, "server --cluster-reset"):
				return "", errors.New("cluster reset failed")
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, run)
		err := orch.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{ControlPlane: "titan-db"})
		if err == nil || !strings.Contains(err.Error(), "etcd restore command failed") {
			t.Fatalf("expected restore command failure branch, got %v", err)
		}
	})

	t.Run("shutdown-invalid-mode", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		orch := newLifecycleSaturationOrchestrator(t, cfg, nil)
		// An unknown Mode must be rejected before any shutdown work starts.
		if err := orch.Shutdown(context.Background(), cluster.ShutdownOptions{Reason: "bad-mode", Mode: "invalid"}); err == nil {
			t.Fatalf("expected shutdown mode validation error")
		}
	})
}
|
||
|
|
|
||
|
|
// osRemove deletes the file (or empty directory) at path via os.Remove and
// returns its error unmodified.
//
// Why: keeps error handling explicit in lifecycle branch tests without repeated ignore logic.
func osRemove(path string) error {
	return os.Remove(path)
}
|
||
|
|
|
||
|
|
// osMkdir creates a single directory at path with mode 0o755 via os.Mkdir and
// returns its error unmodified (it fails if the parent is missing or the path
// already exists).
//
// Why: keeps branch setup concise in lifecycle branch tests.
func osMkdir(path string) error {
	return os.Mkdir(path, 0o755)
}
|