package orchestrator

import (
	"context"
	"fmt"
	"io"
	"log"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/execx"
	"scm.bstein.dev/bstein/ananke/internal/state"
)
|
|
|
|
// commandRecorder captures every external command the orchestrator issues
// during a test so assertions can verify which steps actually ran.
type commandRecorder struct {
	mu    sync.Mutex // guards calls; command overrides may run concurrently
	calls []string   // each entry is "<name> <space-joined args>"
}

// record appends one invoked command, rendered as the binary name followed by
// its space-joined arguments, to the call log. Safe for concurrent use.
func (r *commandRecorder) record(name string, args []string) {
	entry := name + " " + strings.Join(args, " ")
	r.mu.Lock()
	r.calls = append(r.calls, entry)
	r.mu.Unlock()
}

// contains reports whether any recorded command includes part as a substring.
// Tests use it to confirm that specific orchestration gates were exercised.
func (r *commandRecorder) contains(part string) bool {
	r.mu.Lock()
	defer r.mu.Unlock()
	for i := range r.calls {
		if strings.Contains(r.calls[i], part) {
			return true
		}
	}
	return false
}
|
|
|
|
// lifecycleConfig runs one orchestration or CLI step.
|
|
// Signature: lifecycleConfig(t *testing.T) config.Config.
|
|
// Why: centralizes a realistic startup/shutdown config used by top-level drill tests.
|
|
func lifecycleConfig(t *testing.T) config.Config {
|
|
t.Helper()
|
|
root := t.TempDir()
|
|
return config.Config{
|
|
SSHUser: "atlas",
|
|
SSHPort: 22,
|
|
ExpectedFluxBranch: "main",
|
|
ExpectedFluxSource: "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git",
|
|
ControlPlanes: []string{"titan-db"},
|
|
Workers: []string{"titan-23"},
|
|
SSHManagedNodes: []string{"titan-db", "titan-23"},
|
|
SSHNodeHosts: map[string]string{
|
|
"titan-db": "titan-db",
|
|
"titan-23": "titan-23",
|
|
},
|
|
State: config.State{
|
|
Dir: root,
|
|
ReportsDir: filepath.Join(root, "reports"),
|
|
RunHistoryPath: filepath.Join(root, "runs.json"),
|
|
LockPath: filepath.Join(root, "ananke.lock"),
|
|
IntentPath: filepath.Join(root, "intent.json"),
|
|
},
|
|
Startup: config.Startup{
|
|
APIWaitSeconds: 2,
|
|
APIPollSeconds: 1,
|
|
RequireNodeInventoryReach: true,
|
|
NodeInventoryReachWaitSeconds: 2,
|
|
NodeInventoryReachPollSeconds: 1,
|
|
RequireNodeSSHAuth: false,
|
|
RequireStorageReady: false,
|
|
RequireIngressChecklist: false,
|
|
RequireServiceChecklist: false,
|
|
RequireCriticalServiceEndpoints: true,
|
|
CriticalServiceEndpointWaitSec: 2,
|
|
CriticalServiceEndpointPollSec: 1,
|
|
CriticalServiceEndpoints: []string{
|
|
"monitoring/victoria-metrics-single-server",
|
|
},
|
|
RequireFluxHealth: true,
|
|
FluxHealthWaitSeconds: 2,
|
|
FluxHealthPollSeconds: 1,
|
|
RequireWorkloadConvergence: true,
|
|
WorkloadConvergenceWaitSeconds: 2,
|
|
WorkloadConvergencePollSeconds: 1,
|
|
ServiceChecklistStabilitySec: 0,
|
|
AutoRecycleStuckPods: true,
|
|
StuckPodGraceSeconds: 1,
|
|
VaultUnsealKeyFile: filepath.Join(root, "vault-unseal.key"),
|
|
},
|
|
Shutdown: config.Shutdown{
|
|
DrainParallelism: 1,
|
|
ScaleParallelism: 1,
|
|
SSHParallelism: 1,
|
|
},
|
|
}
|
|
}
|
|
|
|
// lifecycleDispatcher runs one orchestration or CLI step.
|
|
// Signature: lifecycleDispatcher(recorder *commandRecorder) func(context.Context, time.Duration, string, ...string) (string, error).
|
|
// Why: emulates kubectl/ssh/git behavior so top-level tests can exercise full orchestration logic deterministically.
|
|
func lifecycleDispatcher(recorder *commandRecorder) func(context.Context, time.Duration, string, ...string) (string, error) {
|
|
return func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
|
|
recorder.record(name, args)
|
|
command := name + " " + strings.Join(args, " ")
|
|
switch name {
|
|
case "ssh":
|
|
switch {
|
|
case strings.Contains(command, "__ANANKE_NODE_REACHABLE__"):
|
|
return "__ANANKE_NODE_REACHABLE__", nil
|
|
case strings.Contains(command, "__ANANKE_SSH_AUTH_OK__"):
|
|
return "__ANANKE_SSH_AUTH_OK__", nil
|
|
case strings.Contains(command, "systemctl cat k3s"):
|
|
return "ExecStart=/usr/local/bin/k3s server", nil
|
|
case strings.Contains(command, "k3s etcd-snapshot"):
|
|
return "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown", nil
|
|
default:
|
|
return "ok", nil
|
|
}
|
|
case "git":
|
|
if strings.Contains(command, "status --porcelain") {
|
|
return "", nil
|
|
}
|
|
return "", nil
|
|
case "flux":
|
|
return "", nil
|
|
case "sh":
|
|
return "", nil
|
|
case "kubectl":
|
|
switch {
|
|
case strings.Contains(command, "version --request-timeout=5s"):
|
|
return "v1.31.0", nil
|
|
case strings.Contains(command, "jsonpath={.status.conditions[?(@.type==\"Ready\")].status}"):
|
|
return "True", nil
|
|
case strings.Contains(command, "jsonpath={.spec.url}"):
|
|
return "ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git", nil
|
|
case strings.Contains(command, "jsonpath={.spec.ref.branch}"):
|
|
return "main", nil
|
|
case strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
|
|
return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false,"timeout":"30s"},"status":{"conditions":[{"type":"Ready","status":"True","reason":"ReconciliationSucceeded","message":"ok"}]}}]}`, nil
|
|
case strings.Contains(command, "-n flux-system get kustomizations.kustomize.toolkit.fluxcd.io -o jsonpath="):
|
|
return "services\n", nil
|
|
case strings.Contains(command, "get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath="):
|
|
return "monitoring/grafana\n", nil
|
|
case strings.Contains(command, "annotate kustomizations.kustomize.toolkit.fluxcd.io"):
|
|
return "", nil
|
|
case strings.Contains(command, "annotate --all-namespaces helmreleases.helm.toolkit.fluxcd.io"):
|
|
return "", nil
|
|
case strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"):
|
|
return `{"items":[{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1,"template":{"spec":{}}},"status":{"readyReplicas":1}}]}`, nil
|
|
case strings.Contains(command, "get deploy,statefulset -A -o json"):
|
|
return `{"items":[{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1}},{"kind":"StatefulSet","metadata":{"namespace":"monitoring","name":"victoria-metrics-single-server"},"spec":{"replicas":1}}]}`, nil
|
|
case strings.Contains(command, "get pods -A -o json"):
|
|
return `{"items":[]}`, nil
|
|
case strings.Contains(command, "get deployment -A -o jsonpath="):
|
|
return "monitoring\tgrafana\t1\n", nil
|
|
case strings.Contains(command, "get statefulset -A -o jsonpath="):
|
|
return "monitoring\tvictoria-metrics-single-server\t1\n", nil
|
|
case strings.Contains(command, "jsonpath={.status.readyReplicas}"):
|
|
return "1", nil
|
|
case strings.Contains(command, "get endpoints victoria-metrics-single-server"):
|
|
return "10.42.0.10\n", nil
|
|
case strings.Contains(command, "get jobs -A -o json"):
|
|
return `{"items":[]}`, nil
|
|
case strings.Contains(command, "rollout status"):
|
|
return "rolled out", nil
|
|
case strings.Contains(command, "cordon "):
|
|
return "", nil
|
|
case strings.Contains(command, "drain "):
|
|
return "", nil
|
|
case strings.Contains(command, "uncordon "):
|
|
return "", nil
|
|
case strings.Contains(command, "patch "):
|
|
return "", nil
|
|
case strings.Contains(command, "scale "):
|
|
return "", nil
|
|
default:
|
|
return "", nil
|
|
}
|
|
default:
|
|
return "", nil
|
|
}
|
|
}
|
|
}
|
|
|
|
// newLifecycleOrchestrator runs one orchestration or CLI step.
|
|
// Signature: newLifecycleOrchestrator(t *testing.T) (*cluster.Orchestrator, *commandRecorder, config.Config).
|
|
// Why: constructs an orchestrator with deterministic command behavior for top-level module tests.
|
|
func newLifecycleOrchestrator(t *testing.T) (*cluster.Orchestrator, *commandRecorder, config.Config) {
|
|
t.Helper()
|
|
cfg := lifecycleConfig(t)
|
|
if err := os.MkdirAll(cfg.State.Dir, 0o755); err != nil {
|
|
t.Fatalf("ensure state dir: %v", err)
|
|
}
|
|
runner := &execx.Runner{DryRun: false}
|
|
recorder := &commandRecorder{}
|
|
orch := cluster.New(cfg, runner, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
|
|
dispatch := lifecycleDispatcher(recorder)
|
|
orch.SetCommandOverrides(dispatch, dispatch)
|
|
return orch, recorder, cfg
|
|
}
|
|
|
|
// TestLifecycleStartupShutdownFromTopLevelModule runs one orchestration or CLI step.
|
|
// Signature: TestLifecycleStartupShutdownFromTopLevelModule(t *testing.T).
|
|
// Why: validates that a separate top-level testing module can drive full startup
|
|
// and shutdown flows, and confirms checklist-oriented steps were actually executed.
|
|
func TestLifecycleStartupShutdownFromTopLevelModule(t *testing.T) {
|
|
orch, recorder, cfg := newLifecycleOrchestrator(t)
|
|
ctx := context.Background()
|
|
|
|
if err := orch.Startup(ctx, cluster.StartupOptions{Reason: "drill"}); err != nil {
|
|
t.Fatalf("startup failed: %v", err)
|
|
}
|
|
if err := orch.Shutdown(ctx, cluster.ShutdownOptions{
|
|
Reason: "drill",
|
|
Mode: "cluster-only",
|
|
SkipEtcdSnapshot: true,
|
|
}); err != nil {
|
|
t.Fatalf("shutdown failed: %v", err)
|
|
}
|
|
|
|
if !recorder.contains("get gitrepository flux-system") {
|
|
t.Fatalf("expected flux source guard/read commands to run")
|
|
}
|
|
if !recorder.contains("get endpoints victoria-metrics-single-server") {
|
|
t.Fatalf("expected critical endpoint checklist to run")
|
|
}
|
|
if !recorder.contains("drain titan-23") {
|
|
t.Fatalf("expected worker drain command to run")
|
|
}
|
|
|
|
lastStartup := filepath.Join(cfg.State.Dir, "last-startup-report.json")
|
|
if _, err := os.Stat(lastStartup); err != nil {
|
|
t.Fatalf("expected startup report at %s: %v", lastStartup, err)
|
|
}
|
|
lastShutdown := filepath.Join(cfg.State.Dir, "last-shutdown-report.json")
|
|
if _, err := os.Stat(lastShutdown); err != nil {
|
|
t.Fatalf("expected shutdown report at %s: %v", lastShutdown, err)
|
|
}
|
|
}
|
|
|
|
// TestLifecycleStartupFailsWhenFluxSourceDrifts runs one orchestration or CLI step.
|
|
// Signature: TestLifecycleStartupFailsWhenFluxSourceDrifts(t *testing.T).
|
|
// Why: verifies startup blocks on flux source URL drift, preventing deadlock-prone bootstrap against the wrong repo.
|
|
func TestLifecycleStartupFailsWhenFluxSourceDrifts(t *testing.T) {
|
|
orch, _, cfg := newLifecycleOrchestrator(t)
|
|
orch.SetCommandOverrides(func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
if name == "kubectl" && strings.Contains(command, "jsonpath={.spec.url}") {
|
|
return "ssh://git@scm.bstein.dev:2242/bstein/wrong-repo.git", nil
|
|
}
|
|
if name == "kubectl" && strings.Contains(command, "version --request-timeout=5s") {
|
|
return "v1.31.0", nil
|
|
}
|
|
if name == "ssh" && strings.Contains(command, "__ANANKE_NODE_REACHABLE__") {
|
|
return "__ANANKE_NODE_REACHABLE__", nil
|
|
}
|
|
if name == "ssh" && strings.Contains(command, "systemctl cat k3s") {
|
|
return "ExecStart=/usr/local/bin/k3s server", nil
|
|
}
|
|
if name == "kubectl" && strings.Contains(command, "jsonpath={.status.conditions[?(@.type==\"Ready\")].status}") {
|
|
return "True", nil
|
|
}
|
|
if name == "kubectl" && strings.Contains(command, "jsonpath={.spec.ref.branch}") {
|
|
return "main", nil
|
|
}
|
|
return "", nil
|
|
}, nil)
|
|
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "drift-test"})
|
|
if err == nil {
|
|
t.Fatalf("expected startup to fail on flux source drift")
|
|
}
|
|
if !strings.Contains(err.Error(), "flux source url drift") {
|
|
t.Fatalf("unexpected startup error: %v", err)
|
|
}
|
|
|
|
intent, readErr := state.ReadIntent(cfg.State.IntentPath)
|
|
if readErr != nil {
|
|
t.Fatalf("read intent after failed startup: %v", readErr)
|
|
}
|
|
if intent.State != state.IntentNormal {
|
|
t.Fatalf("expected intent to be normalized after startup failure, got %q", intent.State)
|
|
}
|
|
}
|
|
|
|
// TestLifecycleShutdownRejectsLegacyPoweroffMode runs one orchestration or CLI step.
|
|
// Signature: TestLifecycleShutdownRejectsLegacyPoweroffMode(t *testing.T).
|
|
// Why: enforces the safety rule that Ananke never powers off hosts, only performs cluster-only shutdown.
|
|
func TestLifecycleShutdownRejectsLegacyPoweroffMode(t *testing.T) {
|
|
orch, _, _ := newLifecycleOrchestrator(t)
|
|
err := orch.Shutdown(context.Background(), cluster.ShutdownOptions{
|
|
Reason: "mode-test",
|
|
Mode: "poweroff",
|
|
})
|
|
if err == nil {
|
|
t.Fatalf("expected poweroff mode to be rejected")
|
|
}
|
|
if !strings.Contains(err.Error(), "no longer powers off hosts") {
|
|
t.Fatalf("unexpected shutdown mode error: %v", err)
|
|
}
|
|
}
|
|
|
|
// TestLifecycleStatusArtifactsIncludeProgress runs one orchestration or CLI step.
|
|
// Signature: TestLifecycleStatusArtifactsIncludeProgress(t *testing.T).
|
|
// Why: confirms startup writes progress snapshots so CLI status polling can report checklist progress live.
|
|
func TestLifecycleStatusArtifactsIncludeProgress(t *testing.T) {
|
|
orch, _, cfg := newLifecycleOrchestrator(t)
|
|
ctx := context.Background()
|
|
|
|
if err := orch.Startup(ctx, cluster.StartupOptions{Reason: "status-test"}); err != nil {
|
|
t.Fatalf("startup failed: %v", err)
|
|
}
|
|
|
|
progressPath := filepath.Join(cfg.State.Dir, "startup-progress.json")
|
|
b, err := os.ReadFile(progressPath)
|
|
if err != nil {
|
|
t.Fatalf("read startup progress artifact: %v", err)
|
|
}
|
|
payload := string(b)
|
|
if !strings.Contains(payload, `"status": "success"`) {
|
|
t.Fatalf("expected startup progress to show success, payload=%s", payload)
|
|
}
|
|
if !strings.Contains(payload, `"checks"`) {
|
|
t.Fatalf("expected startup progress payload to include checks map")
|
|
}
|
|
|
|
lastReportPath := filepath.Join(cfg.State.Dir, "last-startup-report.json")
|
|
lastReport, err := os.ReadFile(lastReportPath)
|
|
if err != nil {
|
|
t.Fatalf("read last startup report: %v", err)
|
|
}
|
|
if !strings.Contains(string(lastReport), `"phase": "complete"`) {
|
|
t.Fatalf("expected startup report to include final completion phase, got: %s", string(lastReport))
|
|
}
|
|
}
|
|
|
|
// TestLifecycleReasonAppearsInRunHistory runs one orchestration or CLI step.
|
|
// Signature: TestLifecycleReasonAppearsInRunHistory(t *testing.T).
|
|
// Why: ties operator-supplied reasons to durable history for post-drill analysis.
|
|
func TestLifecycleReasonAppearsInRunHistory(t *testing.T) {
|
|
orch, _, cfg := newLifecycleOrchestrator(t)
|
|
reason := fmt.Sprintf("drill-%d", time.Now().UnixNano())
|
|
if err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: reason}); err != nil {
|
|
t.Fatalf("startup failed: %v", err)
|
|
}
|
|
records, err := state.New(cfg.State.RunHistoryPath).Load()
|
|
if err != nil {
|
|
t.Fatalf("load run history: %v", err)
|
|
}
|
|
if len(records) == 0 {
|
|
t.Fatalf("expected at least one run history record")
|
|
}
|
|
found := false
|
|
for _, rec := range records {
|
|
if rec.Action == "startup" && rec.Reason == reason && rec.Success {
|
|
found = true
|
|
break
|
|
}
|
|
}
|
|
if !found {
|
|
t.Fatalf("expected startup record with reason %q in run history", reason)
|
|
}
|
|
}
|