// ananke/cmd/ananke/command_handlers_injection_test.go

package main

import (
	"context"
	"errors"
	"io"
	"log"
	"os"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/execx"
	"scm.bstein.dev/bstein/ananke/internal/service"
	"scm.bstein.dev/bstein/ananke/internal/state"
)

// TestRunStartupDispatchWithInjectedOrchestrator checks that runStartup hands its CLI flags to the startup hook.
// Signature: TestRunStartupDispatchWithInjectedOrchestrator(t *testing.T).
// Why: covers startup command wiring and option propagation without invoking a real cluster flow.
func TestRunStartupDispatchWithInjectedOrchestrator(t *testing.T) {
	// Snapshot the hooks now and put them back when the test returns.
	defer stubCommandHandlerHooks()()

	buildOrchestratorCommand = func(_ *log.Logger, _ string, dryRun bool) (config.Config, *cluster.Orchestrator, error) {
		cfg := minimalHandlerConfig(t)
		cfg.Coordination.Role = "coordinator"
		cfg.UPS.Enabled = false
		return cfg, newTestOrchestrator(cfg, dryRun), nil
	}

	// The startup hook only records the options it was handed.
	var got cluster.StartupOptions
	startupOrchestratorCommand = func(_ context.Context, _ *cluster.Orchestrator, opts cluster.StartupOptions) error {
		got = opts
		return nil
	}

	args := []string{
		"--config", "/ignored.yaml",
		"--execute",
		"--force-flux-branch", "main",
		"--skip-local-bootstrap",
		"--reason", "drill-startup",
	}
	if err := runStartup(log.New(io.Discard, "", 0), args); err != nil {
		t.Fatalf("runStartup failed: %v", err)
	}

	switch {
	case got.ForceFluxBranch != "main", !got.SkipLocalBootstrap, got.Reason != "drill-startup":
		t.Fatalf("unexpected startup options: %+v", got)
	}
}

// TestRunStartupPeerRoleGuards drives runStartup as a peer through three guard scenarios.
// Signature: TestRunStartupPeerRoleGuards(t *testing.T).
// Why: covers peer-role guard, handoff short-circuit, and coordinator guard disallow paths.
func TestRunStartupPeerRoleGuards(t *testing.T) {
	defer stubCommandHandlerHooks()()

	discard := log.New(io.Discard, "", 0)
	// A fresh slice per call keeps the scenarios from sharing argument storage.
	failoverArgs := func() []string {
		return []string{"--config", "/ignored.yaml", "--execute", "--auto-peer-failover"}
	}

	buildOrchestratorCommand = func(_ *log.Logger, _ string, dryRun bool) (config.Config, *cluster.Orchestrator, error) {
		cfg := minimalHandlerConfig(t)
		cfg.Coordination.Role = "peer"
		cfg.UPS.Enabled = false
		return cfg, newTestOrchestrator(cfg, dryRun), nil
	}

	// Scenario 1: a peer without --auto-peer-failover must be rejected.
	err := runStartup(discard, []string{"--config", "/ignored.yaml", "--execute"})
	if err == nil {
		t.Fatalf("expected peer-role block without override")
	}

	// Scenario 2: the handoff hook reports success, so startup must not run.
	startupRan := false
	tryPeerBootstrapHandoffCommand = func(_ context.Context, _ config.Config, _ *log.Logger) (bool, error) {
		return true, nil
	}
	startupOrchestratorCommand = func(_ context.Context, _ *cluster.Orchestrator, _ cluster.StartupOptions) error {
		startupRan = true
		return nil
	}
	err = runStartup(discard, failoverArgs())
	if err != nil {
		t.Fatalf("expected handoff short-circuit success, got %v", err)
	}
	if startupRan {
		t.Fatalf("startup should not run after successful handoff")
	}

	// Scenario 3: no handoff and the coordinator guard says no, so runStartup must fail.
	tryPeerBootstrapHandoffCommand = func(_ context.Context, _ config.Config, _ *log.Logger) (bool, error) {
		return false, nil
	}
	coordinatorAllowsPeerFallbackCommand = func(_ context.Context, _ config.Config, _ *log.Logger) (bool, string, error) {
		return false, "still busy", nil
	}
	err = runStartup(discard, failoverArgs())
	if err == nil {
		t.Fatalf("expected coordinator guard block")
	}
}

// TestRunStartupPowerSafetyHooks checks that a failing power-safety hook surfaces from runStartup.
// Signature: TestRunStartupPowerSafetyHooks(t *testing.T).
// Why: covers startup UPS target build and startup power safety check error propagation.
func TestRunStartupPowerSafetyHooks(t *testing.T) {
	defer stubCommandHandlerHooks()()

	// The power-safety hook always refuses; its message is what the test looks for.
	ensureStartupPowerSafeCommand = func(_ context.Context, _ []service.Target, _ float64) error {
		return errors.New("ups blocked")
	}
	buildUPSTargetsCommand = func(_ config.Config) ([]service.Target, error) {
		targets := []service.Target{{Name: "Pyrphoros", Target: "pyrphoros@localhost"}}
		return targets, nil
	}
	buildOrchestratorCommand = func(_ *log.Logger, _ string, dryRun bool) (config.Config, *cluster.Orchestrator, error) {
		cfg := minimalHandlerConfig(t)
		cfg.Coordination.Role = "coordinator"
		cfg.UPS.Enabled = true
		cfg.Coordination.AllowStartupOnBattery = false
		return cfg, newTestOrchestrator(cfg, dryRun), nil
	}

	err := runStartup(log.New(io.Discard, "", 0), []string{"--config", "/ignored.yaml", "--execute"})
	if err != nil && strings.Contains(err.Error(), "ups blocked") {
		return
	}
	t.Fatalf("expected startup UPS block, got %v", err)
}

// TestRunShutdownAndRestoreDispatchHooks checks the options runShutdown and runEtcdRestore pass to their hooks.
// Signature: TestRunShutdownAndRestoreDispatchHooks(t *testing.T).
// Why: covers option wiring for shutdown and etcd restore handlers.
func TestRunShutdownAndRestoreDispatchHooks(t *testing.T) {
	defer stubCommandHandlerHooks()()

	var (
		shutdownOpts cluster.ShutdownOptions
		restoreOpts  cluster.EtcdRestoreOptions
	)

	buildOrchestratorCommand = func(_ *log.Logger, _ string, dryRun bool) (config.Config, *cluster.Orchestrator, error) {
		cfg := minimalHandlerConfig(t)
		return cfg, newTestOrchestrator(cfg, dryRun), nil
	}
	// Both hooks only record what they were given.
	shutdownOrchestratorCommand = func(_ context.Context, _ *cluster.Orchestrator, opts cluster.ShutdownOptions) error {
		shutdownOpts = opts
		return nil
	}
	etcdRestoreOrchestratorCommand = func(_ context.Context, _ *cluster.Orchestrator, opts cluster.EtcdRestoreOptions) error {
		restoreOpts = opts
		return nil
	}

	discard := log.New(io.Discard, "", 0)

	shutdownArgs := []string{
		"--config", "/ignored.yaml",
		"--skip-etcd-snapshot",
		"--skip-drain",
		"--mode", "cluster-only",
		"--reason", "drill",
	}
	if err := runShutdown(discard, shutdownArgs); err != nil {
		t.Fatalf("runShutdown failed: %v", err)
	}
	shutdownOK := shutdownOpts.SkipEtcdSnapshot &&
		shutdownOpts.SkipDrain &&
		shutdownOpts.Mode == "cluster-only" &&
		shutdownOpts.Reason == "drill"
	if !shutdownOK {
		t.Fatalf("unexpected shutdown options: %+v", shutdownOpts)
	}

	restoreArgs := []string{
		"--config", "/ignored.yaml",
		"--control-plane", "titan-0a",
		"--snapshot", "/tmp/snap",
	}
	if err := runEtcdRestore(discard, restoreArgs); err != nil {
		t.Fatalf("runEtcdRestore failed: %v", err)
	}
	restoreOK := restoreOpts.ControlPlane == "titan-0a" && restoreOpts.SnapshotPath == "/tmp/snap"
	if !restoreOK {
		t.Fatalf("unexpected etcd restore options: %+v", restoreOpts)
	}
}

// TestRunDaemonAndIntentHooks exercises runDaemon and runIntent against stubbed daemon and intent hooks.
// Signature: TestRunDaemonAndIntentHooks(t *testing.T).
// Why: covers daemon context-cancel behavior and injected intent read/write hooks.
func TestRunDaemonAndIntentHooks(t *testing.T) {
	defer stubCommandHandlerHooks()()

	cfgPath := writeTestConfig(t)
	discard := log.New(io.Discard, "", 0)

	buildOrchestratorCommand = func(_ *log.Logger, _ string, dryRun bool) (config.Config, *cluster.Orchestrator, error) {
		loaded, loadErr := config.Load(cfgPath)
		if loadErr != nil {
			return config.Config{}, nil, loadErr
		}
		loaded.UPS.Enabled = true
		return loaded, newTestOrchestrator(loaded, dryRun), nil
	}
	buildUPSTargetsCommand = func(_ config.Config) ([]service.Target, error) {
		targets := []service.Target{{Name: "Pyrphoros", Target: "pyrphoros@localhost"}}
		return targets, nil
	}
	// The daemon hook reports cancellation; runDaemon is expected to treat that as success.
	daemonRunCommand = func(_ context.Context, _ *service.Daemon) error {
		return context.Canceled
	}
	daemonErr := runDaemon(discard, []string{"--config", cfgPath})
	if daemonErr != nil {
		t.Fatalf("runDaemon should ignore context cancellation, got %v", daemonErr)
	}

	// Read path: the hook returns a fixed intent.
	readIntentCommand = func(_ string) (state.Intent, error) {
		intent := state.Intent{
			State:     state.IntentNormal,
			Reason:    "ok",
			Source:    "unit",
			UpdatedAt: time.Now().UTC(),
		}
		return intent, nil
	}
	readErr := runIntent(discard, []string{"--config", cfgPath})
	if readErr != nil {
		t.Fatalf("runIntent read failed: %v", readErr)
	}

	// Write path: the hook checks the payload and records that it was called.
	hookInvoked := false
	writeIntentCommand = func(_ string, stateValue, reason, source string) error {
		hookInvoked = true
		if stateValue == state.IntentNormal && reason == "unit" && source == "tester" {
			return nil
		}
		t.Fatalf("unexpected intent write payload state=%s reason=%s source=%s", stateValue, reason, source)
		return nil
	}
	writeArgs := []string{
		"--config", cfgPath,
		"--set", state.IntentNormal,
		"--reason", "unit",
		"--source", "tester",
		"--execute",
	}
	writeErr := runIntent(discard, writeArgs)
	if writeErr != nil {
		t.Fatalf("runIntent write failed: %v", writeErr)
	}
	if !hookInvoked {
		t.Fatalf("expected write intent hook call")
	}
}

// TestRunStatusReportsSnapshotAndJSON writes a startup progress fixture and checks what runStatus logs and prints.
// Signature: TestRunStatusReportsSnapshotAndJSON(t *testing.T).
// Why: covers runStatus checklist/phase reporting and JSON payload emission from live startup progress.
func TestRunStatusReportsSnapshotAndJSON(t *testing.T) {
	defer stubCommandHandlerHooks()()

	cfg := minimalHandlerConfig(t)
	cfgPath := writeTestConfig(t)
	buildOrchestratorCommand = func(_ *log.Logger, _ string, dryRun bool) (config.Config, *cluster.Orchestrator, error) {
		return cfg, newTestOrchestrator(cfg, dryRun), nil
	}

	// Fixture: a run still in progress, with one failed, one passed and two running checks.
	checks := map[string]startupCheckRecord{
		"service-checklist":          {Status: "running", Detail: "waiting"},
		"critical-service-endpoints": {Status: "failed", Detail: "monitoring/victoria-metrics-single-server endpoints=0"},
		"workload-convergence":       {Status: "passed", Detail: "ok"},
		"phase":                      {Status: "running", Detail: "waiting for readiness"},
	}
	startedAt := time.Now().UTC().Add(-30 * time.Second)
	snapshot := startupStatusSnapshot{
		StartedAt: startedAt,
		Status:    "running",
		Phase:     "convergence-checks",
		Reason:    "drill",
		Checks:    checks,
		AutoHeals: []string{"restored critical endpoint backends: monitoring/statefulset/victoria-metrics-single-server"},
	}
	fixturePath := filepath.Join(cfg.State.Dir, "startup-progress.json")
	writeStartupStatusFixture(t, fixturePath, snapshot)

	var logBuf strings.Builder
	statusLogger := log.New(&logBuf, "", 0)
	printed := captureStdout(t, func() {
		err := runStatus(statusLogger, []string{"--config", cfgPath, "--json"})
		if err != nil {
			t.Fatalf("runStatus failed: %v", err)
		}
	})

	logged := logBuf.String()
	if !strings.Contains(logged, "startup_phase=convergence-checks") {
		t.Fatalf("expected startup phase in logger output, got:\n%s", logged)
	}
	if !strings.Contains(logged, "startup_failed_check=critical-service-endpoints") {
		t.Fatalf("expected failed checklist output, got:\n%s", logged)
	}
	if !strings.Contains(printed, `"phase": "convergence-checks"`) {
		t.Fatalf("expected json status payload on stdout, got:\n%s", printed)
	}
}

// TestRunStatusLogsSnapshotReadError plants a malformed progress file and checks that runStatus logs it and still succeeds.
// Signature: TestRunStatusLogsSnapshotReadError(t *testing.T).
// Why: covers malformed snapshot parsing path without failing status command.
func TestRunStatusLogsSnapshotReadError(t *testing.T) {
	defer stubCommandHandlerHooks()()

	cfg := minimalHandlerConfig(t)
	cfgPath := writeTestConfig(t)
	buildOrchestratorCommand = func(_ *log.Logger, _ string, dryRun bool) (config.Config, *cluster.Orchestrator, error) {
		return cfg, newTestOrchestrator(cfg, dryRun), nil
	}

	// Write deliberately broken JSON where the startup progress snapshot lives.
	brokenPath := filepath.Join(cfg.State.Dir, "startup-progress.json")
	mkdirErr := os.MkdirAll(filepath.Dir(brokenPath), 0o755)
	if mkdirErr != nil {
		t.Fatalf("mkdir state dir: %v", mkdirErr)
	}
	writeErr := os.WriteFile(brokenPath, []byte("{bad-json"), 0o644)
	if writeErr != nil {
		t.Fatalf("write malformed progress file: %v", writeErr)
	}

	var logBuf strings.Builder
	statusErr := runStatus(log.New(&logBuf, "", 0), []string{"--config", cfgPath})
	if statusErr != nil {
		t.Fatalf("runStatus should continue on snapshot parse error: %v", statusErr)
	}
	if logged := logBuf.String(); !strings.Contains(logged, "startup_status_read_error=") {
		t.Fatalf("expected snapshot read error log, got:\n%s", logged)
	}
}

// TestRunStartupAdditionalBranches checks two runStartup error branches, each in its own subtest.
// Signature: TestRunStartupAdditionalBranches(t *testing.T).
// Why: expands startup handler coverage for guard and UPS target build error branches.
//
// Each subtest takes its own hook snapshot and restores it before the next one
// starts. Re-stubbing mid-function without restoring first would snapshot the
// first scenario's overrides instead of the real hooks, so the second scenario
// would silently inherit the handoff/guard stubs.
func TestRunStartupAdditionalBranches(t *testing.T) {
	logger := log.New(io.Discard, "", 0)

	// Peer with auto-failover: both the handoff hook and the coordinator guard
	// hook return errors, and runStartup is expected to fail.
	t.Run("peer-guard-evaluation-error", func(t *testing.T) {
		restore := stubCommandHandlerHooks()
		defer restore()

		buildOrchestratorCommand = func(_ *log.Logger, _ string, dryRun bool) (config.Config, *cluster.Orchestrator, error) {
			cfg := minimalHandlerConfig(t)
			cfg.Coordination.Role = "peer"
			return cfg, newTestOrchestrator(cfg, dryRun), nil
		}
		tryPeerBootstrapHandoffCommand = func(_ context.Context, _ config.Config, _ *log.Logger) (bool, error) {
			return false, errors.New("handoff failed")
		}
		coordinatorAllowsPeerFallbackCommand = func(_ context.Context, _ config.Config, _ *log.Logger) (bool, string, error) {
			return false, "", errors.New("guard unavailable")
		}
		if err := runStartup(logger, []string{"--config", "/ignored.yaml", "--execute", "--auto-peer-failover"}); err == nil {
			t.Fatalf("expected startup guard evaluation failure")
		}
	})

	// Coordinator with UPS enabled: the UPS target builder fails, and
	// runStartup is expected to fail.
	t.Run("ups-target-build-error", func(t *testing.T) {
		restore := stubCommandHandlerHooks()
		defer restore()

		buildOrchestratorCommand = func(_ *log.Logger, _ string, dryRun bool) (config.Config, *cluster.Orchestrator, error) {
			cfg := minimalHandlerConfig(t)
			cfg.Coordination.Role = "coordinator"
			cfg.UPS.Enabled = true
			return cfg, newTestOrchestrator(cfg, dryRun), nil
		}
		buildUPSTargetsCommand = func(_ config.Config) ([]service.Target, error) {
			return nil, errors.New("bad ups config")
		}
		if err := runStartup(logger, []string{"--config", "/ignored.yaml", "--execute"}); err == nil {
			t.Fatalf("expected UPS target build error")
		}
	})
}

// TestRunStartupPeerFallbackAllowedRunsStartup checks that a peer runs startup once the coordinator guard allows fallback.
// Signature: TestRunStartupPeerFallbackAllowedRunsStartup(t *testing.T).
// Why: covers peer auto-failover branch where coordinator allows local startup fallback.
func TestRunStartupPeerFallbackAllowedRunsStartup(t *testing.T) {
	defer stubCommandHandlerHooks()()

	// Record whether the startup hook was reached.
	ranStartup := false
	startupOrchestratorCommand = func(_ context.Context, _ *cluster.Orchestrator, _ cluster.StartupOptions) error {
		ranStartup = true
		return nil
	}
	// The guard permits local fallback...
	coordinatorAllowsPeerFallbackCommand = func(_ context.Context, _ config.Config, _ *log.Logger) (bool, string, error) {
		return true, "allowed", nil
	}
	// ...and the handoff hook reports that no handoff took place.
	tryPeerBootstrapHandoffCommand = func(_ context.Context, _ config.Config, _ *log.Logger) (bool, error) {
		return false, nil
	}
	buildOrchestratorCommand = func(_ *log.Logger, _ string, dryRun bool) (config.Config, *cluster.Orchestrator, error) {
		cfg := minimalHandlerConfig(t)
		cfg.Coordination.Role = "peer"
		cfg.UPS.Enabled = false
		return cfg, newTestOrchestrator(cfg, dryRun), nil
	}

	args := []string{"--config", "/ignored.yaml", "--execute", "--auto-peer-failover"}
	err := runStartup(log.New(io.Discard, "", 0), args)
	if err != nil {
		t.Fatalf("expected startup fallback success, got %v", err)
	}
	if !ranStartup {
		t.Fatalf("expected startup fallback execution")
	}
}

// TestRunStartupAllowOnBatterySkipsSafetyCheck checks that runStartup succeeds with --allow-on-battery while the UPS target builder is rigged to fail.
// Signature: TestRunStartupAllowOnBatterySkipsSafetyCheck(t *testing.T).
// Why: covers startup battery override branch that intentionally bypasses power safety gating.
func TestRunStartupAllowOnBatterySkipsSafetyCheck(t *testing.T) {
	defer stubCommandHandlerHooks()()

	startupOrchestratorCommand = func(_ context.Context, _ *cluster.Orchestrator, _ cluster.StartupOptions) error {
		return nil
	}
	// Tripwire: presumably any call to the UPS target builder surfaces as a
	// runStartup failure, which is what the assertion below relies on.
	buildUPSTargetsCommand = func(_ config.Config) ([]service.Target, error) {
		return nil, errors.New("should not be called")
	}
	buildOrchestratorCommand = func(_ *log.Logger, _ string, dryRun bool) (config.Config, *cluster.Orchestrator, error) {
		cfg := minimalHandlerConfig(t)
		cfg.Coordination.Role = "coordinator"
		cfg.UPS.Enabled = true
		cfg.Coordination.AllowStartupOnBattery = false
		return cfg, newTestOrchestrator(cfg, dryRun), nil
	}

	args := []string{"--config", "/ignored.yaml", "--execute", "--allow-on-battery"}
	err := runStartup(log.New(io.Discard, "", 0), args)
	if err != nil {
		t.Fatalf("expected allow-on-battery startup success, got %v", err)
	}
}

// stubCommandHandlerHooks snapshots the command hook variables and returns a func that reinstates them.
// Signature: stubCommandHandlerHooks() func().
// Why: keeps handler hook overrides isolated to each test to avoid cross-test bleed.
func stubCommandHandlerHooks() func() {
	// Capture whatever is installed at call time; it installs no stubs of its own.
	var (
		savedBuild       = buildOrchestratorCommand
		savedHandoff     = tryPeerBootstrapHandoffCommand
		savedGuard       = coordinatorAllowsPeerFallbackCommand
		savedBuildUPS    = buildUPSTargetsCommand
		savedPowerSafe   = ensureStartupPowerSafeCommand
		savedStartup     = startupOrchestratorCommand
		savedShutdown    = shutdownOrchestratorCommand
		savedEtcdRestore = etcdRestoreOrchestratorCommand
		savedDaemon      = daemonRunCommand
		savedReadIntent  = readIntentCommand
		savedWriteIntent = writeIntentCommand
	)

	return func() {
		// Startup-path hooks.
		buildOrchestratorCommand, startupOrchestratorCommand = savedBuild, savedStartup
		tryPeerBootstrapHandoffCommand, coordinatorAllowsPeerFallbackCommand = savedHandoff, savedGuard
		buildUPSTargetsCommand, ensureStartupPowerSafeCommand = savedBuildUPS, savedPowerSafe
		// Shutdown, restore and daemon hooks.
		shutdownOrchestratorCommand, etcdRestoreOrchestratorCommand = savedShutdown, savedEtcdRestore
		daemonRunCommand = savedDaemon
		// Intent hooks.
		readIntentCommand, writeIntentCommand = savedReadIntent, savedWriteIntent
	}
}

// minimalHandlerConfig builds a small config.Config whose state paths all live under a fresh t.TempDir().
// Signature: minimalHandlerConfig(t *testing.T) config.Config.
// Why: command handler tests need a stable config baseline without reaching external systems.
func minimalHandlerConfig(t *testing.T) config.Config {
	t.Helper()

	root := t.TempDir()
	under := func(name string) string { return filepath.Join(root, name) }

	var cfg config.Config

	// Cluster topology and SSH access.
	cfg.ControlPlanes = []string{"titan-0a"}
	cfg.Workers = []string{"titan-22"}
	cfg.SSHUser = "atlas"
	cfg.SSHPort = 2277

	// Startup and shutdown tuning.
	cfg.Startup.MinimumBatteryPercent = 20
	cfg.Shutdown.DefaultBudgetSeconds = 1380
	cfg.Shutdown.HistoryMinSamples = 3
	cfg.Shutdown.EmergencyBudgetSec = 420
	cfg.Shutdown.EmergencyMinSamples = 3

	// State files, all rooted in the per-test temp directory.
	cfg.State.Dir = root
	cfg.State.ReportsDir = under("reports")
	cfg.State.RunHistoryPath = under("runs.json")
	cfg.State.LockPath = under("ananke.lock")
	cfg.State.IntentPath = under("intent.json")

	return cfg
}

// newTestOrchestrator assembles a cluster.Orchestrator from cfg with a runner and loggers that write to io.Discard.
// Signature: newTestOrchestrator(cfg config.Config, dryRun bool) *cluster.Orchestrator.
// Why: command handler tests need a concrete orchestrator pointer for method signatures while hooks intercept execution.
func newTestOrchestrator(cfg config.Config, dryRun bool) *cluster.Orchestrator {
	// The runner carries the dryRun flag and its own silent logger.
	runner := &execx.Runner{
		DryRun: dryRun,
		Logger: log.New(io.Discard, "", 0),
	}
	history := state.New(cfg.State.RunHistoryPath)
	silent := log.New(io.Discard, "", 0)

	return cluster.New(cfg, runner, history, silent)
}

// captureStdout runs fn with os.Stdout redirected into a pipe and returns everything fn wrote to it.
// Signature: captureStdout(t *testing.T, fn func()) string.
// Why: status command optionally emits JSON to stdout, so tests need deterministic capture.
//
// os.Stdout is restored and the pipe's write end closed in a deferred call, so
// stdout does not stay redirected when fn stops early through t.Fatalf
// (runtime.Goexit) or a panic. In that case the function does not return a
// value; the deferred cleanup still lets the reader goroutine finish.
func captureStdout(t *testing.T, fn func()) string {
	t.Helper()

	r, w, err := os.Pipe()
	if err != nil {
		t.Fatalf("pipe: %v", err)
	}

	orig := os.Stdout
	os.Stdout = w

	// Drain the pipe concurrently so fn cannot block once the pipe buffer fills.
	// The channel is buffered so the reader can exit even if nobody receives.
	done := make(chan string, 1)
	go func() {
		// Best effort: a read error still yields whatever was captured so far.
		b, _ := io.ReadAll(r)
		_ = r.Close()
		done <- string(b)
	}()

	func() {
		defer func() {
			os.Stdout = orig
			// Closing the write end delivers EOF to the reader goroutine.
			_ = w.Close()
		}()
		fn()
	}()

	return <-done
}