ananke/cmd/ananke/bootstrap_handoff_additional_test.go

package main

import (
	"context"
	"io"
	"log"
	"os"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/config"
)

// writeFakeSSH installs an executable stand-in for the ssh binary.
// Signature: writeFakeSSH(t *testing.T, script string) string.
// Why: peer handoff tests need deterministic SSH behavior without external hosts.
//
// The script is written to a file named "ssh" inside a fresh t.TempDir() and
// that directory is prepended to PATH through t.Setenv, so both the file and
// the PATH change are undone when the test finishes. Calling it again within
// the same test prepends another directory, so the newest script comes first
// in PATH. The returned value is the path of the written script; any write
// failure aborts the calling test.
func writeFakeSSH(t *testing.T, script string) string {
	t.Helper()
	binDir := t.TempDir()
	sshPath := filepath.Join(binDir, "ssh")
	if err := os.WriteFile(sshPath, []byte(script), 0o755); err != nil {
		t.Fatalf("write fake ssh: %v", err)
	}
	searchPath := binDir
	// Only append the inherited PATH when there is one: a trailing separator
	// would add an empty entry, which PATH lookups treat as the current directory.
	if inherited := os.Getenv("PATH"); inherited != "" {
		searchPath += string(os.PathListSeparator) + inherited
	}
	t.Setenv("PATH", searchPath)
	return sshPath
}

// TestTryPeerBootstrapHandoffSuccess expects tryPeerBootstrapHandoff to return
// ok=true and a nil error while a fake ssh that prints "ok" is first on PATH.
// Signature: TestTryPeerBootstrapHandoffSuccess(t *testing.T).
// Why: covers successful peer bootstrap handoff flow.
func TestTryPeerBootstrapHandoffSuccess(t *testing.T) {
	const okScript = "#!/usr/bin/env bash\nset -euo pipefail\necho ok\n"
	writeFakeSSH(t, okScript)

	coordination := config.Coordination{ForwardShutdownHost: "titan-db"}
	cfg := config.Config{SSHUser: "atlas", Coordination: coordination}
	logger := log.New(io.Discard, "", 0)

	handedOff, err := tryPeerBootstrapHandoff(context.Background(), cfg, logger)
	if err == nil && handedOff {
		return
	}
	t.Fatalf("expected successful handoff, got ok=%v err=%v", handedOff, err)
}

// TestTryPeerBootstrapHandoffRejectsMissingCoordinator expects an error when
// tryPeerBootstrapHandoff is called with a zero-value config.
// Signature: TestTryPeerBootstrapHandoffRejectsMissingCoordinator(t *testing.T).
// Why: covers peer handoff validation when no coordinator host is configured.
func TestTryPeerBootstrapHandoffRejectsMissingCoordinator(t *testing.T) {
	var emptyCfg config.Config
	logger := log.New(io.Discard, "", 0)

	_, err := tryPeerBootstrapHandoff(context.Background(), emptyCfg, logger)
	if err != nil {
		return
	}
	t.Fatalf("expected missing coordinator error")
}

// TestTryPeerBootstrapHandoffTimeout expects ok=false and a non-nil error when
// the fake ssh always exits 1 and the context carries a 50ms deadline.
// Signature: TestTryPeerBootstrapHandoffTimeout(t *testing.T).
// Why: covers timeout branch when coordinator handoff cannot be completed.
func TestTryPeerBootstrapHandoffTimeout(t *testing.T) {
	const failingScript = "#!/usr/bin/env bash\nset -euo pipefail\necho fail >&2\nexit 1\n"
	writeFakeSSH(t, failingScript)

	coordination := config.Coordination{ForwardShutdownHost: "titan-db"}
	cfg := config.Config{SSHUser: "atlas", Coordination: coordination}
	logger := log.New(io.Discard, "", 0)

	deadlineCtx, stop := context.WithTimeout(context.Background(), 50*time.Millisecond)
	defer stop()

	handedOff, err := tryPeerBootstrapHandoff(deadlineCtx, cfg, logger)
	if err != nil && !handedOff {
		return
	}
	t.Fatalf("expected timeout failure, got ok=%v err=%v", handedOff, err)
}

// TestTryPeerBootstrapHandoffUsesMappedHostAndUser records the arguments the
// fake ssh receives (one per line, in the file named by ANANKE_SSH_ARGS_FILE)
// and checks them for the per-node user/host override, the port, and the jump host.
// Signature: TestTryPeerBootstrapHandoffUsesMappedHostAndUser(t *testing.T).
// Why: covers target/user/jump argument composition for coordinator handoff SSH calls.
func TestTryPeerBootstrapHandoffUsesMappedHostAndUser(t *testing.T) {
	capturePath := filepath.Join(t.TempDir(), "ssh-args.txt")
	t.Setenv("ANANKE_SSH_ARGS_FILE", capturePath)
	writeFakeSSH(t, "#!/usr/bin/env bash\nset -euo pipefail\nprintf '%s\\n' \"$@\" > \"$ANANKE_SSH_ARGS_FILE\"\necho ok\n")

	nodeUsers := map[string]string{"titan-db": "override-user"}
	nodeHosts := map[string]string{"titan-db": "10.0.0.55"}
	cfg := config.Config{
		SSHUser:      "atlas",
		SSHPort:      2222,
		SSHNodeUsers: nodeUsers,
		SSHNodeHosts: nodeHosts,
		SSHJumpHost:  "titan-jh",
		Coordination: config.Coordination{ForwardShutdownHost: "titan-db"},
	}

	handedOff, err := tryPeerBootstrapHandoff(context.Background(), cfg, log.New(io.Discard, "", 0))
	if err != nil || !handedOff {
		t.Fatalf("expected successful handoff, got ok=%v err=%v", handedOff, err)
	}

	raw, err := os.ReadFile(capturePath)
	if err != nil {
		t.Fatalf("read captured ssh args: %v", err)
	}
	captured := string(raw)

	// Each check lists substrings that must all appear in the captured
	// arguments; the first missing one aborts the test with that check's message.
	checks := []struct {
		needles []string
		failure string
	}{
		{[]string{"override-user@10.0.0.55"}, "expected mapped target in args"},
		{[]string{"-p", "2222"}, "expected port args in ssh invocation"},
		{[]string{"-J", "titan-jh:2222"}, "expected jump host args in ssh invocation"},
	}
	for _, check := range checks {
		for _, needle := range check.needles {
			if !strings.Contains(captured, needle) {
				t.Fatalf("%s, got:\n%s", check.failure, captured)
			}
		}
	}
}

// TestCoordinatorAllowsFallbackStates runs one orchestration or CLI step.
// Signature: TestCoordinatorAllowsFallbackStates(t *testing.T).
// Why: covers coordinator guard logic for bootstrap-active and normal intent states.
func TestCoordinatorAllowsFallbackStates(t *testing.T) {
	// bootstrap active -> disallow fallback
	writeFakeSSH(t, "#!/usr/bin/env bash\nset -euo pipefail\necho __ANANKE_BOOTSTRAP_ACTIVE__\necho intent=normal\n")
	cfg := config.Config{
		SSHUser: "atlas",
		Coordination: config.Coordination{
			ForwardShutdownHost:   "titan-db",
			StartupGuardMaxAgeSec: 60,
		},
	}
	allowed, reason, err := coordinatorAllowsPeerFallbackStartup(context.Background(), cfg, log.New(io.Discard, "", 0))
	if err != nil {
		t.Fatalf("guard check failed: %v", err)
	}
	if allowed || !strings.Contains(reason, "bootstrap service is active") {
		t.Fatalf("expected disallow for bootstrap active, got allowed=%v reason=%q", allowed, reason)
	}

	// coordinator normal -> allow fallback
	writeFakeSSH(t, "#!/usr/bin/env bash\nset -euo pipefail\necho __ANANKE_BOOTSTRAP_IDLE__\necho intent=normal reason=\"ok\" source=test updated_at=2026-04-07T00:00:00Z\n")
	allowed, reason, err = coordinatorAllowsPeerFallbackStartup(context.Background(), cfg, log.New(io.Discard, "", 0))
	if err != nil {
		t.Fatalf("guard check failed: %v", err)
	}
	if !allowed || !strings.Contains(reason, "intent is normal") {
		t.Fatalf("expected allow for normal intent, got allowed=%v reason=%q", allowed, reason)
	}
}

// TestCoordinatorAllowsFallbackOnUnreachableCoordinator expects the guard to
// allow fallback, without returning an error, when the fake ssh exits 255.
// Signature: TestCoordinatorAllowsFallbackOnUnreachableCoordinator(t *testing.T).
// Why: covers fallback-allowed path when coordinator cannot be reached.
func TestCoordinatorAllowsFallbackOnUnreachableCoordinator(t *testing.T) {
	const unreachableScript = "#!/usr/bin/env bash\nset -euo pipefail\nexit 255\n"
	writeFakeSSH(t, unreachableScript)

	coordination := config.Coordination{ForwardShutdownHost: "titan-db"}
	cfg := config.Config{SSHUser: "atlas", Coordination: coordination}
	logger := log.New(io.Discard, "", 0)

	allowed, reason, err := coordinatorAllowsPeerFallbackStartup(context.Background(), cfg, logger)
	if err != nil {
		t.Fatalf("expected no hard error, got %v", err)
	}
	if allowed && strings.Contains(reason, "coordinator unreachable") {
		return
	}
	t.Fatalf("expected coordinator-unreachable allow, got allowed=%v reason=%q", allowed, reason)
}

// TestCoordinatorAllowsFallbackWhenNoCoordinatorConfigured expects the guard to
// allow fallback with a "no coordinator configured" reason for a zero-value config.
// Signature: TestCoordinatorAllowsFallbackWhenNoCoordinatorConfigured(t *testing.T).
// Why: covers no-coordinator short-circuit path for peer fallback checks.
func TestCoordinatorAllowsFallbackWhenNoCoordinatorConfigured(t *testing.T) {
	var emptyCfg config.Config
	logger := log.New(io.Discard, "", 0)

	allowed, reason, err := coordinatorAllowsPeerFallbackStartup(context.Background(), emptyCfg, logger)
	if err != nil {
		t.Fatalf("expected no error, got %v", err)
	}
	if allowed && strings.Contains(reason, "no coordinator configured") {
		return
	}
	t.Fatalf("unexpected guard result allowed=%v reason=%q", allowed, reason)
}

// TestCoordinatorAllowsFallbackIntentStateMatrix feeds the guard one intent
// line per subtest (always preceded by the bootstrap-idle marker) and checks
// both the allow/deny decision and a case-insensitive substring of the reason.
// Signature: TestCoordinatorAllowsFallbackIntentStateMatrix(t *testing.T).
// Why: covers peer fallback guard behavior for each coordinator intent branch.
func TestCoordinatorAllowsFallbackIntentStateMatrix(t *testing.T) {
	now := time.Now().UTC()
	// stamp renders an RFC 3339 timestamp that lies the given duration in the past.
	stamp := func(age time.Duration) string {
		return now.Add(-age).Format(time.RFC3339)
	}
	fresh := stamp(0)
	stale := stamp(20 * time.Minute)
	oldComplete := stamp(5 * time.Minute)

	type guardCase struct {
		name          string
		intentLine    string
		wantAllowed   bool
		wantReasonSub string
	}
	cases := []guardCase{
		{"shutting_down_fresh_blocks", intentOutputLine("shutting_down", "fresh", "test", fresh), false, "intent=shutting_down"},
		{"shutting_down_stale_allows", intentOutputLine("shutting_down", "stale", "test", stale), true, "shutdown intent stale"},
		{"startup_in_progress_fresh_blocks", intentOutputLine("startup_in_progress", "fresh", "test", fresh), false, "intent=startup_in_progress"},
		{"startup_in_progress_stale_allows", intentOutputLine("startup_in_progress", "stale", "test", stale), true, "startup intent stale"},
		// No updated_at field at all, so the guard has no age to work with.
		{"shutdown_complete_without_age_blocks", `intent=shutdown_complete reason="unknown" source=test`, false, "unknown age"},
		{"shutdown_complete_recent_blocks", intentOutputLine("shutdown_complete", "recent", "test", fresh), false, "recently completed shutdown"},
		{"shutdown_complete_old_allows", intentOutputLine("shutdown_complete", "old", "test", oldComplete), true, "old enough"},
		{"unknown_state_blocks", intentOutputLine("mystery_state", "unknown", "test", fresh), false, "unknown"},
	}

	// The same config value is passed (by value) to every subtest.
	cfg := config.Config{
		SSHUser: "atlas",
		Coordination: config.Coordination{
			ForwardShutdownHost:   "titan-db",
			StartupGuardMaxAgeSec: 60,
		},
		Startup: config.Startup{ShutdownCooldownSeconds: 45},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			script := "#!/usr/bin/env bash\nset -euo pipefail\necho __ANANKE_BOOTSTRAP_IDLE__\necho '" + tc.intentLine + "'\n"
			writeFakeSSH(t, script)

			allowed, reason, err := coordinatorAllowsPeerFallbackStartup(context.Background(), cfg, log.New(io.Discard, "", 0))
			if err != nil {
				t.Fatalf("guard check failed: %v", err)
			}
			if allowed != tc.wantAllowed {
				t.Fatalf("unexpected allowed=%v want=%v reason=%q", allowed, tc.wantAllowed, reason)
			}
			haystack, needle := strings.ToLower(reason), strings.ToLower(tc.wantReasonSub)
			if !strings.Contains(haystack, needle) {
				t.Fatalf("unexpected reason=%q want substring %q", reason, tc.wantReasonSub)
			}
		})
	}
}

// TestCoordinatorGuardRejectsUnparseableIntent expects the guard to return an
// error when the fake ssh prints the idle marker followed by a line that is
// not an intent record.
// Signature: TestCoordinatorGuardRejectsUnparseableIntent(t *testing.T).
// Why: covers parse-error path when coordinator output is malformed.
func TestCoordinatorGuardRejectsUnparseableIntent(t *testing.T) {
	const malformedScript = "#!/usr/bin/env bash\nset -euo pipefail\necho __ANANKE_BOOTSTRAP_IDLE__\necho gibberish-output\n"
	writeFakeSSH(t, malformedScript)

	coordination := config.Coordination{ForwardShutdownHost: "titan-db"}
	cfg := config.Config{SSHUser: "atlas", Coordination: coordination}
	logger := log.New(io.Discard, "", 0)

	_, _, err := coordinatorAllowsPeerFallbackStartup(context.Background(), cfg, logger)
	if err != nil {
		return
	}
	t.Fatalf("expected parse error for malformed coordinator output")
}

// TestResolveSSHPathsPreferExplicitValues expects resolveSSHConfigFile and
// resolveSSHIdentityFile to return the paths set explicitly on the config.
// Signature: TestResolveSSHPathsPreferExplicitValues(t *testing.T).
// Why: covers explicit ssh config/identity path branches in startup handoff helpers.
func TestResolveSSHPathsPreferExplicitValues(t *testing.T) {
	const (
		wantConfig   = "/tmp/ssh-config"
		wantIdentity = "/tmp/ssh-key"
	)
	cfg := config.Config{SSHConfigFile: wantConfig, SSHIdentityFile: wantIdentity}

	gotConfig := resolveSSHConfigFile(cfg)
	if gotConfig != wantConfig {
		t.Fatalf("unexpected config path: %q", gotConfig)
	}
	gotIdentity := resolveSSHIdentityFile(cfg)
	if gotIdentity != wantIdentity {
		t.Fatalf("unexpected identity path: %q", gotIdentity)
	}
}

// TestResolveSSHPathCandidatesAndCooldownDefaults swaps the package-level
// sshConfigCandidates/sshIdentityCandidates lists to point at existing and then
// missing files, checking what the resolvers return for an empty config, and
// finally checks startupShutdownCooldown for an unset and a configured value.
// Signature: TestResolveSSHPathCandidatesAndCooldownDefaults(t *testing.T).
// Why: covers candidate-list fallback branches and startup cooldown default logic.
func TestResolveSSHPathCandidatesAndCooldownDefaults(t *testing.T) {
	dir := t.TempDir()
	cfgPath := filepath.Join(dir, "config")
	idPath := filepath.Join(dir, "id_ed25519")

	fixtures := []struct{ label, path, body string }{
		{"cfg", cfgPath, "host *"},
		{"id", idPath, "key"},
	}
	for _, fixture := range fixtures {
		if err := os.WriteFile(fixture.path, []byte(fixture.body), 0o600); err != nil {
			t.Fatalf("write %s candidate: %v", fixture.label, err)
		}
	}

	// Snapshot copies of the package-level candidate lists and put them back
	// when the test returns.
	savedCfg := make([]string, len(sshConfigCandidates))
	copy(savedCfg, sshConfigCandidates)
	savedID := make([]string, len(sshIdentityCandidates))
	copy(savedID, sshIdentityCandidates)
	defer func() {
		sshConfigCandidates, sshIdentityCandidates = savedCfg, savedID
	}()

	var emptyCfg config.Config

	// Candidates that exist on disk are returned as-is.
	sshConfigCandidates, sshIdentityCandidates = []string{cfgPath}, []string{idPath}
	if got := resolveSSHConfigFile(emptyCfg); got != cfgPath {
		t.Fatalf("expected cfg candidate %q, got %q", cfgPath, got)
	}
	if got := resolveSSHIdentityFile(emptyCfg); got != idPath {
		t.Fatalf("expected identity candidate %q, got %q", idPath, got)
	}

	// Candidates that do not exist yield an empty path.
	sshConfigCandidates = []string{filepath.Join(dir, "missing-config")}
	sshIdentityCandidates = []string{filepath.Join(dir, "missing-key")}
	if got := resolveSSHConfigFile(emptyCfg); got != "" {
		t.Fatalf("expected empty config fallback, got %q", got)
	}
	if got := resolveSSHIdentityFile(emptyCfg); got != "" {
		t.Fatalf("expected empty identity fallback, got %q", got)
	}

	defaultCooldown := startupShutdownCooldown(emptyCfg)
	if defaultCooldown != 45*time.Second {
		t.Fatalf("expected default cooldown 45s, got %s", defaultCooldown)
	}
	configured := config.Config{Startup: config.Startup{ShutdownCooldownSeconds: 90}}
	if got := startupShutdownCooldown(configured); got != 90*time.Second {
		t.Fatalf("expected configured cooldown 90s, got %s", got)
	}
}

// intentOutputLine formats one coordinator intent record of the form
// intent=<state> reason="<reason>" source=<source> updated_at=<timestamp>.
// Signature: intentOutputLine(stateValue, reason, source, updatedAt string) string.
// Why: keeps guard-branch fixtures concise and readable.
//
// Values are inserted verbatim; nothing is escaped or validated.
func intentOutputLine(stateValue, reason, source, updatedAt string) string {
	line := "intent=" + stateValue
	line += ` reason="` + reason + `"`
	line += " source=" + source
	line += " updated_at=" + updatedAt
	return line
}