ananke/testing/orchestrator/hooks_gap_matrix_part1_test.go

package orchestrator

import (
"context"
"fmt"
"io"
"log"
"os"
"path/filepath"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/cluster"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/execx"
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookScalingStateSnapshotErrorBranches exercises the scaled-workload snapshot error branches.
// Signature: TestHookScalingStateSnapshotErrorBranches(t *testing.T).
// Why: drives write/read/restore snapshot failures so scale lifecycle coverage
// captures filesystem edge cases seen during recovery drills.
func TestHookScalingStateSnapshotErrorBranches(t *testing.T) {
t.Run("scale-down fails when state dir is a file", func(t *testing.T) {
cfg := lifecycleConfig(t)
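		// Put a regular file where the state directory should live so directory creation fails during scale-down.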
		statePath := filepath.Join(t.TempDir(), "state-as-file")
		if err := os.WriteFile(statePath, []byte("blocked"), 0o600); err != nil {
			t.Fatalf("write state blocker file: %v", err)
		}
		cfg.State.Dir = statePath
		cfg.State.RunHistoryPath = filepath.Join(filepath.Dir(statePath), "runs.json")
		cfg.State.IntentPath = filepath.Join(filepath.Dir(statePath), "intent.txt")
		cfg.State.LockPath = filepath.Join(filepath.Dir(statePath), "lock")
		recorder := &commandRecorder{}
		dispatch := lifecycleDispatcher(recorder)
		orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
		orch.SetCommandOverrides(dispatch, dispatch)
		err := orch.TestHookScaleDownApps(context.Background())
		if err == nil || !strings.Contains(err.Error(), "ensure state dir") {
			t.Fatalf("expected state-dir write failure, got %v", err)
		}
	})
t.Run("restore fails on corrupt snapshot and then succeeds", func(t *testing.T) {
cfg := lifecycleConfig(t)
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
snapshotPath := filepath.Join(cfg.State.Dir, "scaled-workloads.json")
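		// Seed the snapshot with invalid JSON first so the restore hook hits the decode-error branch.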
if err := os.WriteFile(snapshotPath, []byte("{bad-json"), 0o600); err != nil {
t.Fatalf("write corrupt snapshot: %v", err)
}
if err := orch.TestHookRestoreScaledApps(context.Background()); err == nil || !strings.Contains(err.Error(), "decode scaled workload snapshot") {
t.Fatalf("expected decode failure, got %v", err)
}
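		// A well-formed snapshot should then restore cleanly and be removed once applied.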
valid := `{"generated_at":"2026-01-01T00:00:00Z","entries":[{"namespace":"monitoring","kind":"deployment","name":"grafana","replicas":1}]}`
if err := os.WriteFile(snapshotPath, []byte(valid), 0o600); err != nil {
t.Fatalf("write valid snapshot: %v", err)
}
if err := orch.TestHookRestoreScaledApps(context.Background()); err != nil {
t.Fatalf("restore scaled apps from snapshot: %v", err)
}
if _, err := os.Stat(snapshotPath); !os.IsNotExist(err) {
t.Fatalf("expected restore to remove snapshot, stat err=%v", err)
}
})
}
// TestHookAccessGateFailureMatrix exercises the node access and SSH auth gate failure branches.
// Signature: TestHookAccessGateFailureMatrix(t *testing.T).
// Why: ensures node access/auth gating distinguishes auth-denied versus
// transient reachability failures and returns deterministic startup errors.
func TestHookAccessGateFailureMatrix(t *testing.T) {
	cfg := lifecycleConfig(t)
	cfg.Startup.RequireNodeSSHAuth = true
	cfg.Startup.NodeSSHAuthWaitSeconds = 1
	cfg.Startup.NodeSSHAuthPollSeconds = 1
	cfg.Shutdown.SSHParallelism = 2
	recorder := &commandRecorder{}
	base := lifecycleDispatcher(recorder)
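	// Fault-injecting dispatcher: titan-db fails the auth probe with a publickey denial and refuses the
	// sudo systemctl probe, titan-23 is unreachable, and everything else falls through to the base dispatcher.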
	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		switch {
		case name == "ssh" && strings.Contains(command, "__ANANKE_SSH_AUTH_OK__") && strings.Contains(command, "titan-db"):
			recorder.record(name, args)
			return "Permission denied (publickey)", fmt.Errorf("permission denied (publickey)")
		case name == "ssh" && strings.Contains(command, "__ANANKE_SSH_AUTH_OK__") && strings.Contains(command, "titan-23"):
			recorder.record(name, args)
			return "", fmt.Errorf("no route to host")
		case name == "ssh" && strings.Contains(command, "/usr/bin/systemctl --version") && strings.Contains(command, "titan-db"):
			recorder.record(name, args)
			return "", fmt.Errorf("sudo denied")
		default:
			return base(ctx, timeout, name, args...)
		}
	}
	orch, _ := newHookOrchestrator(t, cfg, run, run)
	if err := orch.TestHookReconcileNodeAccess(context.Background(), []string{"titan-db", "titan-23"}); err == nil || !strings.Contains(err.Error(), "access validation had") {
		t.Fatalf("expected access validation failure, got %v", err)
	}
	if err := orch.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db", "titan-23"}); err == nil || !strings.Contains(err.Error(), "ssh auth gate failed") {
		t.Fatalf("expected ssh auth gate failure, got %v", err)
	}
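	// Second orchestrator: every auth probe times out, so the SSH auth gate should report a poll
	// timeout rather than a hard denial.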
	pendingCfg := lifecycleConfig(t)
	pendingCfg.Startup.RequireNodeSSHAuth = true
	pendingCfg.Startup.NodeSSHAuthWaitSeconds = 1
	pendingCfg.Startup.NodeSSHAuthPollSeconds = 1
	runPending := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		if name == "ssh" && strings.Contains(command, "__ANANKE_SSH_AUTH_OK__") {
			return "", fmt.Errorf("connection timed out")
		}
		return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
	}
	orchPending, _ := newHookOrchestrator(t, pendingCfg, runPending, runPending)
	if err := orchPending.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db"}); err == nil || !strings.Contains(err.Error(), "did not pass within") {
		t.Fatalf("expected ssh auth timeout, got %v", err)
	}
}
// TestHookCoordinationPeerAndSnapshotFailureMatrix exercises peer intent guards and etcd snapshot verification.
// Signature: TestHookCoordinationPeerAndSnapshotFailureMatrix(t *testing.T).
// Why: covers stale-intent guard branches and strict etcd snapshot verification
// failures that are hard to hit through full startup integration alone.
func TestHookCoordinationPeerAndSnapshotFailureMatrix(t *testing.T) {
	newOrch := func(t *testing.T, cfg config.Config, run func(context.Context, time.Duration, string, ...string) (string, error)) *cluster.Orchestrator {
		t.Helper()
		orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
		orch.SetCommandOverrides(run, run)
		return orch
	}
	cfg := lifecycleConfig(t)
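	// Register titan-24 as the worker's single coordination peer so the startup guards poll its intent over SSH.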
	cfg.Coordination.PeerHosts = []string{"titan-24"}
	cfg.Coordination.Role = "worker"
	cfg.SSHManagedNodes = append(cfg.SSHManagedNodes, "titan-24")
	cfg.SSHNodeHosts["titan-24"] = "titan-24"
	t.Run("peer shutdown intent blocks startup", func(t *testing.T) {
		run := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(command, "ananke intent --config /etc/ananke/ananke.yaml") {
				now := time.Now().UTC().Format(time.RFC3339)
				return "__ANANKE_BOOTSTRAP_ACTIVE__\nintent=shutting_down reason=\"ups\" source=peer updated_at=" + now + "\n", nil
			}
			return "ok", nil
		}
		orch := newOrch(t, cfg, run)
		if err := orch.TestHookGuardPeerStartupIntents(context.Background()); err == nil || !strings.Contains(err.Error(), "active shutdown intent") {
			t.Fatalf("expected active shutdown block, got %v", err)
		}
	})
	t.Run("stale peer startup intent auto-clears", func(t *testing.T) {
		cleared := false
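		// The peer reports a startup intent that is 30 minutes old; the guard should treat it as stale
		// and clear it with the `--set normal` intent command instead of blocking startup.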
		run := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "ssh" && strings.Contains(command, "ananke intent --config /etc/ananke/ananke.yaml --set normal"):
				cleared = true
				return "ok", nil
			case name == "ssh" && strings.Contains(command, "ananke intent --config /etc/ananke/ananke.yaml"):
				now := time.Now().UTC().Add(-30 * time.Minute).Format(time.RFC3339)
				return "__ANANKE_BOOTSTRAP_IDLE__\nintent=startup_in_progress reason=\"stale\" source=peer updated_at=" + now + "\n", nil
			default:
				return "ok", nil
			}
		}
		orch := newOrch(t, cfg, run)
		if err := orch.TestHookGuardPeerStartupIntents(context.Background()); err != nil {
			t.Fatalf("expected stale startup intent to be cleared, got %v", err)
		}
		if !cleared {
			t.Fatalf("expected stale peer intent clear command to run")
		}
	})
	t.Run("verifyEtcdSnapshot strict validation branches", func(t *testing.T) {
		cases := []struct {
			name     string
			statOut  string
			statErr  error
			lsOut    string
			lsErr    error
			shaOut   string
			shaErr   error
			path     string
			wantPart string
		}{
			{name: "empty-path", path: " ", wantPart: "snapshot path is empty"},
			{name: "stat-error", path: "/snap", statErr: fmt.Errorf("stat failed"), wantPart: "verification failed"},
			{name: "size-parse-error", path: "/snap", statOut: "abc", wantPart: "parse size"},
			{name: "too-small", path: "/snap", statOut: "64", wantPart: "snapshot too small"},
			{name: "missing-in-list", path: "/snap", statOut: "2097152", lsOut: "/other", shaOut: strings.Repeat("a", 64), wantPart: "not present"},
			{name: "bad-sha", path: "/snap", statOut: "2097152", lsOut: "/snap", shaOut: "short", wantPart: "invalid sha256"},
		}
		for _, tc := range cases {
			tc := tc
			t.Run(tc.name, func(t *testing.T) {
				run := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
					command := name + " " + strings.Join(args, " ")
					switch {
					case name == "ssh" && strings.Contains(command, "stat -c %s"):
						return tc.statOut, tc.statErr
					case name == "ssh" && strings.Contains(command, "sha256sum"):
						return tc.shaOut, tc.shaErr
					case name == "ssh" && strings.Contains(command, "k3s etcd-snapshot ls"):
						return tc.lsOut, tc.lsErr
					default:
						return "ok", nil
					}
				}
				orch := newOrch(t, cfg, run)
				err := orch.TestHookVerifyEtcdSnapshot(context.Background(), "titan-db", tc.path)
				if err == nil || !strings.Contains(err.Error(), tc.wantPart) {
					t.Fatalf("expected %q error, got %v", tc.wantPart, err)
				}
			})
		}
	})
}