package orchestrator

import (
	"context"
	"fmt"
	"io"
	"log"
	"os"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/execx"
	"scm.bstein.dev/bstein/ananke/internal/state"
)

// TestHookScalingStateSnapshotErrorBranches exercises filesystem failure
// branches of the scale-down/restore snapshot lifecycle: a state directory
// blocked by a regular file, a corrupt snapshot JSON, and a successful restore
// that must remove the snapshot afterwards.
func TestHookScalingStateSnapshotErrorBranches(t *testing.T) {
	t.Run("scale-down fails when state dir is a file", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// Create a regular file where the state directory should be, so the
		// orchestrator's "ensure state dir" step cannot mkdir over it.
		statePath := filepath.Join(t.TempDir(), "state-as-file")
		if err := os.WriteFile(statePath, []byte("blocked"), 0o600); err != nil {
			t.Fatalf("write state blocker file: %v", err)
		}
		cfg.State.Dir = statePath
		// Keep the sibling state files in a writable directory so only the
		// state-dir creation itself fails.
		cfg.State.RunHistoryPath = filepath.Join(filepath.Dir(statePath), "runs.json")
		cfg.State.IntentPath = filepath.Join(filepath.Dir(statePath), "intent.txt")
		cfg.State.LockPath = filepath.Join(filepath.Dir(statePath), "lock")
		recorder := &commandRecorder{}
		dispatch := lifecycleDispatcher(recorder)
		orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
		orch.SetCommandOverrides(dispatch, dispatch)
		err := orch.TestHookScaleDownApps(context.Background())
		if err == nil || !strings.Contains(err.Error(), "ensure state dir") {
			t.Fatalf("expected state-dir write failure, got %v", err)
		}
	})
	t.Run("restore fails on corrupt snapshot and then succeeds", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		snapshotPath := filepath.Join(cfg.State.Dir, "scaled-workloads.json")
		// First pass: invalid JSON must surface a decode error.
		if err := os.WriteFile(snapshotPath, []byte("{bad-json"), 0o600); err != nil {
			t.Fatalf("write corrupt snapshot: %v", err)
		}
		if err := orch.TestHookRestoreScaledApps(context.Background()); err == nil || !strings.Contains(err.Error(), "decode scaled workload snapshot") {
			t.Fatalf("expected decode failure, got %v", err)
		}
		// Second pass: a well-formed snapshot restores cleanly.
		valid := `{"generated_at":"2026-01-01T00:00:00Z","entries":[{"namespace":"monitoring","kind":"deployment","name":"grafana","replicas":1}]}`
		if err := os.WriteFile(snapshotPath, []byte(valid), 0o600); err != nil {
			t.Fatalf("write valid snapshot: %v", err)
		}
		if err := orch.TestHookRestoreScaledApps(context.Background()); err != nil {
			t.Fatalf("restore scaled apps from snapshot: %v", err)
		}
		// A successful restore is expected to consume (delete) the snapshot.
		if _, err := os.Stat(snapshotPath); !os.IsNotExist(err) {
			t.Fatalf("expected restore to remove snapshot, stat err=%v", err)
		}
	})
}

// TestHookAccessGateFailureMatrix drives the node access/auth gates through
// simulated SSH failures: auth denied on one node, unreachable on another, a
// sudo probe failure, and finally a persistent connection timeout that must
// trip the auth-wait deadline.
func TestHookAccessGateFailureMatrix(t *testing.T) {
	cfg := lifecycleConfig(t)
	cfg.Startup.RequireNodeSSHAuth = true
	cfg.Startup.NodeSSHAuthWaitSeconds = 1
	cfg.Startup.NodeSSHAuthPollSeconds = 1
	cfg.Shutdown.SSHParallelism = 2
	recorder := &commandRecorder{}
	base := lifecycleDispatcher(recorder)
	// Command override: fail the auth probe differently per node
	// (auth-denied for titan-db, unreachable for titan-23) and fail the
	// systemctl probe on titan-db; everything else falls through to the
	// standard lifecycle dispatcher.
	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		switch {
		case name == "ssh" && strings.Contains(command, "__ANANKE_SSH_AUTH_OK__") && strings.Contains(command, "titan-db"):
			recorder.record(name, args)
			return "Permission denied (publickey)", fmt.Errorf("permission denied (publickey)")
		case name == "ssh" && strings.Contains(command, "__ANANKE_SSH_AUTH_OK__") && strings.Contains(command, "titan-23"):
			recorder.record(name, args)
			return "", fmt.Errorf("no route to host")
		case name == "ssh" && strings.Contains(command, "/usr/bin/systemctl --version") && strings.Contains(command, "titan-db"):
			recorder.record(name, args)
			return "", fmt.Errorf("sudo denied")
		default:
			return base(ctx, timeout, name, args...)
		}
	}
	orch, _ := newHookOrchestrator(t, cfg, run, run)
	if err := orch.TestHookReconcileNodeAccess(context.Background(), []string{"titan-db", "titan-23"}); err == nil || !strings.Contains(err.Error(), "access validation had") {
		t.Fatalf("expected access validation failure, got %v", err)
	}
	if err := orch.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db", "titan-23"}); err == nil || !strings.Contains(err.Error(), "ssh auth gate failed") {
		t.Fatalf("expected ssh auth gate failure, got %v", err)
	}
	// Second orchestrator: the auth probe never succeeds (always times out),
	// so the 1s wait window must expire with a "did not pass within" error.
	pendingCfg := lifecycleConfig(t)
	pendingCfg.Startup.RequireNodeSSHAuth = true
	pendingCfg.Startup.NodeSSHAuthWaitSeconds = 1
	pendingCfg.Startup.NodeSSHAuthPollSeconds = 1
	runPending := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		if name == "ssh" && strings.Contains(command, "__ANANKE_SSH_AUTH_OK__") {
			return "", fmt.Errorf("connection timed out")
		}
		return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
	}
	orchPending, _ := newHookOrchestrator(t, pendingCfg, runPending, runPending)
	if err := orchPending.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db"}); err == nil || !strings.Contains(err.Error(), "did not pass within") {
		t.Fatalf("expected ssh auth timeout, got %v", err)
	}
}

// TestHookCoordinationPeerAndSnapshotFailureMatrix covers peer intent guard
// branches (active shutdown intent blocks startup; a stale startup intent is
// auto-cleared) and the strict etcd snapshot verification error branches
// (empty path, stat failure, size parse, minimum size, listing membership,
// sha256 shape).
func TestHookCoordinationPeerAndSnapshotFailureMatrix(t *testing.T) { newOrch := func(t *testing.T, cfg config.Config, run func(context.Context, time.Duration, string, ...string) (string, error)) *cluster.Orchestrator { t.Helper() orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0)) orch.SetCommandOverrides(run, run) return orch } cfg := lifecycleConfig(t) cfg.Coordination.PeerHosts = []string{"titan-24"} cfg.Coordination.Role = "worker" cfg.SSHManagedNodes = append(cfg.SSHManagedNodes, "titan-24") cfg.SSHNodeHosts["titan-24"] = "titan-24" t.Run("peer shutdown intent blocks startup", func(t *testing.T) { run := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") if name == "ssh" && strings.Contains(command, "ananke intent --config /etc/ananke/ananke.yaml") { now := time.Now().UTC().Format(time.RFC3339) return "__ANANKE_BOOTSTRAP_ACTIVE__\nintent=shutting_down reason=\"ups\" source=peer updated_at=" + now + "\n", nil } return "ok", nil } orch := newOrch(t, cfg, run) if err := orch.TestHookGuardPeerStartupIntents(context.Background()); err == nil || !strings.Contains(err.Error(), "active shutdown intent") { t.Fatalf("expected active shutdown block, got %v", err) } }) t.Run("stale peer startup intent auto-clears", func(t *testing.T) { cleared := false run := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") switch { case name == "ssh" && strings.Contains(command, "ananke intent --config /etc/ananke/ananke.yaml --set normal"): cleared = true return "ok", nil case name == "ssh" && strings.Contains(command, "ananke intent --config /etc/ananke/ananke.yaml"): now := time.Now().UTC().Add(-30 * time.Minute).Format(time.RFC3339) return "__ANANKE_BOOTSTRAP_IDLE__\nintent=startup_in_progress reason=\"stale\" source=peer updated_at=" + now + 
"\n", nil default: return "ok", nil } } orch := newOrch(t, cfg, run) if err := orch.TestHookGuardPeerStartupIntents(context.Background()); err != nil { t.Fatalf("expected stale startup intent to be cleared, got %v", err) } if !cleared { t.Fatalf("expected stale peer intent clear command to run") } }) t.Run("verifyEtcdSnapshot strict validation branches", func(t *testing.T) { cases := []struct { name string statOut string statErr error lsOut string lsErr error shaOut string shaErr error path string wantPart string }{ {name: "empty-path", path: " ", wantPart: "snapshot path is empty"}, {name: "stat-error", path: "/snap", statErr: fmt.Errorf("stat failed"), wantPart: "verification failed"}, {name: "size-parse-error", path: "/snap", statOut: "abc", wantPart: "parse size"}, {name: "too-small", path: "/snap", statOut: "64", wantPart: "snapshot too small"}, {name: "missing-in-list", path: "/snap", statOut: "2097152", lsOut: "/other", shaOut: strings.Repeat("a", 64), wantPart: "not present"}, {name: "bad-sha", path: "/snap", statOut: "2097152", lsOut: "/snap", shaOut: "short", wantPart: "invalid sha256"}, } for _, tc := range cases { tc := tc t.Run(tc.name, func(t *testing.T) { run := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") switch { case name == "ssh" && strings.Contains(command, "stat -c %s"): return tc.statOut, tc.statErr case name == "ssh" && strings.Contains(command, "sha256sum"): return tc.shaOut, tc.shaErr case name == "ssh" && strings.Contains(command, "k3s etcd-snapshot ls"): return tc.lsOut, tc.lsErr default: return "ok", nil } } orch := newOrch(t, cfg, run) err := orch.TestHookVerifyEtcdSnapshot(context.Background(), "titan-db", tc.path) if err == nil || !strings.Contains(err.Error(), tc.wantPart) { t.Fatalf("expected %q error, got %v", tc.wantPart, err) } }) } }) }