ananke/testing/orchestrator/hooks_gap_matrix_part1_test.go

package orchestrator

import (
"context"
"fmt"
"io"
"log"
"os"
"path/filepath"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/cluster"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/execx"
"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookScalingStateSnapshotErrorBranches exercises the scaled-workload snapshot error branches.
// Signature: TestHookScalingStateSnapshotErrorBranches(t *testing.T).
// Why: drives write/read/restore snapshot failures so scale lifecycle coverage
// captures filesystem edge cases seen during recovery drills.
func TestHookScalingStateSnapshotErrorBranches(t *testing.T) {
t.Run("scale-down fails when state dir is a file", func(t *testing.T) {
cfg := lifecycleConfig(t)
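		// Put a regular file where the state directory should live so directory creation fails during scale-down.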
		statePath := filepath.Join(t.TempDir(), "state-as-file")
		if err := os.WriteFile(statePath, []byte("blocked"), 0o600); err != nil {
			t.Fatalf("write state blocker file: %v", err)
		}
		cfg.State.Dir = statePath
		cfg.State.RunHistoryPath = filepath.Join(filepath.Dir(statePath), "runs.json")
		cfg.State.IntentPath = filepath.Join(filepath.Dir(statePath), "intent.txt")
		cfg.State.LockPath = filepath.Join(filepath.Dir(statePath), "lock")
		recorder := &commandRecorder{}
		dispatch := lifecycleDispatcher(recorder)
		orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
		orch.SetCommandOverrides(dispatch, dispatch)
		err := orch.TestHookScaleDownApps(context.Background())
		if err == nil || !strings.Contains(err.Error(), "ensure state dir") {
			t.Fatalf("expected state-dir write failure, got %v", err)
		}
	})
t.Run("restore fails on corrupt snapshot and then succeeds", func(t *testing.T) {
cfg := lifecycleConfig(t)
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
snapshotPath := filepath.Join(cfg.State.Dir, "scaled-workloads.json")
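		// Seed the snapshot with invalid JSON first so the restore hook hits the decode-error branch.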
if err := os.WriteFile(snapshotPath, []byte("{bad-json"), 0o600); err != nil {
t.Fatalf("write corrupt snapshot: %v", err)
}
if err := orch.TestHookRestoreScaledApps(context.Background()); err == nil || !strings.Contains(err.Error(), "decode scaled workload snapshot") {
t.Fatalf("expected decode failure, got %v", err)
}
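		// A well-formed snapshot should then restore cleanly and be removed once applied.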
valid := `{"generated_at":"2026-01-01T00:00:00Z","entries":[{"namespace":"monitoring","kind":"deployment","name":"grafana","replicas":1}]}`
if err := os.WriteFile(snapshotPath, []byte(valid), 0o600); err != nil {
t.Fatalf("write valid snapshot: %v", err)
}
if err := orch.TestHookRestoreScaledApps(context.Background()); err != nil {
t.Fatalf("restore scaled apps from snapshot: %v", err)
}
if _, err := os.Stat(snapshotPath); !os.IsNotExist(err) {
t.Fatalf("expected restore to remove snapshot, stat err=%v", err)
}
})
}
// TestHookAccessGateFailureMatrix exercises the node access and SSH auth gate failure branches.
// Signature: TestHookAccessGateFailureMatrix(t *testing.T).
// Why: ensures node access/auth gating distinguishes auth-denied versus
// transient reachability failures and returns deterministic startup errors.
func TestHookAccessGateFailureMatrix(t *testing.T) {
	cfg := lifecycleConfig(t)
	cfg.Startup.RequireNodeSSHAuth = true
	cfg.Startup.NodeSSHAuthWaitSeconds = 1
	cfg.Startup.NodeSSHAuthPollSeconds = 1
	cfg.Shutdown.SSHParallelism = 2
	recorder := &commandRecorder{}
	base := lifecycleDispatcher(recorder)
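	// Fault-injecting dispatcher: titan-db fails the auth probe with a publickey denial and refuses the
	// sudo systemctl probe, titan-23 is unreachable, and everything else falls through to the base dispatcher.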
	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		switch {
		case name == "ssh" && strings.Contains(command, "__ANANKE_SSH_AUTH_OK__") && strings.Contains(command, "titan-db"):
			recorder.record(name, args)
			return "Permission denied (publickey)", fmt.Errorf("permission denied (publickey)")
		case name == "ssh" && strings.Contains(command, "__ANANKE_SSH_AUTH_OK__") && strings.Contains(command, "titan-23"):
			recorder.record(name, args)
			return "", fmt.Errorf("no route to host")
		case name == "ssh" && strings.Contains(command, "/usr/bin/systemctl --version") && strings.Contains(command, "titan-db"):
			recorder.record(name, args)
			return "", fmt.Errorf("sudo denied")
		default:
			return base(ctx, timeout, name, args...)
		}
	}
	orch, _ := newHookOrchestrator(t, cfg, run, run)
	if err := orch.TestHookReconcileNodeAccess(context.Background(), []string{"titan-db", "titan-23"}); err == nil || !strings.Contains(err.Error(), "access validation had") {
		t.Fatalf("expected access validation failure, got %v", err)
	}
	if err := orch.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db", "titan-23"}); err == nil || !strings.Contains(err.Error(), "ssh auth gate failed") {
		t.Fatalf("expected ssh auth gate failure, got %v", err)
	}
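	// Second orchestrator: every auth probe times out, so the SSH auth gate should report a poll
	// timeout rather than a hard denial.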
	pendingCfg := lifecycleConfig(t)
	pendingCfg.Startup.RequireNodeSSHAuth = true
	pendingCfg.Startup.NodeSSHAuthWaitSeconds = 1
	pendingCfg.Startup.NodeSSHAuthPollSeconds = 1
	runPending := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		if name == "ssh" && strings.Contains(command, "__ANANKE_SSH_AUTH_OK__") {
			return "", fmt.Errorf("connection timed out")
		}
		return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
	}
	orchPending, _ := newHookOrchestrator(t, pendingCfg, runPending, runPending)
	if err := orchPending.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db"}); err == nil || !strings.Contains(err.Error(), "did not pass within") {
		t.Fatalf("expected ssh auth timeout, got %v", err)
	}
}
// TestHookCoordinationPeerAndSnapshotFailureMatrix exercises peer intent guards and etcd snapshot verification.
// Signature: TestHookCoordinationPeerAndSnapshotFailureMatrix(t *testing.T).
// Why: covers stale-intent guard branches and strict etcd snapshot verification
// failures that are hard to hit through full startup integration alone.
func TestHookCoordinationPeerAndSnapshotFailureMatrix(t *testing.T) {
	newOrch := func(t *testing.T, cfg config.Config, run func(context.Context, time.Duration, string, ...string) (string, error)) *cluster.Orchestrator {
		t.Helper()
		orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
		orch.SetCommandOverrides(run, run)
		return orch
	}
	cfg := lifecycleConfig(t)
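	// Register titan-24 as the worker's single coordination peer so the startup guards poll its intent over SSH.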
	cfg.Coordination.PeerHosts = []string{"titan-24"}
	cfg.Coordination.Role = "worker"
	cfg.SSHManagedNodes = append(cfg.SSHManagedNodes, "titan-24")
	cfg.SSHNodeHosts["titan-24"] = "titan-24"
	t.Run("peer shutdown intent blocks startup", func(t *testing.T) {
		run := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(command, "ananke intent --config /etc/ananke/ananke.yaml") {
				now := time.Now().UTC().Format(time.RFC3339)
				return "__ANANKE_BOOTSTRAP_ACTIVE__\nintent=shutting_down reason=\"ups\" source=peer updated_at=" + now + "\n", nil
			}
			return "ok", nil
		}
		orch := newOrch(t, cfg, run)
		if err := orch.TestHookGuardPeerStartupIntents(context.Background()); err == nil || !strings.Contains(err.Error(), "active shutdown intent") {
			t.Fatalf("expected active shutdown block, got %v", err)
		}
	})
	t.Run("stale peer startup intent auto-clears", func(t *testing.T) {
		cleared := false
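		// The peer reports a startup intent that is 30 minutes old; the guard should treat it as stale
		// and clear it with the `--set normal` intent command instead of blocking startup.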
		run := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "ssh" && strings.Contains(command, "ananke intent --config /etc/ananke/ananke.yaml --set normal"):
				cleared = true
				return "ok", nil
			case name == "ssh" && strings.Contains(command, "ananke intent --config /etc/ananke/ananke.yaml"):
				now := time.Now().UTC().Add(-30 * time.Minute).Format(time.RFC3339)
				return "__ANANKE_BOOTSTRAP_IDLE__\nintent=startup_in_progress reason=\"stale\" source=peer updated_at=" + now + "\n", nil
			default:
				return "ok", nil
			}
		}
		orch := newOrch(t, cfg, run)
		if err := orch.TestHookGuardPeerStartupIntents(context.Background()); err != nil {
			t.Fatalf("expected stale startup intent to be cleared, got %v", err)
		}
		if !cleared {
			t.Fatalf("expected stale peer intent clear command to run")
		}
	})
	t.Run("verifyEtcdSnapshot strict validation branches", func(t *testing.T) {
		cases := []struct {
			name     string
			statOut  string
			statErr  error
			lsOut    string
			lsErr    error
			shaOut   string
			shaErr   error
			path     string
			wantPart string
		}{
			{name: "empty-path", path: " ", wantPart: "snapshot path is empty"},
			{name: "stat-error", path: "/snap", statErr: fmt.Errorf("stat failed"), wantPart: "verification failed"},
			{name: "size-parse-error", path: "/snap", statOut: "abc", wantPart: "parse size"},
			{name: "too-small", path: "/snap", statOut: "64", wantPart: "snapshot too small"},
			{name: "missing-in-list", path: "/snap", statOut: "2097152", lsOut: "/other", shaOut: strings.Repeat("a", 64), wantPart: "not present"},
			{name: "bad-sha", path: "/snap", statOut: "2097152", lsOut: "/snap", shaOut: "short", wantPart: "invalid sha256"},
		}
		for _, tc := range cases {
			tc := tc
			t.Run(tc.name, func(t *testing.T) {
				run := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
					command := name + " " + strings.Join(args, " ")
					switch {
					case name == "ssh" && strings.Contains(command, "stat -c %s"):
						return tc.statOut, tc.statErr
					case name == "ssh" && strings.Contains(command, "sha256sum"):
						return tc.shaOut, tc.shaErr
					case name == "ssh" && strings.Contains(command, "k3s etcd-snapshot ls"):
						return tc.lsOut, tc.lsErr
					default:
						return "ok", nil
					}
				}
				orch := newOrch(t, cfg, run)
				err := orch.TestHookVerifyEtcdSnapshot(context.Background(), "titan-db", tc.path)
				if err == nil || !strings.Contains(err.Error(), tc.wantPart) {
					t.Fatalf("expected %q error, got %v", tc.wantPart, err)
				}
			})
		}
	})
}