package orchestrator

import (
	"context"
	"fmt"
	"io"
	"log"
	"os"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/execx"
	"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookScalingStateSnapshotErrorBranches runs one orchestration or CLI step.
|
|
// Signature: TestHookScalingStateSnapshotErrorBranches(t *testing.T).
|
|
// Why: drives write/read/restore snapshot failures so scale lifecycle coverage
|
|
// captures filesystem edge cases seen during recovery drills.
|
|
func TestHookScalingStateSnapshotErrorBranches(t *testing.T) {
|
|
t.Run("scale-down fails when state dir is a file", func(t *testing.T) {
|
|
cfg := lifecycleConfig(t)
|
|
statePath := filepath.Join(t.TempDir(), "state-as-file")
|
|
if err := os.WriteFile(statePath, []byte("blocked"), 0o600); err != nil {
|
|
t.Fatalf("write state blocker file: %v", err)
|
|
}
|
|
cfg.State.Dir = statePath
|
|
cfg.State.RunHistoryPath = filepath.Join(filepath.Dir(statePath), "runs.json")
|
|
cfg.State.IntentPath = filepath.Join(filepath.Dir(statePath), "intent.txt")
|
|
cfg.State.LockPath = filepath.Join(filepath.Dir(statePath), "lock")
|
|
recorder := &commandRecorder{}
|
|
dispatch := lifecycleDispatcher(recorder)
|
|
orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
|
|
orch.SetCommandOverrides(dispatch, dispatch)
|
|
err := orch.TestHookScaleDownApps(context.Background())
|
|
if err == nil || !strings.Contains(err.Error(), "ensure state dir") {
|
|
t.Fatalf("expected state-dir write failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
t.Run("restore fails on corrupt snapshot and then succeeds", func(t *testing.T) {
|
|
cfg := lifecycleConfig(t)
|
|
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
|
snapshotPath := filepath.Join(cfg.State.Dir, "scaled-workloads.json")
|
|
if err := os.WriteFile(snapshotPath, []byte("{bad-json"), 0o600); err != nil {
|
|
t.Fatalf("write corrupt snapshot: %v", err)
|
|
}
|
|
if err := orch.TestHookRestoreScaledApps(context.Background()); err == nil || !strings.Contains(err.Error(), "decode scaled workload snapshot") {
|
|
t.Fatalf("expected decode failure, got %v", err)
|
|
}
|
|
|
|
valid := `{"generated_at":"2026-01-01T00:00:00Z","entries":[{"namespace":"monitoring","kind":"deployment","name":"grafana","replicas":1}]}`
|
|
if err := os.WriteFile(snapshotPath, []byte(valid), 0o600); err != nil {
|
|
t.Fatalf("write valid snapshot: %v", err)
|
|
}
|
|
if err := orch.TestHookRestoreScaledApps(context.Background()); err != nil {
|
|
t.Fatalf("restore scaled apps from snapshot: %v", err)
|
|
}
|
|
if _, err := os.Stat(snapshotPath); !os.IsNotExist(err) {
|
|
t.Fatalf("expected restore to remove snapshot, stat err=%v", err)
|
|
}
|
|
})
|
|
}
// TestHookAccessGateFailureMatrix runs one orchestration or CLI step.
|
|
// Signature: TestHookAccessGateFailureMatrix(t *testing.T).
|
|
// Why: ensures node access/auth gating distinguishes auth-denied versus
|
|
// transient reachability failures and returns deterministic startup errors.
|
|
func TestHookAccessGateFailureMatrix(t *testing.T) {
|
|
cfg := lifecycleConfig(t)
|
|
cfg.Startup.RequireNodeSSHAuth = true
|
|
cfg.Startup.NodeSSHAuthWaitSeconds = 1
|
|
cfg.Startup.NodeSSHAuthPollSeconds = 1
|
|
cfg.Shutdown.SSHParallelism = 2
|
|
|
|
recorder := &commandRecorder{}
|
|
base := lifecycleDispatcher(recorder)
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
switch {
|
|
case name == "ssh" && strings.Contains(command, "__ANANKE_SSH_AUTH_OK__") && strings.Contains(command, "titan-db"):
|
|
recorder.record(name, args)
|
|
return "Permission denied (publickey)", fmt.Errorf("permission denied (publickey)")
|
|
case name == "ssh" && strings.Contains(command, "__ANANKE_SSH_AUTH_OK__") && strings.Contains(command, "titan-23"):
|
|
recorder.record(name, args)
|
|
return "", fmt.Errorf("no route to host")
|
|
case name == "ssh" && strings.Contains(command, "/usr/bin/systemctl --version") && strings.Contains(command, "titan-db"):
|
|
recorder.record(name, args)
|
|
return "", fmt.Errorf("sudo denied")
|
|
default:
|
|
return base(ctx, timeout, name, args...)
|
|
}
|
|
}
|
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
|
|
if err := orch.TestHookReconcileNodeAccess(context.Background(), []string{"titan-db", "titan-23"}); err == nil || !strings.Contains(err.Error(), "access validation had") {
|
|
t.Fatalf("expected access validation failure, got %v", err)
|
|
}
|
|
if err := orch.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db", "titan-23"}); err == nil || !strings.Contains(err.Error(), "ssh auth gate failed") {
|
|
t.Fatalf("expected ssh auth gate failure, got %v", err)
|
|
}
|
|
|
|
pendingCfg := lifecycleConfig(t)
|
|
pendingCfg.Startup.RequireNodeSSHAuth = true
|
|
pendingCfg.Startup.NodeSSHAuthWaitSeconds = 1
|
|
pendingCfg.Startup.NodeSSHAuthPollSeconds = 1
|
|
runPending := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
if name == "ssh" && strings.Contains(command, "__ANANKE_SSH_AUTH_OK__") {
|
|
return "", fmt.Errorf("connection timed out")
|
|
}
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
orchPending, _ := newHookOrchestrator(t, pendingCfg, runPending, runPending)
|
|
if err := orchPending.TestHookWaitForNodeSSHAuth(context.Background(), []string{"titan-db"}); err == nil || !strings.Contains(err.Error(), "did not pass within") {
|
|
t.Fatalf("expected ssh auth timeout, got %v", err)
|
|
}
|
|
}
// TestHookCoordinationPeerAndSnapshotFailureMatrix runs one orchestration or CLI step.
// Signature: TestHookCoordinationPeerAndSnapshotFailureMatrix(t *testing.T).
// Why: covers stale-intent guard branches and strict etcd snapshot verification
// failures that are hard to hit through full startup integration alone.
func TestHookCoordinationPeerAndSnapshotFailureMatrix(t *testing.T) {
	// newOrch builds an orchestrator with both command overrides pointed at
	// the supplied run stub, so every exec in the hook goes through it.
	newOrch := func(t *testing.T, cfg config.Config, run func(context.Context, time.Duration, string, ...string) (string, error)) *cluster.Orchestrator {
		t.Helper()
		orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
		orch.SetCommandOverrides(run, run)
		return orch
	}

	// Shared config: one coordination peer (titan-24) that is also managed
	// over SSH, so the peer-intent guard actually contacts it.
	cfg := lifecycleConfig(t)
	cfg.Coordination.PeerHosts = []string{"titan-24"}
	cfg.Coordination.Role = "worker"
	cfg.SSHManagedNodes = append(cfg.SSHManagedNodes, "titan-24")
	cfg.SSHNodeHosts["titan-24"] = "titan-24"

	t.Run("peer shutdown intent blocks startup", func(t *testing.T) {
		// The stub answers the remote intent query with a fresh (current
		// timestamp) shutting_down intent; everything else succeeds.
		run := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(command, "ananke intent --config /etc/ananke/ananke.yaml") {
				now := time.Now().UTC().Format(time.RFC3339)
				return "__ANANKE_BOOTSTRAP_ACTIVE__\nintent=shutting_down reason=\"ups\" source=peer updated_at=" + now + "\n", nil
			}
			return "ok", nil
		}
		orch := newOrch(t, cfg, run)
		// A current shutdown intent on a peer must block this node's startup.
		if err := orch.TestHookGuardPeerStartupIntents(context.Background()); err == nil || !strings.Contains(err.Error(), "active shutdown intent") {
			t.Fatalf("expected active shutdown block, got %v", err)
		}
	})

	t.Run("stale peer startup intent auto-clears", func(t *testing.T) {
		cleared := false
		run := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			// NOTE: case order matters — the "--set normal" command also
			// contains the plain intent-query substring, so the clear case
			// must be matched first.
			switch {
			case name == "ssh" && strings.Contains(command, "ananke intent --config /etc/ananke/ananke.yaml --set normal"):
				cleared = true
				return "ok", nil
			case name == "ssh" && strings.Contains(command, "ananke intent --config /etc/ananke/ananke.yaml"):
				// Report a startup intent stamped 30 minutes ago; the guard is
				// expected to treat it as stale (exact cutoff lives in the
				// orchestrator — presumably under 30m; confirm there).
				now := time.Now().UTC().Add(-30 * time.Minute).Format(time.RFC3339)
				return "__ANANKE_BOOTSTRAP_IDLE__\nintent=startup_in_progress reason=\"stale\" source=peer updated_at=" + now + "\n", nil
			default:
				return "ok", nil
			}
		}
		orch := newOrch(t, cfg, run)
		// Stale startup intent must not block, and must be reset to normal.
		if err := orch.TestHookGuardPeerStartupIntents(context.Background()); err != nil {
			t.Fatalf("expected stale startup intent to be cleared, got %v", err)
		}
		if !cleared {
			t.Fatalf("expected stale peer intent clear command to run")
		}
	})

	t.Run("verifyEtcdSnapshot strict validation branches", func(t *testing.T) {
		// Each case stubs the remote stat / sha256sum / snapshot-list commands
		// to force one specific validation failure; wantPart is the substring
		// the resulting error must contain.
		cases := []struct {
			name     string
			statOut  string
			statErr  error
			lsOut    string
			lsErr    error
			shaOut   string
			shaErr   error
			path     string
			wantPart string
		}{
			{name: "empty-path", path: " ", wantPart: "snapshot path is empty"},
			{name: "stat-error", path: "/snap", statErr: fmt.Errorf("stat failed"), wantPart: "verification failed"},
			{name: "size-parse-error", path: "/snap", statOut: "abc", wantPart: "parse size"},
			{name: "too-small", path: "/snap", statOut: "64", wantPart: "snapshot too small"},
			{name: "missing-in-list", path: "/snap", statOut: "2097152", lsOut: "/other", shaOut: strings.Repeat("a", 64), wantPart: "not present"},
			{name: "bad-sha", path: "/snap", statOut: "2097152", lsOut: "/snap", shaOut: "short", wantPart: "invalid sha256"},
		}
		for _, tc := range cases {
			tc := tc // capture per-iteration value for the parallel-safe closure (pre-1.22 semantics)
			t.Run(tc.name, func(t *testing.T) {
				run := func(_ context.Context, _ time.Duration, name string, args ...string) (string, error) {
					command := name + " " + strings.Join(args, " ")
					switch {
					case name == "ssh" && strings.Contains(command, "stat -c %s"):
						return tc.statOut, tc.statErr
					case name == "ssh" && strings.Contains(command, "sha256sum"):
						return tc.shaOut, tc.shaErr
					case name == "ssh" && strings.Contains(command, "k3s etcd-snapshot ls"):
						return tc.lsOut, tc.lsErr
					default:
						return "ok", nil
					}
				}
				orch := newOrch(t, cfg, run)
				err := orch.TestHookVerifyEtcdSnapshot(context.Background(), "titan-db", tc.path)
				if err == nil || !strings.Contains(err.Error(), tc.wantPart) {
					t.Fatalf("expected %q error, got %v", tc.wantPart, err)
				}
			})
		}
	})
}