package orchestrator
|
||
|
|
|
||
|
|
import (
|
||
|
|
"context"
|
||
|
|
"errors"
|
||
|
|
"io"
|
||
|
|
"log"
|
||
|
|
"net"
|
||
|
|
"os"
|
||
|
|
"strconv"
|
||
|
|
"strings"
|
||
|
|
"testing"
|
||
|
|
"time"
|
||
|
|
|
||
|
|
"scm.bstein.dev/bstein/ananke/internal/cluster"
|
||
|
|
"scm.bstein.dev/bstein/ananke/internal/config"
|
||
|
|
"scm.bstein.dev/bstein/ananke/internal/execx"
|
||
|
|
"scm.bstein.dev/bstein/ananke/internal/state"
|
||
|
|
)
|
||
|
|
|
||
|
|
// newLifecycleSaturationOrchestrator runs one orchestration or CLI step.
|
||
|
|
// Signature: newLifecycleSaturationOrchestrator(t *testing.T, cfg config.Config, run commandOverride) *cluster.Orchestrator.
|
||
|
|
// Why: lifecycle branch saturation needs deterministic command behavior while preserving real intent/lock/file semantics.
|
||
|
|
func newLifecycleSaturationOrchestrator(
|
||
|
|
t *testing.T,
|
||
|
|
cfg config.Config,
|
||
|
|
run func(context.Context, time.Duration, string, ...string) (string, error),
|
||
|
|
) *cluster.Orchestrator {
|
||
|
|
t.Helper()
|
||
|
|
if run == nil {
|
||
|
|
run = lifecycleDispatcher(&commandRecorder{})
|
||
|
|
}
|
||
|
|
orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
|
||
|
|
orch.SetCommandOverrides(run, run)
|
||
|
|
return orch
|
||
|
|
}
|
||
|
|
|
||
|
|
// TestLifecycleStartupBranchSaturation drives Startup through its main
// error/safety branches: unreadable intent state, an active shutdown intent,
// cooldown cancellation, API-probe failure with and without auto-restore, and
// a failed local bootstrap apply.
//
// Why: drives startup through its main error/safety branches so lifecycle coverage
// reflects realistic drill failure modes.
func TestLifecycleStartupBranchSaturation(t *testing.T) {
	t.Run("read-intent-error-branch", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// Seed a valid intent file first so the failure below is caused by the
		// path swap, not by a missing file.
		if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
			State:     state.IntentNormal,
			Reason:    "seed",
			Source:    "test",
			UpdatedAt: time.Now().UTC(),
		}); err != nil {
			t.Fatalf("seed intent: %v", err)
		}
		// Replace intent file with directory so ReadIntent fails.
		if err := osRemove(cfg.State.IntentPath); err != nil {
			t.Fatalf("remove intent file: %v", err)
		}
		if err := osMkdir(cfg.State.IntentPath); err != nil {
			t.Fatalf("make intent dir: %v", err)
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, nil)
		if err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "intent-read-error"}); err == nil {
			t.Fatalf("expected startup to fail when intent path is unreadable")
		}
	})

	t.Run("fresh-shutdown-intent-blocks", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// A freshly written shutting-down intent must block startup outright.
		if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
			State:     state.IntentShuttingDown,
			Reason:    "active-shutdown",
			Source:    "test",
			UpdatedAt: time.Now().UTC(),
		}); err != nil {
			t.Fatalf("write shutdown intent: %v", err)
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, nil)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "blocked-by-shutdown"})
		if err == nil || !strings.Contains(err.Error(), "startup blocked: shutdown intent is active") {
			t.Fatalf("expected active shutdown intent block, got %v", err)
		}
	})

	t.Run("cooldown-cancel-branch", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// 20s cooldown vs. a 20ms cancel: Startup should still be inside the
		// cooldown wait when the context is canceled.
		cfg.Startup.ShutdownCooldownSeconds = 20
		if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
			State:     state.IntentShutdownComplete,
			Reason:    "just-finished",
			Source:    "test",
			UpdatedAt: time.Now().UTC(),
		}); err != nil {
			t.Fatalf("write cooldown intent: %v", err)
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, nil)
		ctx, cancel := context.WithCancel(context.Background())
		go func() {
			time.Sleep(20 * time.Millisecond)
			cancel()
		}()
		err := orch.Startup(ctx, cluster.StartupOptions{Reason: "cooldown-cancel"})
		if err == nil || !strings.Contains(err.Error(), "startup canceled while waiting for shutdown cooldown") {
			t.Fatalf("expected cooldown cancel branch, got %v", err)
		}
	})

	t.Run("api-failure-without-restore", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.AutoEtcdRestoreOnAPIFailure = false
		// Fail only the kubectl API probe; everything else delegates to the
		// default dispatcher so the rest of startup proceeds normally.
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "version --request-timeout=5s") {
				return "", errors.New("apiserver down")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, run)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "api-fail-no-restore"})
		if err == nil || !strings.Contains(err.Error(), "kubernetes API did not become reachable") {
			t.Fatalf("expected api wait failure, got %v", err)
		}
	})

	t.Run("api-failure-restore-not-applicable-retries", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.AutoEtcdRestoreOnAPIFailure = true
		cfg.Startup.EtcdRestoreControlPlane = "titan-db"
		// Real listener so the advertised external datastore endpoint below is
		// actually dialable — presumably the restore-applicability check probes
		// it; confirm against the cluster package.
		l, err := net.Listen("tcp", "127.0.0.1:0")
		if err != nil {
			t.Fatalf("open local datastore listener: %v", err)
		}
		defer l.Close()
		port := l.Addr().(*net.TCPAddr).Port
		attempt := 0
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
				// First probe fails, later probes succeed — exercises the
				// retry-after-restore-not-applicable path.
				attempt++
				if attempt <= 1 {
					return "", errors.New("apiserver down")
				}
				return "v1.31.0", nil
			case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
				// Unit file advertises an external postgres datastore, so an
				// etcd snapshot restore is not applicable to this node.
				return "ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://127.0.0.1:" + strconv.Itoa(port) + "/k3s", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, run)
		if err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "api-restore-not-applicable"}); err != nil {
			t.Fatalf("expected startup success after retry, got %v", err)
		}
	})

	t.Run("bootstrap-required-and-cache-missing-fails", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// Empty repo dir + configured bootstrap paths forces the local
		// bootstrap branch; every apply avenue below is made to fail.
		cfg.IACRepoPath = t.TempDir()
		cfg.LocalBootstrapPaths = []string{"services/bootstrap"}
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "jsonpath={.status.conditions[?(@.type==\"Ready\")].status}"):
				// Flux readiness probe fails, making bootstrap required.
				return "", errors.New("flux source unavailable")
			case name == "kubectl" && strings.Contains(command, " apply -k "):
				return "", errors.New("apply failed")
			case name == "sh" && strings.Contains(command, "kubectl kustomize"):
				return "", errors.New("fallback failed")
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, run)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "bootstrap-required"})
		if err == nil || !strings.Contains(err.Error(), "local bootstrap apply failed") {
			t.Fatalf("expected bootstrap failure, got %v", err)
		}
	})
}
|
||
|
|
|
||
|
|
// TestLifecycleEtcdRestoreAndShutdownBranchSaturation exercises EtcdRestore
// input validation, the unmanaged-node guard, the restore-command failure
// branch, and Shutdown's mode validation.
//
// Why: covers restore/shutdown branch paths that are difficult to hit from a single happy-path drill.
func TestLifecycleEtcdRestoreAndShutdownBranchSaturation(t *testing.T) {
	t.Run("etcd-restore-input-validation", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// No control planes configured and none requested in options.
		cfg.ControlPlanes = nil
		orch := newLifecycleSaturationOrchestrator(t, cfg, nil)
		if err := orch.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{}); err == nil {
			t.Fatalf("expected restore error with no control planes")
		}
	})

	t.Run("etcd-restore-unmanaged-node", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// "titan-db" is deliberately absent from the SSH-managed set, so the
		// restore must be refused before any command runs.
		cfg.SSHManagedNodes = []string{"titan-23"}
		orch := newLifecycleSaturationOrchestrator(t, cfg, nil)
		if err := orch.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{ControlPlane: "titan-db"}); err == nil {
			t.Fatalf("expected unmanaged control plane restore error")
		}
	})

	t.Run("etcd-restore-command-failure", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// Satisfy every preflight SSH probe (unit file, snapshot listing,
		// size, checksum) and fail only the cluster-reset step itself.
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
				// No --datastore-endpoint flag, so an embedded-etcd restore
				// proceeds.
				return "ExecStart=/usr/local/bin/k3s server", nil
			case name == "ssh" && strings.Contains(command, "etcd-snapshot ls"):
				return "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown", nil
			case name == "ssh" && strings.Contains(command, "stat -c %s"):
				// 2 MiB snapshot — presumably above a minimum-size sanity
				// threshold; confirm in cluster package.
				return "2097152", nil
			case name == "ssh" && strings.Contains(command, "sha256sum"):
				// Well-formed 64-hex-char digest so checksum parsing succeeds.
				return "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", nil
			case name == "ssh" && strings.Contains(command, "server --cluster-reset"):
				return "", errors.New("cluster reset failed")
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, run)
		err := orch.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{ControlPlane: "titan-db"})
		if err == nil || !strings.Contains(err.Error(), "etcd restore command failed") {
			t.Fatalf("expected restore command failure branch, got %v", err)
		}
	})

	t.Run("shutdown-invalid-mode", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		orch := newLifecycleSaturationOrchestrator(t, cfg, nil)
		// An unknown Mode must be rejected before any shutdown work starts.
		if err := orch.Shutdown(context.Background(), cluster.ShutdownOptions{Reason: "bad-mode", Mode: "invalid"}); err == nil {
			t.Fatalf("expected shutdown mode validation error")
		}
	})
}
|
||
|
|
|
||
|
|
// osRemove deletes the file (or empty directory) at path via os.Remove and
// returns its error unmodified.
//
// Why: keeps error handling explicit in lifecycle branch tests without repeated ignore logic.
func osRemove(path string) error {
	return os.Remove(path)
}
|
||
|
|
|
||
|
|
// osMkdir creates a single directory at path with mode 0o755 via os.Mkdir and
// returns its error unmodified (it fails if the parent is missing or the path
// already exists).
//
// Why: keeps branch setup concise in lifecycle branch tests.
func osMkdir(path string) error {
	return os.Mkdir(path, 0o755)
}
|