// drill_lifecycle_branch_saturation_test.go — lifecycle branch saturation tests
// for the Ananke orchestrator (intent handling, shutdown cooldown, API wait,
// etcd restore, and shutdown mode validation).
package orchestrator
import (
"context"
"errors"
"io"
"log"
"net"
"os"
"strconv"
"strings"
"testing"
"time"
"scm.bstein.dev/bstein/ananke/internal/cluster"
"scm.bstein.dev/bstein/ananke/internal/config"
"scm.bstein.dev/bstein/ananke/internal/execx"
"scm.bstein.dev/bstein/ananke/internal/state"
)
// newLifecycleSaturationOrchestrator builds a cluster.Orchestrator wired for
// deterministic lifecycle branch tests: commands are routed through the given
// override (or a default lifecycleDispatcher when run is nil) while real
// intent/lock/file semantics are preserved via the state store.
func newLifecycleSaturationOrchestrator(
	t *testing.T,
	cfg config.Config,
	run func(context.Context, time.Duration, string, ...string) (string, error),
) *cluster.Orchestrator {
	t.Helper()
	override := run
	if override == nil {
		// Default to the recorded dispatcher so commands succeed deterministically.
		override = lifecycleDispatcher(&commandRecorder{})
	}
	runner := &execx.Runner{DryRun: false}
	store := state.New(cfg.State.RunHistoryPath)
	logger := log.New(io.Discard, "", 0)
	orch := cluster.New(cfg, runner, store, logger)
	orch.SetCommandOverrides(override, override)
	return orch
}
// TestLifecycleStartupBranchSaturation drives Startup through its main
// error/safety branches so lifecycle coverage reflects realistic drill failure
// modes. Subtests cover: an unreadable intent path, an active shutdown intent,
// context cancellation during the shutdown cooldown wait, API wait failure
// with auto-restore disabled, a retried API check that succeeds when restore
// is not applicable, and a required local bootstrap whose apply and fallback
// both fail.
func TestLifecycleStartupBranchSaturation(t *testing.T) {
	t.Run("read-intent-error-branch", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// Seed a well-formed intent first so the path exists before it is broken below.
		if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
			State:     state.IntentNormal,
			Reason:    "seed",
			Source:    "test",
			UpdatedAt: time.Now().UTC(),
		}); err != nil {
			t.Fatalf("seed intent: %v", err)
		}
		// Replace intent file with directory so ReadIntent fails.
		if err := osRemove(cfg.State.IntentPath); err != nil {
			t.Fatalf("remove intent file: %v", err)
		}
		if err := osMkdir(cfg.State.IntentPath); err != nil {
			t.Fatalf("make intent dir: %v", err)
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, nil)
		if err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "intent-read-error"}); err == nil {
			t.Fatalf("expected startup to fail when intent path is unreadable")
		}
	})
	t.Run("fresh-shutdown-intent-blocks", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// A freshly written shutting-down intent must block any new startup attempt.
		if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
			State:     state.IntentShuttingDown,
			Reason:    "active-shutdown",
			Source:    "test",
			UpdatedAt: time.Now().UTC(),
		}); err != nil {
			t.Fatalf("write shutdown intent: %v", err)
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, nil)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "blocked-by-shutdown"})
		if err == nil || !strings.Contains(err.Error(), "startup blocked: shutdown intent is active") {
			t.Fatalf("expected active shutdown intent block, got %v", err)
		}
	})
	t.Run("cooldown-cancel-branch", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// A 20s cooldown guarantees Startup is still waiting when the context
		// is canceled ~20ms in, forcing the cancel branch rather than a race.
		cfg.Startup.ShutdownCooldownSeconds = 20
		if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
			State:     state.IntentShutdownComplete,
			Reason:    "just-finished",
			Source:    "test",
			UpdatedAt: time.Now().UTC(),
		}); err != nil {
			t.Fatalf("write cooldown intent: %v", err)
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, nil)
		ctx, cancel := context.WithCancel(context.Background())
		go func() {
			time.Sleep(20 * time.Millisecond)
			cancel()
		}()
		err := orch.Startup(ctx, cluster.StartupOptions{Reason: "cooldown-cancel"})
		if err == nil || !strings.Contains(err.Error(), "startup canceled while waiting for shutdown cooldown") {
			t.Fatalf("expected cooldown cancel branch, got %v", err)
		}
	})
	t.Run("api-failure-without-restore", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.AutoEtcdRestoreOnAPIFailure = false
		// Fail every kubectl version probe so the API wait exhausts without restore.
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "version --request-timeout=5s") {
				return "", errors.New("apiserver down")
			}
			// All other commands fall through to the default recorded dispatcher.
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, run)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "api-fail-no-restore"})
		if err == nil || !strings.Contains(err.Error(), "kubernetes API did not become reachable") {
			t.Fatalf("expected api wait failure, got %v", err)
		}
	})
	t.Run("api-failure-restore-not-applicable-retries", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.AutoEtcdRestoreOnAPIFailure = true
		cfg.Startup.EtcdRestoreControlPlane = "titan-db"
		// Local listener stands in for an external postgres datastore; an external
		// datastore presumably makes etcd restore "not applicable" so Startup
		// retries the API wait instead — confirm against the cluster package.
		l, err := net.Listen("tcp", "127.0.0.1:0")
		if err != nil {
			t.Fatalf("open local datastore listener: %v", err)
		}
		defer l.Close()
		port := l.Addr().(*net.TCPAddr).Port
		attempt := 0
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
				// First probe fails, second succeeds: exercises the retry path.
				attempt++
				if attempt <= 1 {
					return "", errors.New("apiserver down")
				}
				return "v1.31.0", nil
			case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
				// Advertise the external datastore endpoint pointing at our listener.
				return "ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://127.0.0.1:" + strconv.Itoa(port) + "/k3s", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, run)
		if err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "api-restore-not-applicable"}); err != nil {
			t.Fatalf("expected startup success after retry, got %v", err)
		}
	})
	t.Run("bootstrap-required-and-cache-missing-fails", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// Empty IaC repo plus a configured bootstrap path makes local bootstrap required.
		cfg.IACRepoPath = t.TempDir()
		cfg.LocalBootstrapPaths = []string{"services/bootstrap"}
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "jsonpath={.status.conditions[?(@.type==\"Ready\")].status}"):
				// Flux readiness probe fails, so startup must attempt local bootstrap.
				return "", errors.New("flux source unavailable")
			case name == "kubectl" && strings.Contains(command, " apply -k "):
				return "", errors.New("apply failed")
			case name == "sh" && strings.Contains(command, "kubectl kustomize"):
				// Fallback render path also fails, saturating the error branch.
				return "", errors.New("fallback failed")
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch := newLifecycleSaturationOrchestrator(t, cfg, run)
		err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "bootstrap-required"})
		if err == nil || !strings.Contains(err.Error(), "local bootstrap apply failed") {
			t.Fatalf("expected bootstrap failure, got %v", err)
		}
	})
}
// TestLifecycleEtcdRestoreAndShutdownBranchSaturation covers restore and
// shutdown branch paths that are difficult to hit from a single happy-path
// drill: restore input validation, unmanaged control-plane rejection, a
// failing cluster-reset command, and shutdown mode validation.
func TestLifecycleEtcdRestoreAndShutdownBranchSaturation(t *testing.T) {
	t.Run("etcd-restore-input-validation", func(t *testing.T) {
		conf := lifecycleConfig(t)
		conf.ControlPlanes = nil
		orch := newLifecycleSaturationOrchestrator(t, conf, nil)
		restoreErr := orch.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{})
		if restoreErr == nil {
			t.Fatalf("expected restore error with no control planes")
		}
	})
	t.Run("etcd-restore-unmanaged-node", func(t *testing.T) {
		conf := lifecycleConfig(t)
		// Only titan-23 is SSH-managed, so restoring titan-db must be rejected.
		conf.SSHManagedNodes = []string{"titan-23"}
		orch := newLifecycleSaturationOrchestrator(t, conf, nil)
		restoreErr := orch.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{ControlPlane: "titan-db"})
		if restoreErr == nil {
			t.Fatalf("expected unmanaged control plane restore error")
		}
	})
	t.Run("etcd-restore-command-failure", func(t *testing.T) {
		conf := lifecycleConfig(t)
		override := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			joined := name + " " + strings.Join(args, " ")
			overSSH := name == "ssh"
			switch {
			case overSSH && strings.Contains(joined, "systemctl cat k3s"):
				// Embedded-etcd unit (no external datastore flag) so restore proceeds.
				return "ExecStart=/usr/local/bin/k3s server", nil
			case overSSH && strings.Contains(joined, "etcd-snapshot ls"):
				return "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown", nil
			case overSSH && strings.Contains(joined, "stat -c %s"):
				return "2097152", nil
			case overSSH && strings.Contains(joined, "sha256sum"):
				return "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", nil
			case overSSH && strings.Contains(joined, "server --cluster-reset"):
				// The actual restore invocation fails, exercising the error branch.
				return "", errors.New("cluster reset failed")
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch := newLifecycleSaturationOrchestrator(t, conf, override)
		restoreErr := orch.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{ControlPlane: "titan-db"})
		if restoreErr == nil || !strings.Contains(restoreErr.Error(), "etcd restore command failed") {
			t.Fatalf("expected restore command failure branch, got %v", restoreErr)
		}
	})
	t.Run("shutdown-invalid-mode", func(t *testing.T) {
		conf := lifecycleConfig(t)
		orch := newLifecycleSaturationOrchestrator(t, conf, nil)
		opts := cluster.ShutdownOptions{Reason: "bad-mode", Mode: "invalid"}
		if err := orch.Shutdown(context.Background(), opts); err == nil {
			t.Fatalf("expected shutdown mode validation error")
		}
	})
}
// osRemove deletes the file or empty directory at path via os.Remove,
// surfacing the error so lifecycle branch tests handle it explicitly
// instead of repeating ignore logic.
func osRemove(path string) error {
	if err := os.Remove(path); err != nil {
		return err
	}
	return nil
}
// osMkdir creates a single directory at path with 0o755 permissions,
// keeping branch setup concise in lifecycle branch tests.
func osMkdir(path string) error {
	const dirPerm = 0o755
	return os.Mkdir(path, dirPerm)
}