package orchestrator

import (
	"context"
	"errors"
	"net"
	"os"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/state"
)

// lifecycleFastConfig returns a lifecycle test config tuned for fast failure paths.
// Signature: lifecycleFastConfig(t *testing.T) config.Config.
// Why: lifecycle gap tests intentionally fail early in many branches, so short
// wait windows keep branch coverage runs fast and deterministic.
func lifecycleFastConfig(t *testing.T) config.Config {
	t.Helper()
	cfg := lifecycleConfig(t)
	cfg.Startup.APIWaitSeconds = 1
	cfg.Startup.APIPollSeconds = 1
	cfg.Startup.NodeInventoryReachWaitSeconds = 1
	cfg.Startup.NodeInventoryReachPollSeconds = 1
	cfg.Startup.RequireCriticalServiceEndpoints = false
	cfg.Startup.RequireFluxHealth = false
	cfg.Startup.RequireWorkloadConvergence = false
	cfg.Startup.RequireIngressChecklist = false
	cfg.Startup.RequireServiceChecklist = false
	cfg.Startup.ServiceChecklistStabilitySec = 0
	cfg.Startup.RequirePostStartProbes = false
	return cfg
}

// TestLifecycleDeepFailureMatrix drives Startup through deep failure branches.
// Signature: TestLifecycleDeepFailureMatrix(t *testing.T).
// Why: saturates remaining lifecycle startup/shutdown edge branches that are
// difficult to hit from happy-path drill tests.
func TestLifecycleDeepFailureMatrix(t *testing.T) {
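	// Using an existing directory as the state lock path must make Startup fail outright.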
t.Run("startup-lock-path-is-directory", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
cfg.State.LockPath = t.TempDir()
|
|
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
|
if err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "lock-dir"}); err == nil {
|
|
t.Fatalf("expected lock-path directory failure")
|
|
}
|
|
})
|
|
|
|
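	// SSHPort 70000 is outside the valid TCP port range, so node inventory
	// preflight is expected to reject the config.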
t.Run("startup-node-inventory-validation-fails", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
cfg.SSHPort = 70000
|
|
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "bad-inventory"})
|
|
if err == nil || !strings.Contains(err.Error(), "node inventory preflight failed") {
|
|
t.Fatalf("expected node inventory preflight failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
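	// Every ssh reachability probe (tagged __ANANKE_NODE_REACHABLE__) is forced
	// to fail, so the inventory reachability gate should report an error.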
t.Run("startup-node-reachability-fails", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
if name == "ssh" && strings.Contains(command, "__ANANKE_NODE_REACHABLE__") {
|
|
return "", errors.New("no route to host")
|
|
}
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "reachability-fail"})
|
|
if err == nil || !strings.Contains(err.Error(), "node inventory reachability gate") {
|
|
t.Fatalf("expected reachability gate failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
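	// A two-hour-old startup-in-progress intent is seeded, and the write hook
	// rejects only the auto-clear transition back to normal, so Startup should
	// surface the "clear stale startup intent" failure.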
t.Run("startup-clear-stale-startup-intent-write-fails", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
|
|
State: state.IntentStartupInProgress,
|
|
Reason: "stale",
|
|
Source: "test",
|
|
UpdatedAt: time.Now().UTC().Add(-2 * time.Hour),
|
|
}); err != nil {
|
|
t.Fatalf("seed stale startup intent: %v", err)
|
|
}
|
|
restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
|
|
if path == cfg.State.IntentPath && in.State == state.IntentNormal && strings.Contains(strings.ToLower(in.Reason), "auto-clear stale startup intent") {
|
|
return errors.New("forced intent clear failure")
|
|
}
|
|
return state.TestHookWriteIntentDefault(path, in)
|
|
})
|
|
t.Cleanup(restoreWrite)
|
|
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "clear-stale-startup"})
|
|
if err == nil || !strings.Contains(err.Error(), "clear stale startup intent") {
|
|
t.Fatalf("expected stale startup clear failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
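	// Same pattern as above, but the stale record is a shutting-down intent and
	// the rejected write is its auto-clear back to normal.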
t.Run("startup-clear-stale-shutdown-intent-write-fails", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
|
|
State: state.IntentShuttingDown,
|
|
Reason: "stale-shutdown",
|
|
Source: "test",
|
|
UpdatedAt: time.Now().UTC().Add(-2 * time.Hour),
|
|
}); err != nil {
|
|
t.Fatalf("seed stale shutdown intent: %v", err)
|
|
}
|
|
restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
|
|
if path == cfg.State.IntentPath && in.State == state.IntentNormal && strings.Contains(strings.ToLower(in.Reason), "auto-clear stale shutdown intent") {
|
|
return errors.New("forced intent clear failure")
|
|
}
|
|
return state.TestHookWriteIntentDefault(path, in)
|
|
})
|
|
t.Cleanup(restoreWrite)
|
|
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "clear-stale-shutdown"})
|
|
if err == nil || !strings.Contains(err.Error(), "clear stale shutdown intent") {
|
|
t.Fatalf("expected stale shutdown clear failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
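	// A fresh shutdown-complete intent triggers the cooldown wait; midway
	// through, the intent file is replaced with a directory so the re-read
	// after the wait fails.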
t.Run("startup-cooldown-reread-intent-error", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
cfg.Startup.RequireNodeInventoryReach = false
|
|
cfg.Startup.ShutdownCooldownSeconds = 5
|
|
if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
|
|
State: state.IntentShutdownComplete,
|
|
Reason: "recent",
|
|
Source: "test",
|
|
UpdatedAt: time.Now().UTC(),
|
|
}); err != nil {
|
|
t.Fatalf("seed cooldown intent: %v", err)
|
|
}
|
|
go func(intentPath string) {
|
|
time.Sleep(2 * time.Second)
|
|
_ = os.Remove(intentPath)
|
|
_ = os.Mkdir(intentPath, 0o755)
|
|
}(cfg.State.IntentPath)
|
|
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-reread"})
|
|
if err == nil || !strings.Contains(err.Error(), "re-read startup intent after cooldown wait") {
|
|
t.Fatalf("expected cooldown reread failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
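	// During the cooldown wait a background writer flips the intent to
	// shutting-down, which Startup must treat as an active shutdown and abort.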
t.Run("startup-cooldown-shutdown-became-active", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
cfg.Startup.RequireNodeInventoryReach = false
|
|
cfg.Startup.ShutdownCooldownSeconds = 5
|
|
if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
|
|
State: state.IntentShutdownComplete,
|
|
Reason: "recent",
|
|
Source: "test",
|
|
UpdatedAt: time.Now().UTC(),
|
|
}); err != nil {
|
|
t.Fatalf("seed cooldown intent: %v", err)
|
|
}
|
|
go func(intentPath string) {
|
|
time.Sleep(2 * time.Second)
|
|
_ = state.WriteIntent(intentPath, state.Intent{
|
|
State: state.IntentShuttingDown,
|
|
Reason: "peer-shutdown",
|
|
Source: "test",
|
|
UpdatedAt: time.Now().UTC(),
|
|
})
|
|
}(cfg.State.IntentPath)
|
|
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-peer-active"})
|
|
if err == nil || !strings.Contains(err.Error(), "shutdown intent became active during cooldown wait") {
|
|
t.Fatalf("expected cooldown active-shutdown failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
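	// The write hook rejects only the transition to startup-in-progress, so the
	// "set startup intent" step itself is what fails.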
t.Run("startup-set-intent-write-fails", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
|
|
State: state.IntentNormal,
|
|
Reason: "seed",
|
|
Source: "test",
|
|
UpdatedAt: time.Now().UTC(),
|
|
}); err != nil {
|
|
t.Fatalf("seed normal intent: %v", err)
|
|
}
|
|
restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
|
|
if path == cfg.State.IntentPath && in.State == state.IntentStartupInProgress {
|
|
return errors.New("forced startup intent write failure")
|
|
}
|
|
return state.TestHookWriteIntentDefault(path, in)
|
|
})
|
|
t.Cleanup(restoreWrite)
|
|
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "intent-write-fail"})
|
|
if err == nil || !strings.Contains(err.Error(), "set startup intent") {
|
|
t.Fatalf("expected startup intent write failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
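	// Enables the time-sync gate with one-second wait/poll windows so its
	// failure propagates out of Startup quickly.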
t.Run("startup-timesync-error-propagates", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
cfg.Startup.RequireTimeSync = true
|
|
cfg.Startup.TimeSyncWaitSeconds = 1
|
|
cfg.Startup.TimeSyncPollSeconds = 1
|
|
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "timesync-fail"})
|
|
if err == nil || !strings.Contains(err.Error(), "time sync") {
|
|
t.Fatalf("expected time sync failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
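	// With the context cancelled up front and the k3s unit advertising an
	// external postgres datastore, Startup is expected to surface
	// context.Canceled from the datastore preflight.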
t.Run("startup-datastore-preflight-cancel", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
if name == "ssh" && strings.Contains(command, "systemctl cat k3s") {
|
|
return "ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://127.0.0.1:1/k3s", nil
|
|
}
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
cancel()
|
|
err := orch.Startup(ctx, cluster.StartupOptions{Reason: "datastore-cancel"})
|
|
if !errors.Is(err, context.Canceled) {
|
|
t.Fatalf("expected canceled datastore preflight, got %v", err)
|
|
}
|
|
})
|
|
|
|
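	// The API probe stays down, a usable etcd snapshot is advertised, and the
	// cluster-reset command is forced to fail, so the automatic restore path
	// must report a hard failure.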
t.Run("startup-auto-etcd-restore-hard-failure", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
cfg.Startup.AutoEtcdRestoreOnAPIFailure = true
|
|
cfg.Startup.EtcdRestoreControlPlane = "titan-db"
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
switch {
|
|
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
|
|
return "", errors.New("api down")
|
|
case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
|
|
return "ExecStart=/usr/local/bin/k3s server", nil
|
|
case name == "ssh" && strings.Contains(command, "etcd-snapshot ls"):
|
|
return "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown", nil
|
|
case name == "ssh" && strings.Contains(command, "stat -c %s"):
|
|
return "2097152", nil
|
|
case name == "ssh" && strings.Contains(command, "sha256sum"):
|
|
return "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", nil
|
|
case name == "ssh" && strings.Contains(command, "server --cluster-reset"):
|
|
return "", errors.New("cluster reset failed")
|
|
default:
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
}
|
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "restore-hard-fail"})
|
|
if err == nil || !strings.Contains(err.Error(), "automatic etcd restore failed") {
|
|
t.Fatalf("expected automatic etcd restore failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
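	// Here k3s reports an external postgres datastore that is actually
	// listening locally, so an etcd restore is not applicable and the original
	// API failure is what Startup reports.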
t.Run("startup-auto-etcd-restore-not-applicable-then-api-still-down", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
cfg.Startup.AutoEtcdRestoreOnAPIFailure = true
|
|
cfg.Startup.EtcdRestoreControlPlane = "titan-db"
|
|
listener, err := net.Listen("tcp", "127.0.0.1:0")
|
|
if err != nil {
|
|
t.Fatalf("open local datastore listener: %v", err)
|
|
}
|
|
defer listener.Close()
|
|
address := listener.Addr().String()
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
switch {
|
|
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
|
|
return "", errors.New("api still down")
|
|
case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
|
|
return "ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://" + address + "/k3s", nil
|
|
default:
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
}
|
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
err = orch.Startup(context.Background(), cluster.StartupOptions{Reason: "restore-not-applicable"})
|
|
if err == nil || !strings.Contains(err.Error(), "after automatic etcd restore") {
|
|
t.Fatalf("expected api failure after not-applicable restore path, got %v", err)
|
|
}
|
|
})
|
|
|
|
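	// The kubectl label command for titan-db is denied, so ensuring required
	// node labels should fail.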
t.Run("startup-required-node-labels-error", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
|
|
"titan-db": {"topology.kubernetes.io/zone": "lab-a"},
|
|
}
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
if name == "kubectl" && strings.Contains(command, "label node titan-db --overwrite") {
|
|
return "", errors.New("label denied")
|
|
}
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "label-fail"})
|
|
if err == nil || !strings.Contains(err.Error(), "ensure required node labels") {
|
|
t.Fatalf("expected required-node-label failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
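	// With no statically configured workers, this test denies kubectl get nodes
	// and expects worker discovery to fail.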
t.Run("startup-worker-discovery-error", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
cfg.Workers = nil
|
|
cfg.SSHManagedNodes = []string{"titan-db"}
|
|
cfg.SSHNodeHosts = map[string]string{
|
|
"titan-db": "titan-db",
|
|
}
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
if name == "kubectl" && len(args) >= 2 && args[0] == "get" && args[1] == "nodes" {
|
|
return "", errors.New("nodes denied")
|
|
}
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "worker-discovery-fail"})
|
|
if err == nil || !strings.Contains(err.Error(), "discover workers") {
|
|
t.Fatalf("expected worker-discovery failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
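	// Longhorn nodes look healthy, but the malformed "bad-entry" PVC reference
	// should be rejected as an invalid storage_critical_pvcs entry.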
t.Run("startup-storage-readiness-error", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
cfg.Startup.RequireStorageReady = true
|
|
cfg.Startup.StorageReadyWaitSeconds = 1
|
|
cfg.Startup.StorageReadyPollSeconds = 1
|
|
cfg.Startup.StorageMinReadyNodes = 1
|
|
cfg.Startup.StorageCriticalPVCs = []string{"bad-entry"}
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
if name == "kubectl" && strings.Contains(command, "nodes.longhorn.io") {
|
|
return "a:True:True\n", nil
|
|
}
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "storage-fail"})
|
|
if err == nil || !strings.Contains(err.Error(), "invalid storage_critical_pvcs entry") {
|
|
t.Fatalf("expected storage readiness failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
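	// Scaling the flux-system source-controller deployment is denied, so the
	// critical-workload scale step should fail.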
t.Run("startup-critical-workload-scale-error", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
if name == "kubectl" && strings.Contains(command, "scale deployment source-controller") {
|
|
return "", errors.New("scale denied")
|
|
}
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "critical-scale-fail"})
|
|
if err == nil || !strings.Contains(err.Error(), "scale flux-system/deployment/source-controller") {
|
|
t.Fatalf("expected critical-workload scale failure, got %v", err)
|
|
}
|
|
})
|
|
|
|
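	// Listing Flux kustomizations is denied, so resuming Flux should surface
	// the underlying kubectl error.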
t.Run("startup-flux-resume-failure", func(t *testing.T) {
|
|
cfg := lifecycleFastConfig(t)
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
if name == "kubectl" &&
|
|
strings.Contains(command, "-n flux-system get kustomizations.kustomize.toolkit.fluxcd.io") &&
|
|
strings.Contains(command, "jsonpath={range .items[*]}{.metadata.name}") {
|
|
return "", errors.New("kustomization list denied")
|
|
}
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
|
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "resume-fail"})
|
|
if err == nil || !strings.Contains(err.Error(), "kustomization list denied") {
|
|
t.Fatalf("expected flux-resume failure, got %v", err)
|
|
}
|
|
})
|
|
}
|