// ananke/testing/orchestrator/hooks_lifecycle_deep_matrix_test.go

package orchestrator

import (
	"context"
	"errors"
	"net"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/state"
)

// lifecycleFastConfig returns a lifecycle config tuned for fast failure:
// short wait windows and optional startup gates disabled.
// Signature: lifecycleFastConfig(t *testing.T) config.Config.
// Why: lifecycle gap tests intentionally fail early in many branches, so short
// wait windows keep branch coverage runs fast and deterministic.
func lifecycleFastConfig(t *testing.T) config.Config {
	t.Helper()
	cfg := lifecycleConfig(t)
	cfg.Startup.APIWaitSeconds = 1
	cfg.Startup.APIPollSeconds = 1
	cfg.Startup.NodeInventoryReachWaitSeconds = 1
	cfg.Startup.NodeInventoryReachPollSeconds = 1
	cfg.Startup.RequireCriticalServiceEndpoints = false
	cfg.Startup.RequireFluxHealth = false
	cfg.Startup.RequireWorkloadConvergence = false
	cfg.Startup.RequireIngressChecklist = false
	cfg.Startup.RequireServiceChecklist = false
	cfg.Startup.ServiceChecklistStabilitySec = 0
	cfg.Startup.RequirePostStartProbes = false
	return cfg
}

// TestLifecycleDeepFailureMatrix drives Startup through a matrix of injected
// failures, one subtest per lifecycle branch.
// Signature: TestLifecycleDeepFailureMatrix(t *testing.T).
// Why: saturates remaining lifecycle startup/shutdown edge branches that are
// difficult to hit from happy-path drill tests.
func TestLifecycleDeepFailureMatrix(t *testing.T) {
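	// A lock path that is an existing directory cannot be used as a lock file,
	// so Startup should fail immediately.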
t.Run("startup-lock-path-is-directory", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
cfg.State.LockPath = t.TempDir()
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
if err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "lock-dir"}); err == nil {
t.Fatalf("expected lock-path directory failure")
}
})
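	// An SSH port outside the valid range makes node inventory validation fail
	// during preflight.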
t.Run("startup-node-inventory-validation-fails", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
cfg.SSHPort = 70000
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "bad-inventory"})
if err == nil || !strings.Contains(err.Error(), "node inventory preflight failed") {
t.Fatalf("expected node inventory preflight failure, got %v", err)
}
})
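	// The injected runner refuses the ssh reachability probe, tripping the
	// node inventory reachability gate.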
t.Run("startup-node-reachability-fails", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "ssh" && strings.Contains(command, "__ANANKE_NODE_REACHABLE__") {
return "", errors.New("no route to host")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "reachability-fail"})
if err == nil || !strings.Contains(err.Error(), "node inventory reachability gate") {
t.Fatalf("expected reachability gate failure, got %v", err)
}
})
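	// Seed a stale startup intent, then make the auto-clear write fail so
	// Startup surfaces the clear error.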
t.Run("startup-clear-stale-startup-intent-write-fails", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
State: state.IntentStartupInProgress,
Reason: "stale",
Source: "test",
UpdatedAt: time.Now().UTC().Add(-2 * time.Hour),
}); err != nil {
t.Fatalf("seed stale startup intent: %v", err)
}
restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
if path == cfg.State.IntentPath && in.State == state.IntentNormal && strings.Contains(strings.ToLower(in.Reason), "auto-clear stale startup intent") {
return errors.New("forced intent clear failure")
}
return state.TestHookWriteIntentDefault(path, in)
})
t.Cleanup(restoreWrite)
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "clear-stale-startup"})
if err == nil || !strings.Contains(err.Error(), "clear stale startup intent") {
t.Fatalf("expected stale startup clear failure, got %v", err)
}
})
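	// Same pattern for a stale shutdown intent: the auto-clear write is forced
	// to fail.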
t.Run("startup-clear-stale-shutdown-intent-write-fails", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
State: state.IntentShuttingDown,
Reason: "stale-shutdown",
Source: "test",
UpdatedAt: time.Now().UTC().Add(-2 * time.Hour),
}); err != nil {
t.Fatalf("seed stale shutdown intent: %v", err)
}
restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
if path == cfg.State.IntentPath && in.State == state.IntentNormal && strings.Contains(strings.ToLower(in.Reason), "auto-clear stale shutdown intent") {
return errors.New("forced intent clear failure")
}
return state.TestHookWriteIntentDefault(path, in)
})
t.Cleanup(restoreWrite)
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "clear-stale-shutdown"})
if err == nil || !strings.Contains(err.Error(), "clear stale shutdown intent") {
t.Fatalf("expected stale shutdown clear failure, got %v", err)
}
})
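	// First read reports a recent shutdown, triggering the cooldown wait; the
	// re-read after the wait then errors.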
t.Run("startup-cooldown-reread-intent-error", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
cfg.Startup.RequireNodeInventoryReach = false
cfg.Startup.ShutdownCooldownSeconds = 5
reads := 0
restoreRead := state.TestHookSetReadIntentOverride(func(path string) (state.Intent, error) {
if path != cfg.State.IntentPath {
return state.TestHookReadIntentDefault(path)
}
reads++
if reads == 1 {
return state.Intent{
State: state.IntentShutdownComplete,
Reason: "recent",
Source: "test",
UpdatedAt: time.Now().UTC().Add(-4 * time.Second),
}, nil
}
return state.Intent{}, errors.New("forced reread failure")
})
t.Cleanup(restoreRead)
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-reread"})
if err == nil || !strings.Contains(err.Error(), "re-read startup intent after cooldown wait") {
t.Fatalf("expected cooldown reread failure, got %v", err)
}
})
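	// Here the re-read after the cooldown wait reports an active shutdown,
	// which must abort startup.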
t.Run("startup-cooldown-shutdown-became-active", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
cfg.Startup.RequireNodeInventoryReach = false
cfg.Startup.ShutdownCooldownSeconds = 5
reads := 0
restoreRead := state.TestHookSetReadIntentOverride(func(path string) (state.Intent, error) {
if path != cfg.State.IntentPath {
return state.TestHookReadIntentDefault(path)
}
reads++
if reads == 1 {
return state.Intent{
State: state.IntentShutdownComplete,
Reason: "recent",
Source: "test",
UpdatedAt: time.Now().UTC().Add(-4 * time.Second),
}, nil
}
return state.Intent{
State: state.IntentShuttingDown,
Reason: "peer-shutdown",
Source: "test",
UpdatedAt: time.Now().UTC(),
}, nil
})
t.Cleanup(restoreRead)
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "cooldown-peer-active"})
if err == nil || !strings.Contains(err.Error(), "shutdown intent became active during cooldown wait") {
t.Fatalf("expected cooldown active-shutdown failure, got %v", err)
}
})
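	// The write of the startup-in-progress intent itself fails, so Startup
	// stops at "set startup intent".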
t.Run("startup-set-intent-write-fails", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
if err := state.WriteIntent(cfg.State.IntentPath, state.Intent{
State: state.IntentNormal,
Reason: "seed",
Source: "test",
UpdatedAt: time.Now().UTC(),
}); err != nil {
t.Fatalf("seed normal intent: %v", err)
}
restoreWrite := state.TestHookSetWriteIntentOverride(func(path string, in state.Intent) error {
if path == cfg.State.IntentPath && in.State == state.IntentStartupInProgress {
return errors.New("forced startup intent write failure")
}
return state.TestHookWriteIntentDefault(path, in)
})
t.Cleanup(restoreWrite)
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "intent-write-fail"})
if err == nil || !strings.Contains(err.Error(), "set startup intent") {
t.Fatalf("expected startup intent write failure, got %v", err)
}
})
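	// Time sync is required with one-second wait/poll windows, so the gate
	// times out quickly and its error propagates.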
t.Run("startup-timesync-error-propagates", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
cfg.Startup.RequireTimeSync = true
cfg.Startup.TimeSyncWaitSeconds = 1
cfg.Startup.TimeSyncPollSeconds = 1
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "timesync-fail"})
if err == nil || !strings.Contains(err.Error(), "time sync") {
t.Fatalf("expected time sync failure, got %v", err)
}
})
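	// The unit file advertises an external datastore; a pre-canceled context
	// makes the datastore preflight return context.Canceled.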
t.Run("startup-datastore-preflight-cancel", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "ssh" && strings.Contains(command, "systemctl cat k3s") {
return "ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://127.0.0.1:1/k3s", nil
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
ctx, cancel := context.WithCancel(context.Background())
cancel()
err := orch.Startup(ctx, cluster.StartupOptions{Reason: "datastore-cancel"})
if !errors.Is(err, context.Canceled) {
t.Fatalf("expected canceled datastore preflight, got %v", err)
}
})
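	// The API is down and a snapshot passes discovery and integrity checks,
	// but the cluster-reset step fails, producing a hard restore failure.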
t.Run("startup-auto-etcd-restore-hard-failure", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
cfg.Startup.AutoEtcdRestoreOnAPIFailure = true
cfg.Startup.EtcdRestoreControlPlane = "titan-db"
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
return "", errors.New("api down")
case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
return "ExecStart=/usr/local/bin/k3s server", nil
case name == "ssh" && strings.Contains(command, "etcd-snapshot ls"):
return "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown", nil
case name == "ssh" && strings.Contains(command, "stat -c %s"):
return "2097152", nil
case name == "ssh" && strings.Contains(command, "sha256sum"):
return "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", nil
case name == "ssh" && strings.Contains(command, "server --cluster-reset"):
return "", errors.New("cluster reset failed")
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "restore-hard-fail"})
if err == nil || !strings.Contains(err.Error(), "automatic etcd restore failed") {
t.Fatalf("expected automatic etcd restore failure, got %v", err)
}
})
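	// The datastore endpoint points at a live local listener, so the restore
	// path is not applicable and the API-still-down error surfaces instead.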
t.Run("startup-auto-etcd-restore-not-applicable-then-api-still-down", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
cfg.Startup.AutoEtcdRestoreOnAPIFailure = true
cfg.Startup.EtcdRestoreControlPlane = "titan-db"
listener, err := net.Listen("tcp", "127.0.0.1:0")
if err != nil {
t.Fatalf("open local datastore listener: %v", err)
}
defer listener.Close()
address := listener.Addr().String()
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "version --request-timeout=5s"):
return "", errors.New("api still down")
case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
return "ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://" + address + "/k3s", nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
err = orch.Startup(context.Background(), cluster.StartupOptions{Reason: "restore-not-applicable"})
if err == nil || !strings.Contains(err.Error(), "after automatic etcd restore") {
t.Fatalf("expected api failure after not-applicable restore path, got %v", err)
}
})
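	// kubectl refuses to apply the required node label, failing the
	// ensure-required-node-labels step.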
t.Run("startup-required-node-labels-error", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
cfg.Startup.RequiredNodeLabels = map[string]map[string]string{
"titan-db": {"topology.kubernetes.io/zone": "lab-a"},
}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "label node titan-db --overwrite") {
return "", errors.New("label denied")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "label-fail"})
if err == nil || !strings.Contains(err.Error(), "ensure required node labels") {
t.Fatalf("expected required-node-label failure, got %v", err)
}
})
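	// With no static workers configured, discovery falls back to
	// kubectl get nodes, which is denied here.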
t.Run("startup-worker-discovery-error", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
cfg.Workers = nil
cfg.SSHManagedNodes = []string{"titan-db"}
cfg.SSHNodeHosts = map[string]string{
"titan-db": "titan-db",
}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
if name == "kubectl" && len(args) >= 2 && args[0] == "get" && args[1] == "nodes" {
return "", errors.New("nodes denied")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "worker-discovery-fail"})
if err == nil || !strings.Contains(err.Error(), "discover workers") {
t.Fatalf("expected worker-discovery failure, got %v", err)
}
})
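	// "bad-entry" is not a valid critical-PVC reference, so the storage
	// readiness gate rejects the config even though the nodes report ready.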
t.Run("startup-storage-readiness-error", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
cfg.Startup.RequireStorageReady = true
cfg.Startup.StorageReadyWaitSeconds = 1
cfg.Startup.StorageReadyPollSeconds = 1
cfg.Startup.StorageMinReadyNodes = 1
cfg.Startup.StorageCriticalPVCs = []string{"bad-entry"}
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "nodes.longhorn.io") {
return "a:True:True\n", nil
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "storage-fail"})
if err == nil || !strings.Contains(err.Error(), "invalid storage_critical_pvcs entry") {
t.Fatalf("expected storage readiness failure, got %v", err)
}
})
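	// Scaling the flux-system source-controller deployment is denied, failing
	// the critical-workload scale step.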
t.Run("startup-critical-workload-scale-error", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "scale deployment source-controller") {
return "", errors.New("scale denied")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "critical-scale-fail"})
if err == nil || !strings.Contains(err.Error(), "scale flux-system/deployment/source-controller") {
t.Fatalf("expected critical-workload scale failure, got %v", err)
}
})
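	// Listing flux-system kustomizations is denied, so the flux resume step
	// propagates the kubectl error.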
t.Run("startup-flux-resume-failure", func(t *testing.T) {
cfg := lifecycleFastConfig(t)
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" &&
strings.Contains(command, "-n flux-system get kustomizations.kustomize.toolkit.fluxcd.io") &&
strings.Contains(command, "jsonpath={range .items[*]}{.metadata.name}") {
return "", errors.New("kustomization list denied")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
err := orch.Startup(context.Background(), cluster.StartupOptions{Reason: "resume-fail"})
if err == nil || !strings.Contains(err.Error(), "kustomization list denied") {
t.Fatalf("expected flux-resume failure, got %v", err)
}
})
}