// ananke/testing/orchestrator/hooks_gap_matrix_part3_test.go

package orchestrator

import (
	"context"
	"errors"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
	"scm.bstein.dev/bstein/ananke/internal/config"
)

// TestHookGapMatrixPart3ConvergenceAndStability exercises the startup convergence
// gates and the post-startup stability check.
// Signature: TestHookGapMatrixPart3ConvergenceAndStability(t *testing.T).
// Why: raises coverage for startup convergence orchestration and the stability gates
// that determine whether startup is considered truly complete.
func TestHookGapMatrixPart3ConvergenceAndStability(t *testing.T) {
	t.Run("wait-for-startup-convergence-gate-matrix", func(t *testing.T) {
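		// Gate 1: only the ingress checklist is required. The stubbed kubectl
		// returns an ingress whose host points at an unreachable address, so
		// convergence is expected to fail on the ingress checklist.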
		cfgIngress := lifecycleConfig(t)
		cfgIngress.Startup.RequireIngressChecklist = true
		cfgIngress.Startup.IngressChecklistWaitSeconds = 1
		cfgIngress.Startup.IngressChecklistPollSeconds = 1
		cfgIngress.Startup.RequireServiceChecklist = false
		cfgIngress.Startup.RequireCriticalServiceEndpoints = false
		cfgIngress.Startup.RequireFluxHealth = false
		cfgIngress.Startup.RequireWorkloadConvergence = false
		cfgIngress.Startup.ServiceChecklistStabilitySec = 0
		runIngress := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get ingress -A -o json") {
				return `{"items":[{"metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"rules":[{"host":"127.0.0.1:1"}]}}]}`, nil
			}
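			// Any other command falls back to the shared lifecycle dispatcher stub.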
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchIngress, _ := newHookOrchestrator(t, cfgIngress, runIngress, runIngress)
		if err := orchIngress.TestHookWaitForStartupConvergence(context.Background()); err == nil || !strings.Contains(err.Error(), "ingress checklist") {
			t.Fatalf("expected ingress convergence failure, got %v", err)
		}
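
		// Gate 2: only the service checklist is required, and its single check
		// targets an unreachable URL, so convergence must fail on the service
		// checklist.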
		cfgService := lifecycleConfig(t)
		cfgService.Startup.RequireServiceChecklist = true
		cfgService.Startup.ServiceChecklistWaitSeconds = 1
		cfgService.Startup.ServiceChecklistPollSeconds = 1
		cfgService.Startup.RequireIngressChecklist = false
		cfgService.Startup.RequireCriticalServiceEndpoints = false
		cfgService.Startup.RequireFluxHealth = false
		cfgService.Startup.RequireWorkloadConvergence = false
		cfgService.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
			{Name: "api", URL: "http://127.0.0.1:1/health", AcceptedStatuses: []int{200}, TimeoutSeconds: 1},
		}
		orchService, _ := newHookOrchestrator(t, cfgService, nil, nil)
		if err := orchService.TestHookWaitForStartupConvergence(context.Background()); err == nil || !strings.Contains(err.Error(), "service checklist") {
			t.Fatalf("expected service convergence failure, got %v", err)
		}
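
		// Gate 3: only Flux health is required. The stubbed kubectl reports a
		// Kustomization that is still syncing (Ready=False), so convergence is
		// expected to fail on flux convergence.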
		cfgFlux := lifecycleConfig(t)
		cfgFlux.Startup.RequireIngressChecklist = false
		cfgFlux.Startup.RequireServiceChecklist = false
		cfgFlux.Startup.RequireCriticalServiceEndpoints = false
		cfgFlux.Startup.RequireFluxHealth = true
		cfgFlux.Startup.FluxHealthWaitSeconds = 1
		cfgFlux.Startup.FluxHealthPollSeconds = 1
		cfgFlux.Startup.RequireWorkloadConvergence = false
		runFlux := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json") {
				return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"False","message":"syncing"}]}}]}`, nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchFlux, _ := newHookOrchestrator(t, cfgFlux, runFlux, runFlux)
		if err := orchFlux.TestHookWaitForStartupConvergence(context.Background()); err == nil || !strings.Contains(err.Error(), "flux convergence") {
			t.Fatalf("expected flux convergence failure, got %v", err)
		}
	})

	t.Run("startup-stability-success-and-pod-check-error", func(t *testing.T) {
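		// Happy path: no convergence gates are required and the pod listing is
		// empty, so the startup stability check should report healthy.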
		cfgOK := lifecycleConfig(t)
		cfgOK.Startup.RequireFluxHealth = false
		cfgOK.Startup.RequireWorkloadConvergence = false
		cfgOK.Startup.RequireServiceChecklist = false
		cfgOK.Startup.RequireIngressChecklist = false
		runOK := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get pods -A -o json") {
				return `{"items":[]}`, nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchOK, _ := newHookOrchestrator(t, cfgOK, runOK, runOK)
		if err := orchOK.TestHookStartupStabilityHealthy(context.Background()); err != nil {
			t.Fatalf("expected startup stability success, got %v", err)
		}
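
		// Error path: listing pods fails outright, and the stability check is
		// expected to surface that as a "pod failure check error".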
		cfgErr := cfgOK
		runErr := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get pods -A -o json") {
				return "", errors.New("pod list failed")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchErr, _ := newHookOrchestrator(t, cfgErr, runErr, runErr)
		if err := orchErr.TestHookStartupStabilityHealthy(context.Background()); err == nil || !strings.Contains(err.Error(), "pod failure check error") {
			t.Fatalf("expected pod-check error branch, got %v", err)
		}
	})
}

// TestHookGapMatrixPart3LifecycleRestoreShutdown exercises the etcd restore and
// cluster shutdown success paths.
// Signature: TestHookGapMatrixPart3LifecycleRestoreShutdown(t *testing.T).
// Why: fills lifecycle restore/shutdown success paths that are easy to miss in
// failure-focused drill tests.
func TestHookGapMatrixPart3LifecycleRestoreShutdown(t *testing.T) {
	t.Run("etcd-restore-dry-run-and-success", func(t *testing.T) {
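		// Dry-run path: no command runner is wired in (nil), so the restore is
		// expected to succeed without executing real commands.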
		cfgDry := lifecycleConfig(t)
		dry := newDryRunHookOrchestrator(t, cfgDry, nil)
		if err := dry.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{ControlPlane: "titan-db"}); err != nil {
			t.Fatalf("expected dry-run etcd restore success, got %v", err)
		}
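
		// Real path: stubbed ssh responses cover unit inspection, snapshot
		// discovery, size and checksum verification, the cluster reset, and the
		// k3s stop/start cycle, so the restore should complete end to end.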
		cfg := lifecycleConfig(t)
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "ssh" && strings.Contains(command, "systemctl cat k3s"):
				return "ExecStart=/usr/local/bin/k3s server", nil
			case name == "ssh" && strings.Contains(command, "etcd-snapshot ls"):
				return "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown", nil
			case name == "ssh" && strings.Contains(command, "stat -c %s"):
				return "2097152", nil
			case name == "ssh" && strings.Contains(command, "sha256sum"):
				return "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", nil
			case name == "ssh" && strings.Contains(command, "server --cluster-reset"):
				return "reset done", nil
			case name == "ssh" && strings.Contains(command, "systemctl stop k3s"):
				return "stopped", nil
			case name == "ssh" && strings.Contains(command, "systemctl start k3s"):
				return "started", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		if err := orch.EtcdRestore(context.Background(), cluster.EtcdRestoreOptions{ControlPlane: "titan-db"}); err != nil {
			t.Fatalf("expected etcd restore success path, got %v", err)
		}
	})

	t.Run("shutdown-full-path-cluster-only", func(t *testing.T) {
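		// Cluster-only shutdown: stubbed commands provide a pre-shutdown etcd
		// snapshot plus the deployment and statefulset listings the orchestrator
		// inspects during shutdown, so the full path should succeed.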
		cfg := lifecycleConfig(t)
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "ssh" && strings.Contains(command, "k3s etcd-snapshot save"):
				return "saved", nil
			case name == "ssh" && strings.Contains(command, "k3s etcd-snapshot ls"):
				return "/var/lib/rancher/k3s/server/db/snapshots/pre-shutdown", nil
			case name == "kubectl" && strings.Contains(command, "get deployment -A -o jsonpath="):
				return "monitoring\tgrafana\t1\n", nil
			case name == "kubectl" && strings.Contains(command, "get statefulset -A -o jsonpath="):
				return "", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		err := orch.Shutdown(context.Background(), cluster.ShutdownOptions{Reason: "full", Mode: "cluster-only"})
		if err != nil {
			t.Fatalf("expected full shutdown success, got %v", err)
		}
	})
}