// File: ananke/testing/orchestrator/hooks_gap_matrix_part2_test.go
package orchestrator
import (
	"context"
	"errors"
	"io"
	"log"
	"os"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/execx"
	"scm.bstein.dev/bstein/ananke/internal/state"
)
// TestHookGapMatrixPart2TimesyncAndStability runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart2TimesyncAndStability(t *testing.T).
// Why: drives low-coverage time-sync, datastore parsing, and startup stability
// branches from the top-level testing module.
func TestHookGapMatrixPart2TimesyncAndStability(t *testing.T) {
	t.Run("parse-datastore-endpoint-matrix", func(t *testing.T) {
		// Each entry pairs a systemd ExecStart line with the endpoint value the
		// parser is expected to extract ("" means no usable endpoint flag).
		tests := []struct {
			line string
			want string
		}{
			{"ExecStart=/usr/local/bin/k3s server", ""},
			{"ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://db:5432/k3s", "postgres://db:5432/k3s"},
			{"ExecStart=/usr/local/bin/k3s server --datastore-endpoint='postgres://db:5432/k3s' \\", "postgres://db:5432/k3s"},
			{"ExecStart=/usr/local/bin/k3s server --datastore-endpoint = \"postgres://db:5432/k3s\" \\", "="},
			{"X --datastore-endpoint= \"postgres://db:5432/k3s\" ", "postgres://db:5432/k3s"},
		}
		for _, tt := range tests {
			if got := cluster.TestHookParseDatastoreEndpoint(tt.line); got != tt.want {
				t.Fatalf("parseDatastoreEndpoint(%q)=%q want %q", tt.line, got, tt.want)
			}
		}
	})
	t.Run("wait-for-time-sync-strict-timeout", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.TimeSyncMode = "strict"
		cfg.Startup.TimeSyncWaitSeconds = 1
		cfg.Startup.TimeSyncPollSeconds = 1
		// Every NTPSynchronized probe (local "sh" or remote "ssh") answers "no",
		// so the strict-mode wait must exhaust its budget and return an error.
		neverSynced := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if (name == "sh" || name == "ssh") && strings.Contains(command, "timedatectl show -p NTPSynchronized --value") {
				return "no", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, cfg, neverSynced, neverSynced)
		err := orch.TestHookWaitForTimeSync(context.Background(), []string{"titan-db"})
		if err == nil || !strings.Contains(err.Error(), "time sync not ready") {
			t.Fatalf("expected strict time-sync timeout branch, got %v", err)
		}
	})
	t.Run("wait-for-time-sync-quorum-success", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.ControlPlanes = []string{"titan-db", "titan-23"}
		cfg.Startup.TimeSyncMode = "quorum"
		cfg.Startup.TimeSyncQuorum = 1
		cfg.Startup.TimeSyncWaitSeconds = 2
		cfg.Startup.TimeSyncPollSeconds = 1
		// The local node and titan-db report synced while titan-23 never does;
		// with quorum=1 the wait must still succeed.
		probe := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "sh" && strings.Contains(command, "timedatectl show -p NTPSynchronized --value"):
				return "yes", nil
			case name == "ssh" && strings.Contains(command, "titan-db") && strings.Contains(command, "timedatectl show -p NTPSynchronized --value"):
				return "yes", nil
			case name == "ssh" && strings.Contains(command, "titan-23") && strings.Contains(command, "timedatectl show -p NTPSynchronized --value"):
				return "no", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, probe, probe)
		if err := orch.TestHookWaitForTimeSync(context.Background(), []string{"titan-db", "titan-23"}); err != nil {
			t.Fatalf("expected quorum time-sync success, got %v", err)
		}
	})
	t.Run("startup-stability-failure-matrix", func(t *testing.T) {
		// Base config disables every optional stability gate; each scenario
		// below re-enables one (or injects failing pod data) to hit its branch.
		baseCfg := lifecycleConfig(t)
		baseCfg.Startup.RequireIngressChecklist = false
		baseCfg.Startup.RequireServiceChecklist = false
		baseCfg.Startup.RequireWorkloadConvergence = false
		baseCfg.Startup.RequireFluxHealth = false

		// Scenario 1: a CrashLoopBackOff container trips the pod-failure gate.
		crashPods := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get pods -A -o json") {
				return `{"items":[{"metadata":{"namespace":"default","name":"bad-pod"},"status":{"containerStatuses":[{"state":{"waiting":{"reason":"CrashLoopBackOff"}}}]}}]}`, nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchCrash, _ := newHookOrchestrator(t, baseCfg, crashPods, crashPods)
		if err := orchCrash.TestHookStartupStabilityHealthy(context.Background()); err == nil || !strings.Contains(err.Error(), "pods in crash/image-pull failures") {
			t.Fatalf("expected crashloop stability failure, got %v", err)
		}

		// Scenario 2: flux health required but the kustomization is Ready=False.
		cfgFlux := baseCfg
		cfgFlux.Startup.RequireFluxHealth = true
		fluxNotReady := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
				return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"False","message":"syncing"}]}}]}`, nil
			case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
				return `{"items":[]}`, nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchFlux, _ := newHookOrchestrator(t, cfgFlux, fluxNotReady, fluxNotReady)
		if err := orchFlux.TestHookStartupStabilityHealthy(context.Background()); err == nil || !strings.Contains(err.Error(), "flux not ready") {
			t.Fatalf("expected flux-not-ready stability failure, got %v", err)
		}

		// Scenario 3: workload convergence required but a deployment is 0/1 ready.
		cfgWork := baseCfg
		cfgWork.Startup.RequireWorkloadConvergence = true
		unconverged := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"):
				return `{"items":[{"kind":"Deployment","metadata":{"namespace":"default","name":"app"},"spec":{"replicas":1},"status":{"readyReplicas":0}}]}`, nil
			case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
				return `{"items":[]}`, nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchWork, _ := newHookOrchestrator(t, cfgWork, unconverged, unconverged)
		if err := orchWork.TestHookStartupStabilityHealthy(context.Background()); err == nil || !strings.Contains(err.Error(), "workloads not converged") {
			t.Fatalf("expected workload convergence stability failure, got %v", err)
		}

		// Scenario 4: the service checklist points at an unroutable port so the
		// external-service probe must report unhealthy.
		cfgService := baseCfg
		cfgService.Startup.RequireServiceChecklist = true
		cfgService.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
			{Name: "api", URL: "http://127.0.0.1:1/health", AcceptedStatuses: []int{200}, TimeoutSeconds: 1},
		}
		healthyPods := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get pods -A -o json") {
				return `{"items":[]}`, nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchService, _ := newHookOrchestrator(t, cfgService, healthyPods, healthyPods)
		if err := orchService.TestHookStartupStabilityHealthy(context.Background()); err == nil || !strings.Contains(err.Error(), "external services not healthy") {
			t.Fatalf("expected service checklist stability failure, got %v", err)
		}
	})
}
// TestHookGapMatrixPart2FluxScalingReport runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart2FluxScalingReport(t *testing.T).
// Why: targets low branch density in flux-health, scaling snapshot handling,
// and report sanitization helpers.
func TestHookGapMatrixPart2FluxScalingReport(t *testing.T) {
	t.Run("flux-helper-matrix", func(t *testing.T) {
		// Pure helper assertions first: immutable-error matching, flux ownership
		// detection by kustomize label, and job failure classification.
		if !cluster.TestHookLooksLikeImmutableJobError("Job update failed: FIELD IS IMMUTABLE") {
			t.Fatalf("expected immutable matcher true for uppercase+job variant")
		}
		if cluster.TestHookLooksLikeImmutableJobError("totally unrelated error") {
			t.Fatalf("expected immutable matcher false")
		}
		if !cluster.TestHookJobLooksFluxManaged("flux-system", "job-a", map[string]string{"kustomize.toolkit.fluxcd.io/name": "services"}, nil) {
			t.Fatalf("expected flux-managed job by kustomize label")
		}
		if !cluster.TestHookJobFailed(1, 0, []string{"Failed"}, []string{"True"}) {
			t.Fatalf("expected Failed=True to mark job failed")
		}
		// End-to-end: one failed flux-managed job is listed, deleted, and the
		// kustomization then reports Ready=True so the adaptive wait succeeds.
		cfg := lifecycleConfig(t)
		healDispatch := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "get jobs -A -o json"):
				return `{"items":[{"metadata":{"namespace":"flux-system","name":"job-a","labels":{"kustomize.toolkit.fluxcd.io/name":"services"}},"status":{"failed":1,"conditions":[{"type":"Failed","status":"True"}]}}]}`, nil
			case name == "kubectl" && strings.Contains(command, "delete job -n flux-system job-a"):
				return "", nil
			case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
				return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"True","message":"ok"}]}}]}`, nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, healDispatch, healDispatch)
		healed, err := orch.TestHookHealImmutableFluxJobs(context.Background())
		if err != nil || !healed {
			t.Fatalf("expected immutable flux job heal success, healed=%t err=%v", healed, err)
		}
		if _, _, err := orch.TestHookAdaptiveFluxHealthWait(context.Background(), 2*time.Second); err != nil {
			t.Fatalf("expected adaptive flux wait success, got %v", err)
		}
	})
	t.Run("scaling-snapshot-branch-matrix", func(t *testing.T) {
		// Restoring with no snapshot recorded is a no-op success.
		cfg := lifecycleConfig(t)
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		if err := orch.TestHookRestoreScaledApps(context.Background()); err != nil {
			t.Fatalf("expected empty snapshot restore success, got %v", err)
		}
		// Point State.Dir at a regular file so creating the state directory
		// fails and the writer surfaces the "ensure state dir" error.
		stateFile := filepath.Join(t.TempDir(), "state-file")
		if err := os.WriteFile(stateFile, []byte("x"), 0o644); err != nil {
			t.Fatalf("write state-file: %v", err)
		}
		cfgWriteErr := lifecycleConfig(t)
		cfgWriteErr.State.Dir = stateFile
		orchWriteErr := cluster.New(cfgWriteErr, &execx.Runner{DryRun: false}, state.New(cfgWriteErr.State.RunHistoryPath), log.New(io.Discard, "", 0))
		dispatch := lifecycleDispatcher(&commandRecorder{})
		orchWriteErr.SetCommandOverrides(dispatch, dispatch)
		if err := orchWriteErr.TestHookWriteScaledWorkloadSnapshot(nil); err == nil || !strings.Contains(err.Error(), "ensure state dir") {
			t.Fatalf("expected scaled snapshot mkdir failure, got %v", err)
		}
	})
	t.Run("report-sanitize-and-checklist-host-parsers", func(t *testing.T) {
		// Sanitized report names must be non-empty and free of spaces/slashes.
		sanitized := cluster.TestHookSanitizeReportFileName(" Startup / Drill : Night#2 ")
		if sanitized == "" || strings.Contains(sanitized, " ") || strings.Contains(sanitized, "/") {
			t.Fatalf("unexpected sanitized report filename: %q", sanitized)
		}
		cfg := lifecycleConfig(t)
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		host := orch.TestHookChecklistFailureHost("metrics.bstein.dev: GET https://metrics.bstein.dev/: EOF")
		if host != "metrics.bstein.dev" {
			t.Fatalf("expected checklist failure host extraction, got %q", host)
		}
	})
}
// TestHookGapMatrixPart2VaultAndCoordination runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart2VaultAndCoordination(t *testing.T).
// Why: raises branch coverage on vault/key and coordination helpers without
// requiring package-local tests.
func TestHookGapMatrixPart2VaultAndCoordination(t *testing.T) {
	t.Run("vault-unseal-and-file-branches", func(t *testing.T) {
		// An empty unseal-key path must be rejected with a "path is empty" error.
		cfg := lifecycleConfig(t)
		cfg.Startup.VaultUnsealKeyFile = ""
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		if err := orch.TestHookWriteVaultUnsealKeyFile("abc"); err == nil || !strings.Contains(err.Error(), "path is empty") {
			t.Fatalf("expected empty vault key path error")
		}
		if cluster.TestHookIsNotFoundErr("") {
			t.Fatalf("expected nil/notfound helper false on empty input")
		}
		if !cluster.TestHookIsNotFoundErr("resource not found") {
			t.Fatalf("expected notfound helper true for notfound text")
		}
		// A vault-0 pod reported as Pending must block the unseal flow with a
		// pod-phase error.
		pendingPhase := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") {
				return "Pending", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchPhase, _ := newHookOrchestrator(t, lifecycleConfig(t), pendingPhase, pendingPhase)
		if err := orchPhase.TestHookEnsureVaultUnsealed(context.Background()); err == nil || !strings.Contains(err.Error(), "pod phase") {
			t.Fatalf("expected vault phase gate error, got %v", err)
		}
	})
	t.Run("coordination-peers-and-snapshot-stat-error", func(t *testing.T) {
		// Peer normalization: whitespace trimmed, blanks dropped, duplicates
		// collapsed, order preserved.
		cfg := lifecycleConfig(t)
		cfg.Coordination.PeerHosts = []string{" titan-24 ", "titan-24", " ", "titan-jh"}
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		peers := orch.TestHookCoordinationPeers()
		if len(peers) != 2 || peers[0] != "titan-24" || peers[1] != "titan-jh" {
			t.Fatalf("unexpected normalized peers: %v", peers)
		}
		// A failing remote size stat must surface through snapshot verification.
		statFails := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "ssh" && strings.Contains(command, "stat -c %s") {
				return "", errors.New("stat failed")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchErr, _ := newHookOrchestrator(t, lifecycleConfig(t), statFails, statFails)
		if err := orchErr.TestHookVerifyEtcdSnapshot(context.Background(), "titan-db", "/snap/path"); err == nil || !strings.Contains(err.Error(), "stat failed") {
			t.Fatalf("expected snapshot stat error branch, got %v", err)
		}
	})
}
// TestHookGapMatrixPart2WorkloadIgnore runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart2WorkloadIgnore(t *testing.T).
// Why: expands low branch coverage in workload ignore helpers and startup-failure
// pod classification.
func TestHookGapMatrixPart2WorkloadIgnore(t *testing.T) {
	t.Run("ignored-node-helper-matrix", func(t *testing.T) {
		// Exercise selector-host, affinity-host, and pod-level ignore matching,
		// including the negative (no ignored host targeted) branches.
		if !cluster.TestHookWorkloadTargetsIgnoredNodes("titan-22", nil, []string{"titan-22"}) {
			t.Fatalf("expected selector-host ignored match")
		}
		if cluster.TestHookWorkloadTargetsIgnoredNodes("titan-23", []string{"titan-24"}, []string{"titan-22"}) {
			t.Fatalf("expected workload targets ignored false when no ignored host targeted")
		}
		if !cluster.TestHookWorkloadTargetsIgnoredNodes("", []string{"titan-22"}, []string{"titan-22"}) {
			t.Fatalf("expected affinity host ignored match")
		}
		if !cluster.TestHookPodTargetsIgnoredNode("titan-22", []string{"titan-22"}) {
			t.Fatalf("expected pod ignored-node match")
		}
		if cluster.TestHookPodTargetsIgnoredNode("titan-23", []string{"titan-22"}) {
			t.Fatalf("expected pod ignored-node mismatch")
		}
	})
	t.Run("startup-failure-pods-decode-error-and-success", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		// Malformed kubectl JSON must surface as a decode error.
		badJSON := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get pods -A -o json") {
				return "{bad json", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchBad, _ := newHookOrchestrator(t, cfg, badJSON, badJSON)
		if _, err := orchBad.TestHookStartupFailurePods(context.Background()); err == nil {
			t.Fatalf("expected startupFailurePods decode error")
		}
		// A single running pod must yield an empty failure list and no error.
		goodJSON := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get pods -A -o json") {
				return `{"items":[{"metadata":{"namespace":"default","name":"ok-pod"},"status":{"containerStatuses":[{"state":{"running":{}}}]}}]}`, nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchOK, _ := newHookOrchestrator(t, cfg, goodJSON, goodJSON)
		failures, err := orchOK.TestHookStartupFailurePods(context.Background())
		if err != nil || len(failures) != 0 {
			t.Fatalf("expected no startup failures, failures=%v err=%v", failures, err)
		}
	})
	t.Run("stuck-vault-init-reason-matrix", func(t *testing.T) {
		// Running pod with zero elapsed time: no stuck reason expected.
		if got := cluster.TestHookStuckVaultInitReason("Running", true, 0, 10*time.Second); got != "" {
			t.Fatalf("expected no stuck init reason without running init, got %q", got)
		}
		// Pending pod past the threshold must report VaultInitStuck.
		if got := cluster.TestHookStuckVaultInitReason("Pending", true, 30*time.Second, 10*time.Second); !strings.Contains(got, "VaultInitStuck") {
			t.Fatalf("expected stuck vault init reason, got %q", got)
		}
	})
}