package orchestrator
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"io"
|
|
"log"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"testing"
|
|
"time"
|
|
|
|
"scm.bstein.dev/bstein/ananke/internal/cluster"
|
|
"scm.bstein.dev/bstein/ananke/internal/config"
|
|
"scm.bstein.dev/bstein/ananke/internal/execx"
|
|
"scm.bstein.dev/bstein/ananke/internal/state"
|
|
)
|
|
|
|
// TestHookGapMatrixPart2TimesyncAndStability runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart2TimesyncAndStability(t *testing.T).
// Why: drives low-coverage time-sync, datastore parsing, and startup stability
// branches from the top-level testing module.
func TestHookGapMatrixPart2TimesyncAndStability(t *testing.T) {
	// Table-drive the ExecStart datastore-endpoint parser across unquoted,
	// single-quoted, double-quoted, and malformed-spacing variants.
	t.Run("parse-datastore-endpoint-matrix", func(t *testing.T) {
		cases := []struct {
			line string // raw systemd ExecStart line fed to the parser
			want string // expected extracted endpoint ("" when flag absent)
		}{
			{"ExecStart=/usr/local/bin/k3s server", ""},
			{"ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://db:5432/k3s", "postgres://db:5432/k3s"},
			{"ExecStart=/usr/local/bin/k3s server --datastore-endpoint='postgres://db:5432/k3s' \\", "postgres://db:5432/k3s"},
			// NOTE(review): the space-separated "--datastore-endpoint = \"...\"" form
			// yielding "=" pins current parser behavior; looks like a quirk being
			// locked in rather than a desired result — confirm against the parser.
			{"ExecStart=/usr/local/bin/k3s server --datastore-endpoint = \"postgres://db:5432/k3s\" \\", "="},
			{"X --datastore-endpoint= \"postgres://db:5432/k3s\" ", "postgres://db:5432/k3s"},
		}
		for _, tc := range cases {
			got := cluster.TestHookParseDatastoreEndpoint(tc.line)
			if got != tc.want {
				t.Fatalf("parseDatastoreEndpoint(%q)=%q want %q", tc.line, got, tc.want)
			}
		}
	})

	// Strict mode: force NTPSynchronized="no" on both local (sh) and remote
	// (ssh) probes so the 1s wait window expires and the timeout branch fires.
	t.Run("wait-for-time-sync-strict-timeout", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.TimeSyncMode = "strict"
		cfg.Startup.TimeSyncWaitSeconds = 1
		cfg.Startup.TimeSyncPollSeconds = 1
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "sh" && strings.Contains(command, "timedatectl show -p NTPSynchronized --value") {
				return "no", nil
			}
			if name == "ssh" && strings.Contains(command, "timedatectl show -p NTPSynchronized --value") {
				return "no", nil
			}
			// Anything else falls through to the shared lifecycle dispatcher.
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		err := orch.TestHookWaitForTimeSync(context.Background(), []string{"titan-db"})
		if err == nil || !strings.Contains(err.Error(), "time sync not ready") {
			t.Fatalf("expected strict time-sync timeout branch, got %v", err)
		}
	})

	// Quorum mode with quorum=1: titan-db reports synced, titan-23 never does;
	// one synced host must still satisfy the wait.
	t.Run("wait-for-time-sync-quorum-success", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.ControlPlanes = []string{"titan-db", "titan-23"}
		cfg.Startup.TimeSyncMode = "quorum"
		cfg.Startup.TimeSyncQuorum = 1
		cfg.Startup.TimeSyncWaitSeconds = 2
		cfg.Startup.TimeSyncPollSeconds = 1
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "sh" && strings.Contains(command, "timedatectl show -p NTPSynchronized --value"):
				// Local probe: synced.
				return "yes", nil
			case name == "ssh" && strings.Contains(command, "titan-db") && strings.Contains(command, "timedatectl show -p NTPSynchronized --value"):
				// Remote titan-db: synced — enough to meet quorum=1.
				return "yes", nil
			case name == "ssh" && strings.Contains(command, "titan-23") && strings.Contains(command, "timedatectl show -p NTPSynchronized --value"):
				// Remote titan-23: permanently unsynced.
				return "no", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		if err := orch.TestHookWaitForTimeSync(context.Background(), []string{"titan-db", "titan-23"}); err != nil {
			t.Fatalf("expected quorum time-sync success, got %v", err)
		}
	})

	// Stability failure matrix: disable every optional gate in the base config,
	// then re-enable exactly one per sub-case and stub kubectl so that gate fails.
	t.Run("startup-stability-failure-matrix", func(t *testing.T) {
		baseCfg := lifecycleConfig(t)
		baseCfg.Startup.RequireIngressChecklist = false
		baseCfg.Startup.RequireServiceChecklist = false
		baseCfg.Startup.RequireWorkloadConvergence = false
		baseCfg.Startup.RequireFluxHealth = false

		// Case 1: always-on pod health gate — one CrashLoopBackOff pod must fail it.
		runPodsCrash := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get pods -A -o json") {
				return `{"items":[{"metadata":{"namespace":"default","name":"bad-pod"},"status":{"containerStatuses":[{"state":{"waiting":{"reason":"CrashLoopBackOff"}}}]}}]}`, nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchCrash, _ := newHookOrchestrator(t, baseCfg, runPodsCrash, runPodsCrash)
		if err := orchCrash.TestHookStartupStabilityHealthy(context.Background()); err == nil || !strings.Contains(err.Error(), "pods in crash/image-pull failures") {
			t.Fatalf("expected crashloop stability failure, got %v", err)
		}

		// Case 2: flux gate — Ready=False kustomization with all pods healthy.
		// Value copy of baseCfg; the flipped flag is a value field, so the
		// mutation stays local to this sub-case.
		cfgFlux := baseCfg
		cfgFlux.Startup.RequireFluxHealth = true
		runFlux := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json") {
				return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"False","message":"syncing"}]}}]}`, nil
			}
			if name == "kubectl" && strings.Contains(command, "get pods -A -o json") {
				// Empty pod list keeps the pod gate green.
				return `{"items":[]}`, nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchFlux, _ := newHookOrchestrator(t, cfgFlux, runFlux, runFlux)
		if err := orchFlux.TestHookStartupStabilityHealthy(context.Background()); err == nil || !strings.Contains(err.Error(), "flux not ready") {
			t.Fatalf("expected flux-not-ready stability failure, got %v", err)
		}

		// Case 3: workload-convergence gate — deployment wants 1 replica, 0 ready.
		cfgWork := baseCfg
		cfgWork.Startup.RequireWorkloadConvergence = true
		runWork := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"):
				return `{"items":[{"kind":"Deployment","metadata":{"namespace":"default","name":"app"},"spec":{"replicas":1},"status":{"readyReplicas":0}}]}`, nil
			case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
				return `{"items":[]}`, nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchWork, _ := newHookOrchestrator(t, cfgWork, runWork, runWork)
		if err := orchWork.TestHookStartupStabilityHealthy(context.Background()); err == nil || !strings.Contains(err.Error(), "workloads not converged") {
			t.Fatalf("expected workload convergence stability failure, got %v", err)
		}

		// Case 4: service-checklist gate — port 1 on loopback is unreachable,
		// so the HTTP probe must fail within its 1s timeout.
		cfgService := baseCfg
		cfgService.Startup.RequireServiceChecklist = true
		cfgService.Startup.ServiceChecklist = []config.ServiceChecklistCheck{
			{Name: "api", URL: "http://127.0.0.1:1/health", AcceptedStatuses: []int{200}, TimeoutSeconds: 1},
		}
		runService := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get pods -A -o json") {
				return `{"items":[]}`, nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchService, _ := newHookOrchestrator(t, cfgService, runService, runService)
		if err := orchService.TestHookStartupStabilityHealthy(context.Background()); err == nil || !strings.Contains(err.Error(), "external services not healthy") {
			t.Fatalf("expected service checklist stability failure, got %v", err)
		}
	})
}
|
|
|
|
// TestHookGapMatrixPart2FluxScalingReport runs one orchestration or CLI step.
// Signature: TestHookGapMatrixPart2FluxScalingReport(t *testing.T).
// Why: targets low branch density in flux-health, scaling snapshot handling,
// and report sanitization helpers.
func TestHookGapMatrixPart2FluxScalingReport(t *testing.T) {
	t.Run("flux-helper-matrix", func(t *testing.T) {
		// Pure helper predicates: immutable-job error matcher (case-insensitive),
		// flux-managed detection via the kustomize label, and job-failed status.
		if !cluster.TestHookLooksLikeImmutableJobError("Job update failed: FIELD IS IMMUTABLE") {
			t.Fatalf("expected immutable matcher true for uppercase+job variant")
		}
		if cluster.TestHookLooksLikeImmutableJobError("totally unrelated error") {
			t.Fatalf("expected immutable matcher false")
		}
		if !cluster.TestHookJobLooksFluxManaged("flux-system", "job-a", map[string]string{"kustomize.toolkit.fluxcd.io/name": "services"}, nil) {
			t.Fatalf("expected flux-managed job by kustomize label")
		}
		if cluster.TestHookJobFailed(1, 0, []string{"Failed"}, []string{"True"}) != true {
			t.Fatalf("expected Failed=True to mark job failed")
		}

		// Heal path: a failed flux-managed job is listed, its delete is accepted,
		// and the kustomization then reports Ready=True so the adaptive wait passes.
		cfg := lifecycleConfig(t)
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "get jobs -A -o json"):
				// One failed job carrying the kustomize label → heal candidate.
				return `{"items":[{"metadata":{"namespace":"flux-system","name":"job-a","labels":{"kustomize.toolkit.fluxcd.io/name":"services"}},"status":{"failed":1,"conditions":[{"type":"Failed","status":"True"}]}}]}`, nil
			case name == "kubectl" && strings.Contains(command, "delete job -n flux-system job-a"):
				// Accept the delete silently.
				return "", nil
			case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
				return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"True","message":"ok"}]}}]}`, nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		healed, err := orch.TestHookHealImmutableFluxJobs(context.Background())
		if err != nil || !healed {
			t.Fatalf("expected immutable flux job heal success, healed=%t err=%v", healed, err)
		}
		if _, _, err := orch.TestHookAdaptiveFluxHealthWait(context.Background(), 2*time.Second); err != nil {
			t.Fatalf("expected adaptive flux wait success, got %v", err)
		}
	})

	t.Run("scaling-snapshot-branch-matrix", func(t *testing.T) {
		// Restoring with no snapshot on disk must be a no-op success.
		cfg := lifecycleConfig(t)
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		if err := orch.TestHookRestoreScaledApps(context.Background()); err != nil {
			t.Fatalf("expected empty snapshot restore success, got %v", err)
		}

		// Point State.Dir at a regular FILE so creating the state directory
		// fails, driving the "ensure state dir" error branch of the writer.
		stateFile := filepath.Join(t.TempDir(), "state-file")
		if err := os.WriteFile(stateFile, []byte("x"), 0o644); err != nil {
			t.Fatalf("write state-file: %v", err)
		}
		cfgWriteErr := lifecycleConfig(t)
		cfgWriteErr.State.Dir = stateFile
		// Built directly (not via newHookOrchestrator) so the broken State.Dir
		// survives; logs are discarded.
		orchWriteErr := cluster.New(cfgWriteErr, &execx.Runner{DryRun: false}, state.New(cfgWriteErr.State.RunHistoryPath), log.New(io.Discard, "", 0))
		dispatch := lifecycleDispatcher(&commandRecorder{})
		orchWriteErr.SetCommandOverrides(dispatch, dispatch)
		if err := orchWriteErr.TestHookWriteScaledWorkloadSnapshot(nil); err == nil || !strings.Contains(err.Error(), "ensure state dir") {
			t.Fatalf("expected scaled snapshot mkdir failure, got %v", err)
		}
	})

	t.Run("report-sanitize-and-checklist-host-parsers", func(t *testing.T) {
		// Sanitized report names must be non-empty and free of spaces/slashes.
		got := cluster.TestHookSanitizeReportFileName(" Startup / Drill : Night#2 ")
		if got == "" || strings.Contains(got, " ") || strings.Contains(got, "/") {
			t.Fatalf("unexpected sanitized report filename: %q", got)
		}

		// Host extraction from a checklist failure line ("host: detail...").
		cfg := lifecycleConfig(t)
		orch, _ := newHookOrchestrator(t, cfg, nil, nil)
		host := orch.TestHookChecklistFailureHost("metrics.bstein.dev: GET https://metrics.bstein.dev/: EOF")
		if host != "metrics.bstein.dev" {
			t.Fatalf("expected checklist failure host extraction, got %q", host)
		}
	})
}
|
|
|
|
// TestHookGapMatrixPart2VaultAndCoordination runs one orchestration or CLI step.
|
|
// Signature: TestHookGapMatrixPart2VaultAndCoordination(t *testing.T).
|
|
// Why: raises branch coverage on vault/key and coordination helpers without
|
|
// requiring package-local tests.
|
|
func TestHookGapMatrixPart2VaultAndCoordination(t *testing.T) {
|
|
t.Run("vault-unseal-and-file-branches", func(t *testing.T) {
|
|
cfg := lifecycleConfig(t)
|
|
cfg.Startup.VaultUnsealKeyFile = ""
|
|
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
|
if err := orch.TestHookWriteVaultUnsealKeyFile("abc"); err == nil || !strings.Contains(err.Error(), "path is empty") {
|
|
t.Fatalf("expected empty vault key path error")
|
|
}
|
|
if cluster.TestHookIsNotFoundErr("") {
|
|
t.Fatalf("expected nil/notfound helper false on empty input")
|
|
}
|
|
if !cluster.TestHookIsNotFoundErr("resource not found") {
|
|
t.Fatalf("expected notfound helper true for notfound text")
|
|
}
|
|
|
|
runPhase := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
if name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}") {
|
|
return "Pending", nil
|
|
}
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
orchPhase, _ := newHookOrchestrator(t, lifecycleConfig(t), runPhase, runPhase)
|
|
if err := orchPhase.TestHookEnsureVaultUnsealed(context.Background()); err == nil || !strings.Contains(err.Error(), "pod phase") {
|
|
t.Fatalf("expected vault phase gate error, got %v", err)
|
|
}
|
|
})
|
|
|
|
t.Run("coordination-peers-and-snapshot-stat-error", func(t *testing.T) {
|
|
cfg := lifecycleConfig(t)
|
|
cfg.Coordination.PeerHosts = []string{" titan-24 ", "titan-24", " ", "titan-jh"}
|
|
orch, _ := newHookOrchestrator(t, cfg, nil, nil)
|
|
peers := orch.TestHookCoordinationPeers()
|
|
if len(peers) != 2 || peers[0] != "titan-24" || peers[1] != "titan-jh" {
|
|
t.Fatalf("unexpected normalized peers: %v", peers)
|
|
}
|
|
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
if name == "ssh" && strings.Contains(command, "stat -c %s") {
|
|
return "", errors.New("stat failed")
|
|
}
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
orchErr, _ := newHookOrchestrator(t, lifecycleConfig(t), run, run)
|
|
if err := orchErr.TestHookVerifyEtcdSnapshot(context.Background(), "titan-db", "/snap/path"); err == nil || !strings.Contains(err.Error(), "stat failed") {
|
|
t.Fatalf("expected snapshot stat error branch, got %v", err)
|
|
}
|
|
})
|
|
}
|
|
|
|
// TestHookGapMatrixPart2WorkloadIgnore runs one orchestration or CLI step.
|
|
// Signature: TestHookGapMatrixPart2WorkloadIgnore(t *testing.T).
|
|
// Why: expands low branch coverage in workload ignore helpers and startup-failure
|
|
// pod classification.
|
|
func TestHookGapMatrixPart2WorkloadIgnore(t *testing.T) {
|
|
t.Run("ignored-node-helper-matrix", func(t *testing.T) {
|
|
if !cluster.TestHookWorkloadTargetsIgnoredNodes("titan-22", nil, []string{"titan-22"}) {
|
|
t.Fatalf("expected selector-host ignored match")
|
|
}
|
|
if cluster.TestHookWorkloadTargetsIgnoredNodes("titan-23", []string{"titan-24"}, []string{"titan-22"}) {
|
|
t.Fatalf("expected workload targets ignored false when no ignored host targeted")
|
|
}
|
|
if !cluster.TestHookWorkloadTargetsIgnoredNodes("", []string{"titan-22"}, []string{"titan-22"}) {
|
|
t.Fatalf("expected affinity host ignored match")
|
|
}
|
|
if !cluster.TestHookPodTargetsIgnoredNode("titan-22", []string{"titan-22"}) {
|
|
t.Fatalf("expected pod ignored-node match")
|
|
}
|
|
if cluster.TestHookPodTargetsIgnoredNode("titan-23", []string{"titan-22"}) {
|
|
t.Fatalf("expected pod ignored-node mismatch")
|
|
}
|
|
})
|
|
|
|
t.Run("startup-failure-pods-decode-error-and-success", func(t *testing.T) {
|
|
cfg := lifecycleConfig(t)
|
|
runBad := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
if name == "kubectl" && strings.Contains(command, "get pods -A -o json") {
|
|
return "{bad json", nil
|
|
}
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
orchBad, _ := newHookOrchestrator(t, cfg, runBad, runBad)
|
|
if _, err := orchBad.TestHookStartupFailurePods(context.Background()); err == nil {
|
|
t.Fatalf("expected startupFailurePods decode error")
|
|
}
|
|
|
|
runOK := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
|
command := name + " " + strings.Join(args, " ")
|
|
if name == "kubectl" && strings.Contains(command, "get pods -A -o json") {
|
|
return `{"items":[{"metadata":{"namespace":"default","name":"ok-pod"},"status":{"containerStatuses":[{"state":{"running":{}}}]}}]}`, nil
|
|
}
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
|
}
|
|
orchOK, _ := newHookOrchestrator(t, cfg, runOK, runOK)
|
|
failures, err := orchOK.TestHookStartupFailurePods(context.Background())
|
|
if err != nil || len(failures) != 0 {
|
|
t.Fatalf("expected no startup failures, failures=%v err=%v", failures, err)
|
|
}
|
|
})
|
|
|
|
t.Run("stuck-vault-init-reason-matrix", func(t *testing.T) {
|
|
if got := cluster.TestHookStuckVaultInitReason("Running", true, 0, 10*time.Second); got != "" {
|
|
t.Fatalf("expected no stuck init reason without running init, got %q", got)
|
|
}
|
|
if got := cluster.TestHookStuckVaultInitReason("Pending", true, 30*time.Second, 10*time.Second); !strings.Contains(got, "VaultInitStuck") {
|
|
t.Fatalf("expected stuck vault init reason, got %q", got)
|
|
}
|
|
})
|
|
}
|