package orchestrator

import (
	"context"
	"errors"
	"io"
	"log"
	"os"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
	"scm.bstein.dev/bstein/ananke/internal/execx"
	"scm.bstein.dev/bstein/ananke/internal/state"
)
|
||
|
|
|
||
|
|
// TestHookLowFileCoverageBoost runs one orchestration or CLI step.
// Signature: TestHookLowFileCoverageBoost(t *testing.T).
// Why: raises the low-coverage orchestrator files through deterministic top-level
// tests that only use the exported hook surface.
//
// Each subtest builds fake `kubectl` runners (closures matched on the command
// string) and feeds them to newHookOrchestrator, so no real cluster is touched.
func TestHookLowFileCoverageBoost(t *testing.T) {
	t.Run("workload-convergence-helpers-and-gates", func(t *testing.T) {
		// Table-driven check of the desired/ready replica computation per
		// controller kind. rep/ready apply to deployments, sched/num to
		// daemonsets; unknown kinds (e.g. "job") are not convergence-gated.
		desiredCases := []struct {
			kind  string
			has   bool
			rep   int32
			ready int32
			sched int32
			num   int32
			wantD int32
			wantR int32
			wantB bool
		}{
			{kind: "Deployment", has: false, ready: 1, wantD: 1, wantR: 1, wantB: true},
			{kind: "deployment", has: true, rep: 3, ready: 2, wantD: 3, wantR: 2, wantB: true},
			{kind: "daemonset", sched: 4, num: 3, wantD: 4, wantR: 3, wantB: true},
			{kind: "job", wantD: 0, wantR: 0, wantB: false},
		}
		for _, tc := range desiredCases {
			gotD, gotR, gotB := cluster.TestHookDesiredReady(tc.kind, tc.has, tc.rep, tc.ready, tc.sched, tc.num)
			if gotD != tc.wantD || gotR != tc.wantR || gotB != tc.wantB {
				t.Fatalf("desiredReady(%q)=%d,%d,%v want %d,%d,%v", tc.kind, gotD, gotR, gotB, tc.wantD, tc.wantR, tc.wantB)
			}
		}

		// Controller-ownership detection: ReplicaSet/StatefulSet/DaemonSet
		// owners count as controller-owned, Job owners do not.
		if !cluster.TestHookPodControllerOwned([]string{"ReplicaSet"}) {
			t.Fatalf("expected ReplicaSet owner to be controller-owned")
		}
		if !cluster.TestHookPodControllerOwned([]string{"StatefulSet"}) {
			t.Fatalf("expected StatefulSet owner to be controller-owned")
		}
		if !cluster.TestHookPodControllerOwned([]string{"DaemonSet"}) {
			t.Fatalf("expected DaemonSet owner to be controller-owned")
		}
		if cluster.TestHookPodControllerOwned([]string{"Job"}) {
			t.Fatalf("expected Job owner to be non controller-owned")
		}

		// Stuck-container reason extraction: args are (init reasons, container
		// reasons, allowed filter). A reason not in the filter yields "".
		if got := cluster.TestHookStuckContainerReason([]string{"ImagePullBackOff"}, nil, []string{"ImagePullBackOff"}); got != "ImagePullBackOff" {
			t.Fatalf("expected init-container stuck reason, got %q", got)
		}
		if got := cluster.TestHookStuckContainerReason(nil, []string{"CrashLoopBackOff"}, []string{"CrashLoopBackOff"}); got != "CrashLoopBackOff" {
			t.Fatalf("expected container stuck reason, got %q", got)
		}
		if got := cluster.TestHookStuckContainerReason([]string{"ImagePullBackOff"}, []string{"CrashLoopBackOff"}, []string{"Missing"}); got != "" {
			t.Fatalf("expected filtered stuck reason to be empty, got %q", got)
		}

		// Vault agent-init stuck detection: only a Pending pod with injection
		// enabled that has been starting longer than the grace period is stuck.
		vaultCases := []struct {
			name       string
			phase      string
			inject     bool
			startedAgo time.Duration
			grace      time.Duration
			want       string
		}{
			{name: "phase-running", phase: "Running", inject: true, startedAgo: 10 * time.Minute, grace: time.Minute, want: ""},
			{name: "inject-false", phase: "Pending", inject: false, startedAgo: 10 * time.Minute, grace: time.Minute, want: ""},
			{name: "within-grace", phase: "Pending", inject: true, startedAgo: 30 * time.Second, grace: time.Minute, want: ""},
			{name: "stuck", phase: "Pending", inject: true, startedAgo: 10 * time.Minute, grace: time.Minute, want: "VaultInitStuck"},
		}
		for _, tc := range vaultCases {
			got := cluster.TestHookStuckVaultInitReason(tc.phase, tc.inject, tc.startedAgo, tc.grace)
			if got != tc.want {
				t.Fatalf("%s: stuckVaultInitReason=%q want %q", tc.name, got, tc.want)
			}
		}

		// Short waits/polls keep the convergence loops fast; the ignore lists
		// below are exercised by the fixture payloads that follow.
		cfg := lifecycleConfig(t)
		cfg.Startup.WorkloadConvergenceWaitSeconds = 1
		cfg.Startup.WorkloadConvergencePollSeconds = 1
		cfg.Startup.StuckPodGraceSeconds = 1
		cfg.Startup.IgnoreWorkloadNamespaces = []string{"ignored-ns"}
		cfg.Startup.IgnoreUnavailableNodes = []string{"titan-22"}
		cfg.Startup.IgnoreWorkloads = []string{"monitoring/deployment/ignore-me"}
		cfg.Startup.IgnoreFluxKustomizations = []string{"ignored/flux-system"}

		// readyRun serves a controller listing where every non-ignored workload
		// is either ready or pinned to an ignored node, so convergence passes.
		readyRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"):
				return `{"items":[
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1,"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"titan-23"}}}},"status":{"readyReplicas":1}},
{"kind":"DaemonSet","metadata":{"namespace":"monitoring","name":"node-exporter"},"spec":{"template":{"spec":{"nodeName":"titan-23"}}},"status":{"desiredNumberScheduled":2,"numberReady":1}},
{"kind":"Deployment","metadata":{"namespace":"ignored-ns","name":"skip"},"spec":{"replicas":1},"status":{"readyReplicas":0}},
{"kind":"Deployment","metadata":{"namespace":"flux-system","name":"ignored"},"spec":{"replicas":1},"status":{"readyReplicas":0}},
{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"ignore-me"},"spec":{"replicas":1},"status":{"readyReplicas":0}}
]}`, nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchReady, _ := newHookOrchestrator(t, cfg, readyRun, readyRun)
		ready, detail, err := orchReady.TestHookWorkloadConvergenceReady(context.Background())
		if err != nil || !ready || !strings.Contains(detail, "controllers ready=") {
			t.Fatalf("expected workload convergence ready path, ready=%v detail=%q err=%v", ready, detail, err)
		}

		// pendingRun keeps one deployment permanently unready so the bounded
		// wait loop times out with the expected error message.
		pendingRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"):
				return `{"items":[{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1},"status":{"readyReplicas":0}}]}`, nil
			case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
				return `{"items":[]}`, nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchPending, _ := newHookOrchestrator(t, cfg, pendingRun, pendingRun)
		if err := orchPending.TestHookWaitForWorkloadConvergence(context.Background()); err == nil || !strings.Contains(err.Error(), "workload convergence not satisfied") {
			t.Fatalf("expected workload convergence timeout, got %v", err)
		}

		// podRun serves a pod listing covering every skip branch of the stuck-pod
		// recycler (missing namespace, ignored namespace/workload/node, unowned,
		// too recent) plus two deletable pods — one whose delete fails ("boom")
		// to prove the recycler stays best-effort.
		podRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
				return `{"items":[
{"metadata":{"namespace":"","name":"missing-ns"}},
{"metadata":{"namespace":"kube-system","name":"ignored","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"ignored"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"c"}]},"status":{"containerStatuses":[{"state":{"waiting":{"reason":"CrashLoopBackOff"}}}]}},
{"metadata":{"namespace":"monitoring","name":"ignore-me","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"ignore-me"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"c"}]},"status":{"containerStatuses":[{"state":{"waiting":{"reason":"CrashLoopBackOff"}}}]}},
{"metadata":{"namespace":"monitoring","name":"node-ignored","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"node-ignored"}]},"spec":{"nodeName":"titan-22","containers":[{"name":"c"}]},"status":{"containerStatuses":[{"state":{"waiting":{"reason":"CrashLoopBackOff"}}}]}},
{"metadata":{"namespace":"monitoring","name":"unowned","creationTimestamp":"2020-01-01T00:00:00Z"},"spec":{"nodeName":"titan-23","containers":[{"name":"c"}]},"status":{"containerStatuses":[{"state":{"waiting":{"reason":"CrashLoopBackOff"}}}]}},
{"metadata":{"namespace":"monitoring","name":"recent","creationTimestamp":"` + time.Now().UTC().Format(time.RFC3339) + `","ownerReferences":[{"kind":"ReplicaSet","name":"recent"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"c"}]},"status":{"containerStatuses":[{"state":{"waiting":{"reason":"CrashLoopBackOff"}}}]}},
{"metadata":{"namespace":"monitoring","name":"grafana-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"ReplicaSet","name":"grafana"}]},"spec":{"nodeName":"titan-23","containers":[{"name":"c"}]},"status":{"containerStatuses":[{"state":{"waiting":{"reason":"CrashLoopBackOff"}}}]}},
{"metadata":{"namespace":"vault","name":"vault-0","creationTimestamp":"2020-01-01T00:00:00Z","ownerReferences":[{"kind":"StatefulSet","name":"vault"}],"annotations":{"vault.hashicorp.com/agent-inject":"true"}},"spec":{"nodeName":"titan-23","containers":[{"name":"vault"}]},"status":{"phase":"Pending","initContainerStatuses":[{"name":"vault-agent-init","state":{"running":{"startedAt":"2020-01-01T00:00:00Z"}}}]}}
]}`, nil
			case name == "kubectl" && strings.Contains(command, "delete pod grafana-0"):
				return "", errors.New("boom")
			case name == "kubectl" && strings.Contains(command, "delete pod vault-0"):
				return "", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchPods, _ := newHookOrchestrator(t, cfg, podRun, podRun)
		if err := orchPods.TestHookRecycleStuckControllerPods(context.Background()); err != nil {
			t.Fatalf("expected recycleStuckControllerPods best-effort success, got %v", err)
		}
	})

	t.Run("scaling-helpers-and-recovery", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.ExcludedNamespaces = []string{"flux-system", "vault"}

		// Explicitly-configured workers are returned as an independent copy:
		// mutating the returned slice must not leak back into cfg.Workers.
		explicit := []string{"worker-a", "worker-b"}
		cfg.Workers = append([]string{}, explicit...)
		orchWorkers, _ := newHookOrchestrator(t, cfg, nil, nil)
		gotWorkers, err := orchWorkers.TestHookEffectiveWorkers(context.Background())
		if err != nil || len(gotWorkers) != len(explicit) || gotWorkers[0] != explicit[0] || gotWorkers[1] != explicit[1] {
			t.Fatalf("expected explicit workers copy, got %v err=%v", gotWorkers, err)
		}
		gotWorkers[0] = "mutated"
		if cfg.Workers[0] != explicit[0] {
			t.Fatalf("expected effectiveWorkers to return a copy")
		}

		// With no explicit workers and node discovery failing, effectiveWorkers
		// falls back to the sorted SSH host map, minus non-worker nodes.
		cfg.Workers = nil
		cfg.SSHManagedNodes = nil
		cfg.SSHNodeHosts = map[string]string{
			"worker-c": "worker-c",
			"worker-b": "worker-b",
			"worker-a": "worker-a",
			"titan-db": "titan-db",
		}
		discoverErrRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get nodes -o custom-columns=") {
				return "", errors.New("nodes unavailable")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchFallback, _ := newHookOrchestrator(t, cfg, discoverErrRun, discoverErrRun)
		fallbackWorkers, err := orchFallback.TestHookEffectiveWorkers(context.Background())
		if err != nil || strings.Join(fallbackWorkers, ",") != "worker-a,worker-b,worker-c" {
			t.Fatalf("expected fallback workers, got %v err=%v", fallbackWorkers, err)
		}

		// Node discovery keeps non-control-plane rows and drops malformed lines.
		discoverRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "get nodes -o custom-columns="):
				return "titan-db <none> <none>\nworker-b <none> <none>\nworker-c control-plane <none>\nbadline\n", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchDiscover, _ := newHookOrchestrator(t, lifecycleConfig(t), discoverRun, discoverRun)
		discovered, err := orchDiscover.TestHookDiscoverWorkers(context.Background())
		if err != nil || strings.Join(discovered, ",") != "titan-db,worker-b" {
			t.Fatalf("expected discovered workers, got %v err=%v", discovered, err)
		}

		// Flux suspend/resume patching is best-effort: individual patch failures
		// ("patch failed") must not fail the overall call.
		cfgPatch := lifecycleConfig(t)
		patchRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "-n flux-system get kustomizations.kustomize.toolkit.fluxcd.io -o jsonpath="):
				return "services\nignored\n", nil
			case name == "kubectl" && strings.Contains(command, "get helmreleases.helm.toolkit.fluxcd.io -A -o jsonpath="):
				return "monitoring/grafana\nmonitoring/failing\n", nil
			case name == "kubectl" && strings.Contains(command, "-n flux-system patch kustomization services"):
				return "", errors.New("patch failed")
			case name == "kubectl" && strings.Contains(command, "-n flux-system patch kustomization ignored"):
				return "", nil
			case name == "kubectl" && strings.Contains(command, "-n monitoring patch helmrelease grafana"):
				return "", nil
			case name == "kubectl" && strings.Contains(command, "-n monitoring patch helmrelease failing"):
				return "", errors.New("patch failed")
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchPatch, _ := newHookOrchestrator(t, cfgPatch, patchRun, patchRun)
		if err := orchPatch.TestHookPatchFluxSuspendAll(context.Background(), true); err != nil {
			t.Fatalf("expected patchFluxSuspendAll best-effort success, got %v", err)
		}

		// listRun enumerates deployments/statefulsets; rows with a bogus or
		// zero replica count are dropped by the lister.
		listRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "get deployment -A -o jsonpath="):
				return "monitoring\tgrafana\t2\nflux-system\tsource-controller\t1\nmonitoring\tbad\tbogus\nmonitoring\tempty\t0\n", nil
			case name == "kubectl" && strings.Contains(command, "get statefulset -A -o jsonpath="):
				return "monitoring\tvictoria-metrics-single-server\t3\nvault\tvault\t1\n", nil
			case name == "kubectl" && strings.Contains(command, "scale deployment grafana --replicas=0"):
				return "", errors.New("scale failed")
			case name == "kubectl" && strings.Contains(command, "scale deployment grafana --replicas=1"):
				return "", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchList, _ := newHookOrchestrator(t, cfgPatch, listRun, listRun)
		entries, err := orchList.TestHookListScalableWorkloads(context.Background())
		if err != nil || len(entries) != 4 {
			t.Fatalf("expected four scalable workloads, got %v err=%v", entries, err)
		}
		// Single entry succeeds; the remaining entries include the deployment
		// whose scale-to-0 fails, so errors must be aggregated; an empty list
		// is a no-op success.
		if err := orchList.TestHookScaleWorkloads(context.Background(), entries[:1], 0, 0); err != nil {
			t.Fatalf("expected single-entry scaleWorkloads success, got %v", err)
		}
		if err := orchList.TestHookScaleWorkloads(context.Background(), entries[1:], 0, 0); err == nil || !strings.Contains(err.Error(), "scaling had") {
			t.Fatalf("expected scaleWorkloads error aggregation, got %v", err)
		}
		if err := orchList.TestHookScaleWorkloads(context.Background(), nil, 0, 1); err != nil {
			t.Fatalf("expected empty scaleWorkloads success, got %v", err)
		}

		// Snapshot round-trip on a fresh state dir: writing empty entries is
		// fine, and reading a missing snapshot yields nil,nil (not an error).
		orchSnapshotWrite, _ := newHookOrchestrator(t, lifecycleConfig(t), nil, nil)
		if err := orchSnapshotWrite.TestHookWriteScaledWorkloadSnapshot(nil); err != nil {
			t.Fatalf("expected snapshot write with empty entries, got %v", err)
		}
		orchSnapshotRead, _ := newHookOrchestrator(t, lifecycleConfig(t), nil, nil)
		if snapshot, err := orchSnapshotRead.TestHookReadScaledWorkloadSnapshot(); err != nil || snapshot != nil {
			t.Fatalf("expected missing snapshot to read as nil,nil, got snapshot=%v err=%v", snapshot, err)
		}

		// Same round-trip through a manually constructed dry-run orchestrator.
		manualCfg := lifecycleConfig(t)
		manualCfg.State.Dir = filepath.Join(t.TempDir(), "state")
		manualOrch := cluster.New(manualCfg, &execx.Runner{DryRun: true}, state.New(manualCfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
		if err := manualOrch.TestHookWriteScaledWorkloadSnapshot(nil); err != nil {
			t.Fatalf("expected dry-run snapshot write success, got %v", err)
		}
		if snapshot, err := manualOrch.TestHookReadScaledWorkloadSnapshot(); err != nil || snapshot != nil {
			t.Fatalf("expected dry-run snapshot read to return nil,nil, got snapshot=%v err=%v", snapshot, err)
		}

		// restoreScaledApps consumes a pre-seeded snapshot, scales the recorded
		// workload back up (listRun accepts --replicas=1), and removes the file.
		restorePath := filepath.Join(t.TempDir(), "state", "scaled-workloads.json")
		if err := os.MkdirAll(filepath.Dir(restorePath), 0o755); err != nil {
			t.Fatalf("mkdir restore path: %v", err)
		}
		valid := `{"generated_at":"2026-01-01T00:00:00Z","entries":[{"namespace":"monitoring","kind":"deployment","name":"grafana","replicas":1}]}`
		if err := os.WriteFile(restorePath, []byte(valid), 0o600); err != nil {
			t.Fatalf("write restore snapshot: %v", err)
		}
		restoreCfg := lifecycleConfig(t)
		restoreCfg.State.Dir = filepath.Dir(restorePath)
		restoreOrch, _ := newHookOrchestrator(t, restoreCfg, listRun, listRun)
		if err := restoreOrch.TestHookRestoreScaledApps(context.Background()); err != nil {
			t.Fatalf("expected restoreScaledApps success, got %v", err)
		}
		if _, err := os.Stat(restorePath); !os.IsNotExist(err) {
			t.Fatalf("expected restore snapshot to be removed, stat err=%v", err)
		}

		// scaleDownApps must fail when the target workload never converges and
		// the scale-to-0 command keeps erroring.
		pendingCfg := lifecycleConfig(t)
		pendingCfg.Startup.WorkloadConvergenceWaitSeconds = 1
		pendingCfg.Startup.WorkloadConvergencePollSeconds = 1
		pendingRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "get deploy,statefulset,daemonset -A -o json"):
				return `{"items":[{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1},"status":{"readyReplicas":0}}]}`, nil
			case name == "kubectl" && strings.Contains(command, "get pods -A -o json"):
				return `{"items":[]}`, nil
			case name == "kubectl" && strings.Contains(command, "scale deployment grafana --replicas=0"):
				return "", errors.New("scale failed")
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchPendingScale, _ := newHookOrchestrator(t, pendingCfg, pendingRun, pendingRun)
		if err := orchPendingScale.TestHookScaleDownApps(context.Background()); err == nil {
			t.Fatalf("expected scaleDownApps to fail when workloads stay pending")
		}
	})

	t.Run("critical-vault-and-flux-health-helpers", func(t *testing.T) {
		fluxCfg := lifecycleConfig(t)
		fluxCfg.Startup.FluxHealthWaitSeconds = 1
		fluxCfg.Startup.FluxHealthPollSeconds = 1
		fluxCfg.Startup.IgnoreFluxKustomizations = []string{"flux-system/ignored"}

		// fluxRun serves kustomizations in mixed states (not-ready, reconciling,
		// ignored, suspended, ready) plus failed flux-managed jobs; job-b's
		// delete fails to keep the heal path best-effort.
		fluxRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
				return `{"items":[
{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false,"timeout":"40s"},"status":{"conditions":[{"type":"Ready","status":"False","message":"syncing"}]}},
{"metadata":{"namespace":"flux-system","name":"no-condition"},"spec":{"suspend":false,"timeout":"10s"},"status":{"conditions":[{"type":"Reconciling","status":"True","message":"still"}]}},
{"metadata":{"namespace":"flux-system","name":"ignored"},"spec":{"suspend":false,"timeout":"5s"},"status":{"conditions":[{"type":"Ready","status":"False","message":"ignored"}]}},
{"metadata":{"namespace":"flux-system","name":"suspended"},"spec":{"suspend":true,"timeout":"2m"},"status":{"conditions":[{"type":"Ready","status":"False","message":"skip"}]}},
{"metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"suspend":false,"timeout":"5m"},"status":{"conditions":[{"type":"Ready","status":"True","message":"ok"}]}}
]}`, nil
			case name == "kubectl" && strings.Contains(command, "get jobs -A -o json"):
				return `{"items":[
{"metadata":{"namespace":"flux-system","name":"job-a","labels":{"kustomize.toolkit.fluxcd.io/name":"services"}},"status":{"failed":1,"conditions":[{"type":"Failed","status":"True"}]}},
{"metadata":{"namespace":"flux-system","name":"job-b","labels":{"kustomize.toolkit.fluxcd.io/name":"services"}},"status":{"failed":1,"conditions":[{"type":"Failed","status":"True"}]}},
{"metadata":{"namespace":"flux-system","name":"cronjob-owned","ownerReferences":[{"kind":"CronJob","name":"cron"}]},"status":{"failed":1,"conditions":[{"type":"Failed","status":"True"}]}},
{"metadata":{"namespace":"flux-system","name":"succeeded"},"status":{"succeeded":1,"failed":1,"conditions":[{"type":"Failed","status":"True"}]}}
]}`, nil
			case name == "kubectl" && strings.Contains(command, "delete job job-a"):
				return "", nil
			case name == "kubectl" && strings.Contains(command, "delete job job-b"):
				return "", errors.New("boom")
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchFlux, _ := newHookOrchestrator(t, fluxCfg, fluxRun, fluxRun)
		// The largest kustomization timeout (5m) should extend the base wait.
		wait, reason, err := orchFlux.TestHookAdaptiveFluxHealthWait(context.Background(), 30*time.Second)
		if err != nil || wait <= 30*time.Second || !strings.Contains(reason, "max flux timeout") {
			t.Fatalf("expected adaptive flux wait extension, wait=%s reason=%q err=%v", wait, reason, err)
		}
		// With no kustomization declaring a timeout, the wait falls back to a
		// generous default.
		noTimeoutCfg := lifecycleConfig(t)
		noTimeoutRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
				return `{"items":[{"metadata":{"namespace":"flux-system","name":"ready"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"True","message":"ok"}]}}]}`, nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchNoTimeout, _ := newHookOrchestrator(t, noTimeoutCfg, noTimeoutRun, noTimeoutRun)
		if wait, reason, err := orchNoTimeout.TestHookAdaptiveFluxHealthWait(context.Background(), 0); err != nil || wait < 15*time.Minute || !strings.Contains(reason, "no explicit kustomization timeouts found") {
			t.Fatalf("expected adaptive wait fallback, wait=%s reason=%q err=%v", wait, reason, err)
		}
		if ready, detail, err := orchFlux.TestHookFluxHealthReady(context.Background()); err != nil || ready || !strings.Contains(detail, "not ready") {
			t.Fatalf("expected flux health not-ready result, ready=%v detail=%q err=%v", ready, detail, err)
		}
		if ready, detail, err := orchNoTimeout.TestHookFluxHealthReady(context.Background()); err != nil || !ready || !strings.Contains(detail, "all kustomizations ready=") {
			t.Fatalf("expected flux health ready result, ready=%v detail=%q err=%v", ready, detail, err)
		}
		// Pure-helper detectors for the immutable-job heal path.
		if !cluster.TestHookLooksLikeImmutableJobError("Job update failed: field is immutable") {
			t.Fatalf("expected immutable-job detector true")
		}
		if cluster.TestHookLooksLikeImmutableJobError("") {
			t.Fatalf("expected empty immutable-job detail to be false")
		}
		if !cluster.TestHookJobLooksFluxManaged("flux-system", "job-a", map[string]string{"kustomize.toolkit.fluxcd.io/name": "services"}, nil) {
			t.Fatalf("expected label-based flux-managed job")
		}
		if cluster.TestHookJobLooksFluxManaged("flux-system", "cronjob-owned", nil, []string{"CronJob"}) {
			t.Fatalf("expected CronJob-owned job to be non flux-managed")
		}
		if !cluster.TestHookJobFailed(1, 0, []string{"Failed"}, []string{"True"}) {
			t.Fatalf("expected failed job detector to be true")
		}
		if cluster.TestHookJobFailed(0, 1, []string{"Complete"}, []string{"True"}) {
			t.Fatalf("expected succeeded job to be false")
		}
		if healed, err := orchFlux.TestHookHealImmutableFluxJobs(context.Background()); err != nil || !healed {
			t.Fatalf("expected immutable job heal success, healed=%v err=%v", healed, err)
		}

		// Critical-startup-workload error branches: readiness query and scale
		// both fail ("boom"), and one stale vault pod's delete fails.
		critCfg := lifecycleConfig(t)
		critCfg.Startup.VaultUnsealKeyFile = filepath.Join(t.TempDir(), "vault", "unseal.key")
		critCfg.Startup.VaultUnsealBreakglassCommand = "echo breakglass-key"
		critCfg.Startup.VaultUnsealBreakglassTimeout = 1
		critCfg.Startup.WorkloadConvergenceWaitSeconds = 1
		critCfg.Startup.WorkloadConvergencePollSeconds = 1
		critCfg.Startup.StuckPodGraceSeconds = 1
		critCfg.Startup.IgnoreWorkloadNamespaces = []string{"kube-system"}
		critCfg.Startup.IgnoreUnavailableNodes = []string{"titan-22"}
		critCfg.Startup.IgnoreWorkloads = []string{"monitoring/deployment/ignore-me"}

		critRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "-n flux-system get deployment source-controller -o jsonpath={.status.readyReplicas}"):
				return "", errors.New("boom")
			case name == "kubectl" && strings.Contains(command, "-n flux-system scale deployment source-controller --replicas=1"):
				return "", errors.New("boom")
			case name == "kubectl" && strings.Contains(command, "-n vault get pods -o custom-columns="):
				return "vault-0 Pending StatefulSet vault\nvault-1 Unknown StatefulSet vault\nvault-2 Running StatefulSet vault\nvault-other Failed Deployment vault\nbadline\n", nil
			case name == "kubectl" && strings.Contains(command, "delete pod vault-0"):
				return "", nil
			case name == "kubectl" && strings.Contains(command, "delete pod vault-1"):
				return "", errors.New("boom")
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchCrit, _ := newHookOrchestrator(t, critCfg, critRun, critRun)
		if missing, err := orchCrit.TestHookMissingCriticalStartupWorkloads(context.Background()); err == nil || len(missing) != 0 {
			t.Fatalf("expected missingCriticalStartupWorkloads generic error, missing=%v err=%v", missing, err)
		}
		if err := orchCrit.TestHookEnsureCriticalStartupWorkloads(context.Background()); err == nil || !strings.Contains(err.Error(), "scale") {
			t.Fatalf("expected ensureCriticalStartupWorkloads scale error, got %v", err)
		}
		if err := orchCrit.TestHookCleanupStaleCriticalWorkloadPods(context.Background(), "vault", "statefulset", "vault"); err == nil {
			t.Fatalf("expected stale critical workload cleanup error branch")
		}

		// Vault auto-unseal happy path: sealed until the unseal command runs,
		// unseal key sourced from the vault-init secret (base64 "vault-key").
		vaultCfg := lifecycleConfig(t)
		vaultCfg.Startup.VaultUnsealKeyFile = filepath.Join(t.TempDir(), "vault", "unseal.key")
		vaultCfg.Startup.VaultUnsealBreakglassCommand = "echo breakglass-key"
		vaultCfg.Startup.VaultUnsealBreakglassTimeout = 1
		ensureUnsealed := false
		ensureRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
				return "Running", nil
			case name == "kubectl" && strings.Contains(command, "vault status -format=json"):
				if ensureUnsealed {
					return `{"sealed":false}`, nil
				}
				return `{"sealed":true}`, nil
			case name == "kubectl" && strings.Contains(command, "get secret vault-init"):
				return "dmF1bHQta2V5", nil
			case name == "kubectl" && strings.Contains(command, "vault operator unseal"):
				ensureUnsealed = true
				return "", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchVaultEnsure, _ := newHookOrchestrator(t, vaultCfg, ensureRun, ensureRun)
		if err := orchVaultEnsure.TestHookEnsureVaultUnsealed(context.Background()); err != nil {
			t.Fatalf("expected vault auto-unseal success path, got %v", err)
		}
		if sealed, err := orchVaultEnsure.TestHookVaultSealed(context.Background()); err != nil || sealed {
			t.Fatalf("expected vault sealed helper false after unseal, sealed=%v err=%v", sealed, err)
		}

		// waitRun models a vault that becomes ready only after the unseal
		// command has run (waitReady flips alongside waitUnsealed).
		waitReady := false
		waitUnsealed := false
		waitRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "-n vault get pod vault-0 -o jsonpath={.status.phase}"):
				return "Running", nil
			case name == "kubectl" && strings.Contains(command, "vault status -format=json"):
				if waitUnsealed {
					return `{"sealed":false}`, nil
				}
				return `{"sealed":true}`, nil
			case name == "kubectl" && strings.Contains(command, "get secret vault-init"):
				return "dmF1bHQta2V5", nil
			case name == "kubectl" && strings.Contains(command, "vault operator unseal"):
				waitUnsealed = true
				waitReady = true
				return "", nil
			case name == "kubectl" && strings.Contains(command, "get statefulset vault -o jsonpath={.status.readyReplicas}"):
				if waitReady {
					return "1", nil
				}
				return "0", nil
			case name == "kubectl" && strings.Contains(command, "get deployment -A -o jsonpath="):
				return "monitoring\tgrafana\t1\n", nil
			case name == "kubectl" && strings.Contains(command, "get statefulset -A -o jsonpath="):
				return "monitoring\tvictoria-metrics-single-server\t1\n", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchVaultWait, _ := newHookOrchestrator(t, vaultCfg, waitRun, waitRun)
		if err := orchVaultWait.TestHookWaitVaultReady(context.Background(), "vault", "statefulset", "vault"); err != nil {
			t.Fatalf("expected vault wait success path, got %v", err)
		}
		if err := orchVaultWait.TestHookWaitWorkloadReady(context.Background(), "monitoring", "deployment", "grafana"); err != nil {
			t.Fatalf("expected generic workload wait success, got %v", err)
		}
		// Unseal key file round-trip through the configured key path.
		if err := orchVaultWait.TestHookWriteVaultUnsealKeyFile("cached-key"); err != nil {
			t.Fatalf("expected vault key file write success, got %v", err)
		}
		if got, err := orchVaultWait.TestHookReadVaultUnsealKeyFile(); err != nil || got != "cached-key" {
			t.Fatalf("expected vault key file read success, got %q err=%v", got, err)
		}

		// Pointing the key file under a regular file makes MkdirAll fail, so
		// the write must surface an error.
		blockedDir := t.TempDir()
		blockedFile := filepath.Join(blockedDir, "blocked")
		if err := os.WriteFile(blockedFile, []byte("x"), 0o600); err != nil {
			t.Fatalf("write blocked file: %v", err)
		}
		blockedCfg := lifecycleConfig(t)
		blockedCfg.Startup.VaultUnsealKeyFile = filepath.Join(blockedFile, "vault.key")
		blockedOrch, _ := newHookOrchestrator(t, blockedCfg, nil, nil)
		if err := blockedOrch.TestHookWriteVaultUnsealKeyFile("x"); err == nil {
			t.Fatalf("expected vault key dir error")
		}
	})
}
|