package orchestrator

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"strings"
	"testing"
	"time"
)

// TestHookStorageFailureBranches exercises the failure branches of the storage
// readiness check and its wait loop.
// Why: validates storage readiness edge paths so startup does not mark success
// while Longhorn/PVC dependencies are still degraded.
func TestHookStorageFailureBranches(t *testing.T) {
	t.Run("storage-ready-branch-matrix", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.StorageMinReadyNodes = 2
		cfg.Startup.StorageCriticalPVCs = []string{}

		// Branch: the Longhorn node query itself fails -> hard error.
		queryErrRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "nodes.longhorn.io") {
				return "", fmt.Errorf("query failed")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchQueryErr, _ := newHookOrchestrator(t, cfg, queryErrRun, queryErrRun)
		if _, _, err := orchQueryErr.TestHookStorageReady(context.Background()); err == nil {
			t.Fatalf("expected longhorn query error branch")
		}

		// Branch: only one ready+schedulable node while two are required.
		insufficientRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "nodes.longhorn.io") {
				return "titan-23:True:True\n", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchInsufficient, _ := newHookOrchestrator(t, cfg, insufficientRun, insufficientRun)
		ok, reason, err := orchInsufficient.TestHookStorageReady(context.Background())
		if err != nil || ok || !strings.Contains(reason, "longhorn ready+sched nodes") {
			t.Fatalf("expected insufficient longhorn readiness, got ok=%v reason=%q err=%v", ok, reason, err)
		}

		// Branch: malformed critical-PVC entry (no namespace separator) -> error.
		invalidPVC := cfg
		invalidPVC.Startup.StorageCriticalPVCs = []string{"invalid"}
		readyNodesRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "nodes.longhorn.io") {
				return "titan-23:True:True\ntitan-24:True:True\n", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchInvalidPVC, _ := newHookOrchestrator(t, invalidPVC, readyNodesRun, readyNodesRun)
		if _, _, err := orchInvalidPVC.TestHookStorageReady(context.Background()); err == nil {
			t.Fatalf("expected invalid pvc entry error")
		}

		// Branch: critical PVC does not exist -> not ready, detail carries "not found".
		notFoundPVC := cfg
		notFoundPVC.Startup.StorageCriticalPVCs = []string{"monitoring/grafana-data"}
		notFoundRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "nodes.longhorn.io"):
				return "titan-23:True:True\ntitan-24:True:True\n", nil
			case name == "kubectl" && strings.Contains(command, "-n monitoring get pvc grafana-data"):
				return "", fmt.Errorf("Error from server (NotFound): persistentvolumeclaims \"grafana-data\" not found")
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchNotFoundPVC, _ := newHookOrchestrator(t, notFoundPVC, notFoundRun, notFoundRun)
		ok, reason, err = orchNotFoundPVC.TestHookStorageReady(context.Background())
		if err != nil || ok || !strings.Contains(reason, "not found") {
			t.Fatalf("expected pvc-not-found readiness detail, got ok=%v reason=%q err=%v", ok, reason, err)
		}

		// Branch: PVC exists but is not Bound -> not ready, detail carries the phase.
		notBoundRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "nodes.longhorn.io"):
				return "titan-23:True:True\ntitan-24:True:True\n", nil
			case name == "kubectl" && strings.Contains(command, "-n monitoring get pvc grafana-data"):
				return "Pending", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchNotBound, _ := newHookOrchestrator(t, notFoundPVC, notBoundRun, notBoundRun)
		ok, reason, err = orchNotBound.TestHookStorageReady(context.Background())
		if err != nil || ok || !strings.Contains(reason, "phase=Pending") {
			t.Fatalf("expected pvc non-bound detail, got ok=%v reason=%q err=%v", ok, reason, err)
		}
	})

	t.Run("wait-for-storage-ready-timeout-and-cancel", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.StorageReadyWaitSeconds = 1
		cfg.Startup.StorageReadyPollSeconds = 1
		cfg.Startup.StorageMinReadyNodes = 3

		// Runner that never reaches the required node count, so the wait must fail.
		stuckRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "nodes.longhorn.io") {
				return "titan-23:True:True\n", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchTimeout, _ := newHookOrchestrator(t, cfg, stuckRun, stuckRun)
		err := orchTimeout.TestHookWaitForStorageReady(context.Background())
		if err == nil || !strings.Contains(err.Error(), "storage readiness not satisfied") {
			t.Fatalf("expected storage wait timeout, got %v", err)
		}

		// A pre-canceled context must surface context.Canceled, not a timeout.
		cfg.Startup.StorageReadyWaitSeconds = 30
		orchCanceled, _ := newHookOrchestrator(t, cfg, stuckRun, stuckRun)
		cancelCtx, cancel := context.WithCancel(context.Background())
		cancel()
		err = orchCanceled.TestHookWaitForStorageReady(cancelCtx)
		if !errors.Is(err, context.Canceled) {
			t.Fatalf("expected context canceled while waiting for storage, got %v", err)
		}
	})
}

// TestHookCriticalEndpointFailureBranches exercises the failure branches of the
// critical-service-endpoint readiness check and its wait loop.
// Why: ensures endpoint checklist behavior remains strict when services exist
// but backends are empty, missing, or recovering.
func TestHookCriticalEndpointFailureBranches(t *testing.T) {
	t.Run("critical-endpoint-ready-matrix", func(t *testing.T) {
		cfg := lifecycleConfig(t)

		// Branch: no endpoints configured -> trivially ready with a non-empty detail.
		cfg.Startup.CriticalServiceEndpoints = nil
		orchNone, _ := newHookOrchestrator(t, cfg, nil, nil)
		ok, detail, ns, svc, err := orchNone.TestHookCriticalServiceEndpointsReady(context.Background())
		if err != nil || !ok || detail == "" || ns != "" || svc != "" {
			t.Fatalf("expected no-config success branch, got ok=%v detail=%q ns=%q svc=%q err=%v", ok, detail, ns, svc, err)
		}

		// Branch: malformed endpoint entry (no namespace separator) -> error.
		cfg.Startup.CriticalServiceEndpoints = []string{"invalid"}
		orchInvalid, _ := newHookOrchestrator(t, cfg, nil, nil)
		if _, _, _, _, err := orchInvalid.TestHookCriticalServiceEndpointsReady(context.Background()); err == nil {
			t.Fatalf("expected invalid endpoint entry error")
		}

		// Branch: endpoints object missing -> not ready, detail carries "not found".
		cfg.Startup.CriticalServiceEndpoints = []string{"monitoring/grafana"}
		notFoundRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get endpoints grafana") {
				return "", fmt.Errorf("Error from server (NotFound): endpoints \"grafana\" not found")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchNotFound, _ := newHookOrchestrator(t, cfg, notFoundRun, notFoundRun)
		ok, detail, ns, svc, err = orchNotFound.TestHookCriticalServiceEndpointsReady(context.Background())
		if err != nil || ok || ns != "monitoring" || svc != "grafana" || !strings.Contains(detail, "not found") {
			t.Fatalf("expected endpoint not-found detail, got ok=%v detail=%q ns=%q svc=%q err=%v", ok, detail, ns, svc, err)
		}

		// Branch: endpoints object exists but reports zero backends -> not ready.
		zeroRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get endpoints grafana") {
				return "", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchZero, _ := newHookOrchestrator(t, cfg, zeroRun, zeroRun)
		ok, detail, ns, svc, err = orchZero.TestHookCriticalServiceEndpointsReady(context.Background())
		if err != nil || ok || ns != "monitoring" || svc != "grafana" || !strings.Contains(detail, "endpoints=0") {
			t.Fatalf("expected endpoint-zero detail, got ok=%v detail=%q ns=%q svc=%q err=%v", ok, detail, ns, svc, err)
		}
	})

	t.Run("critical-endpoint-wait-timeout-with-autoheal", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.CriticalServiceEndpointWaitSec = 1
		cfg.Startup.CriticalServiceEndpointPollSec = 1
		cfg.Startup.CriticalServiceEndpoints = []string{"monitoring/grafana"}

		// Endpoints stay empty while the auto-heal path runs: the deployment
		// scale/rollout is reported missing, the statefulset recovers, yet the
		// wait must still time out because endpoints never appear.
		autoHealRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "get endpoints grafana"):
				return "", nil
			case name == "kubectl" && strings.Contains(command, "-n monitoring scale deployment grafana --replicas=1"):
				return "", nil
			case name == "kubectl" && strings.Contains(command, "-n monitoring rollout status deployment/grafana"):
				return "", fmt.Errorf("Error from server (NotFound): deployment \"grafana\" not found")
			case name == "kubectl" && strings.Contains(command, "-n monitoring scale statefulset grafana --replicas=1"):
				return "", nil
			case name == "kubectl" && strings.Contains(command, "-n monitoring rollout status statefulset/grafana"):
				return "ready", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, autoHealRun, autoHealRun)
		err := orch.TestHookWaitForCriticalServiceEndpoints(context.Background())
		if err == nil || !strings.Contains(err.Error(), "critical service endpoint checklist not satisfied") {
			t.Fatalf("expected critical-endpoint wait timeout, got %v", err)
		}
	})
}

// TestHookFluxHealthFailureBranches exercises the failure branches of the Flux
// health checks.
// Why: covers adaptive wait, convergence parsing, and immutable job self-heal
// error paths.
func TestHookFluxHealthFailureBranches(t *testing.T) {
	t.Run("adaptive-flux-wait-branches", func(t *testing.T) {
		cfg := lifecycleConfig(t)

		// Branch: kustomization list query fails -> adaptive wait errors out.
		queryErrRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json") {
				return "", fmt.Errorf("query failed")
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchQueryErr, _ := newHookOrchestrator(t, cfg, queryErrRun, queryErrRun)
		if _, _, err := orchQueryErr.TestHookAdaptiveFluxHealthWait(context.Background(), 10*time.Minute); err == nil {
			t.Fatalf("expected adaptive wait query error")
		}

		// Branch: unparseable kustomization JSON -> decode error.
		decodeErrRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json") {
				return "{bad-json", nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchDecodeErr, _ := newHookOrchestrator(t, cfg, decodeErrRun, decodeErrRun)
		if _, _, err := orchDecodeErr.TestHookAdaptiveFluxHealthWait(context.Background(), 10*time.Minute); err == nil {
			t.Fatalf("expected adaptive wait decode error")
		}

		// Branch: no kustomization declares a timeout -> the provided default wait is kept.
		noTimeoutRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			if name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json") {
				return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false,"timeout":""}}]}`, nil
			}
			return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
		}
		orchNoTimeout, _ := newHookOrchestrator(t, cfg, noTimeoutRun, noTimeoutRun)
		wait, reason, err := orchNoTimeout.TestHookAdaptiveFluxHealthWait(context.Background(), 10*time.Minute)
		if err != nil || wait != 10*time.Minute || !strings.Contains(reason, "no explicit kustomization timeouts") {
			t.Fatalf("expected no-timeout branch, got wait=%s reason=%q err=%v", wait, reason, err)
		}
	})

	t.Run("flux-health-ready-and-immutable-job-heal-branches", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.IgnoreFluxKustomizations = []string{"infra/ignored"}
		// Fixture: an ignored kustomization plus one failing on an immutable-job
		// error; the latter drives both the not-ready detail and the self-heal.
		fluxItems := map[string]any{
			"items": []map[string]any{
				{
					"metadata": map[string]any{"namespace": "infra", "name": "ignored"},
					"spec":     map[string]any{"suspend": false, "timeout": "30s"},
					"status": map[string]any{"conditions": []map[string]any{
						{"type": "Ready", "status": "False", "reason": "Progressing", "message": "ignore-me"},
					}},
				},
				{
					"metadata": map[string]any{"namespace": "flux-system", "name": "services"},
					"spec":     map[string]any{"suspend": false, "timeout": "30m"},
					"status": map[string]any{"conditions": []map[string]any{
						{"type": "Ready", "status": "False", "reason": "InstallFailed", "message": "job field is immutable"},
					}},
				},
			},
		}
		fluxJSON, err := json.Marshal(fluxItems)
		if err != nil {
			t.Fatalf("marshal flux fixture: %v", err)
		}
		jobsJSON := `{"items":[{"metadata":{"namespace":"flux-system","name":"reconcile-services","labels":{"kustomize.toolkit.fluxcd.io/name":"services"},"ownerReferences":[]},"status":{"failed":1,"succeeded":0,"conditions":[{"type":"Failed","status":"True"}]}}]}`
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
				return string(fluxJSON), nil
			case name == "kubectl" && strings.Contains(command, "get jobs -A -o json"):
				return jobsJSON, nil
			case name == "kubectl" && strings.Contains(command, "-n flux-system delete job reconcile-services --wait=false"):
				return "", nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orch, _ := newHookOrchestrator(t, cfg, run, run)
		ok, detail, err := orch.TestHookFluxHealthReady(context.Background())
		if err != nil || ok || !strings.Contains(strings.ToLower(detail), "immutable") {
			t.Fatalf("expected flux not-ready detail with immutable signal, got ok=%v detail=%q err=%v", ok, detail, err)
		}
		healed, err := orch.TestHookHealImmutableFluxJobs(context.Background())
		if err != nil || !healed {
			t.Fatalf("expected immutable-job cleanup success, got healed=%v err=%v", healed, err)
		}
	})

	t.Run("wait-for-flux-health-timeout-and-cancel", func(t *testing.T) {
		cfg := lifecycleConfig(t)
		cfg.Startup.FluxHealthWaitSeconds = 1
		cfg.Startup.FluxHealthPollSeconds = 1

		// Kustomization stays unhealthy and no jobs exist, so waits cannot succeed.
		run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
			command := name + " " + strings.Join(args, " ")
			switch {
			case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
				return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"False","reason":"InstallFailed","message":"job field is immutable"}]}}]}`, nil
			case name == "kubectl" && strings.Contains(command, "get jobs -A -o json"):
				return `{"items":[]}`, nil
			default:
				return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
			}
		}
		orchTimeout, _ := newHookOrchestrator(t, cfg, run, run)
		err := orchTimeout.TestHookWaitForFluxHealth(context.Background())
		if err == nil || !strings.Contains(err.Error(), "flux convergence not satisfied") {
			t.Fatalf("expected flux health timeout, got %v", err)
		}

		// A pre-canceled context must surface context.Canceled, not a timeout.
		cfg.Startup.FluxHealthWaitSeconds = 30
		orchCanceled, _ := newHookOrchestrator(t, cfg, run, run)
		cancelCtx, cancel := context.WithCancel(context.Background())
		cancel()
		err = orchCanceled.TestHookWaitForFluxHealth(cancelCtx)
		if !errors.Is(err, context.Canceled) {
			t.Fatalf("expected context canceled from flux wait, got %v", err)
		}
	})
}