339 lines
16 KiB
Go
339 lines
16 KiB
Go
|
|
package orchestrator
|
||
|
|
|
||
|
|
import (
|
||
|
|
"context"
|
||
|
|
"encoding/json"
|
||
|
|
"errors"
|
||
|
|
"fmt"
|
||
|
|
"strings"
|
||
|
|
"testing"
|
||
|
|
"time"
|
||
|
|
)
|
||
|
|
|
||
|
|
// TestHookStorageFailureBranches runs one orchestration or CLI step.
|
||
|
|
// Signature: TestHookStorageFailureBranches(t *testing.T).
|
||
|
|
// Why: validates storage readiness edge paths so startup does not mark success while
|
||
|
|
// Longhorn/PVC dependencies are still degraded.
|
||
|
|
func TestHookStorageFailureBranches(t *testing.T) {
|
||
|
|
t.Run("storage-ready-branch-matrix", func(t *testing.T) {
|
||
|
|
cfg := lifecycleConfig(t)
|
||
|
|
cfg.Startup.StorageMinReadyNodes = 2
|
||
|
|
cfg.Startup.StorageCriticalPVCs = []string{}
|
||
|
|
|
||
|
|
queryErrRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||
|
|
command := name + " " + strings.Join(args, " ")
|
||
|
|
if name == "kubectl" && strings.Contains(command, "nodes.longhorn.io") {
|
||
|
|
return "", fmt.Errorf("query failed")
|
||
|
|
}
|
||
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||
|
|
}
|
||
|
|
orchQueryErr, _ := newHookOrchestrator(t, cfg, queryErrRun, queryErrRun)
|
||
|
|
if _, _, err := orchQueryErr.TestHookStorageReady(context.Background()); err == nil {
|
||
|
|
t.Fatalf("expected longhorn query error branch")
|
||
|
|
}
|
||
|
|
|
||
|
|
insufficientRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||
|
|
command := name + " " + strings.Join(args, " ")
|
||
|
|
if name == "kubectl" && strings.Contains(command, "nodes.longhorn.io") {
|
||
|
|
return "titan-23:True:True\n", nil
|
||
|
|
}
|
||
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||
|
|
}
|
||
|
|
orchInsufficient, _ := newHookOrchestrator(t, cfg, insufficientRun, insufficientRun)
|
||
|
|
ok, reason, err := orchInsufficient.TestHookStorageReady(context.Background())
|
||
|
|
if err != nil || ok || !strings.Contains(reason, "longhorn ready+sched nodes") {
|
||
|
|
t.Fatalf("expected insufficient longhorn readiness, got ok=%v reason=%q err=%v", ok, reason, err)
|
||
|
|
}
|
||
|
|
|
||
|
|
invalidPVC := cfg
|
||
|
|
invalidPVC.Startup.StorageCriticalPVCs = []string{"invalid"}
|
||
|
|
readyNodesRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||
|
|
command := name + " " + strings.Join(args, " ")
|
||
|
|
if name == "kubectl" && strings.Contains(command, "nodes.longhorn.io") {
|
||
|
|
return "titan-23:True:True\ntitan-24:True:True\n", nil
|
||
|
|
}
|
||
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||
|
|
}
|
||
|
|
orchInvalidPVC, _ := newHookOrchestrator(t, invalidPVC, readyNodesRun, readyNodesRun)
|
||
|
|
if _, _, err := orchInvalidPVC.TestHookStorageReady(context.Background()); err == nil {
|
||
|
|
t.Fatalf("expected invalid pvc entry error")
|
||
|
|
}
|
||
|
|
|
||
|
|
notFoundPVC := cfg
|
||
|
|
notFoundPVC.Startup.StorageCriticalPVCs = []string{"monitoring/grafana-data"}
|
||
|
|
notFoundRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||
|
|
command := name + " " + strings.Join(args, " ")
|
||
|
|
switch {
|
||
|
|
case name == "kubectl" && strings.Contains(command, "nodes.longhorn.io"):
|
||
|
|
return "titan-23:True:True\ntitan-24:True:True\n", nil
|
||
|
|
case name == "kubectl" && strings.Contains(command, "-n monitoring get pvc grafana-data"):
|
||
|
|
return "", fmt.Errorf("Error from server (NotFound): persistentvolumeclaims \"grafana-data\" not found")
|
||
|
|
default:
|
||
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
orchNotFoundPVC, _ := newHookOrchestrator(t, notFoundPVC, notFoundRun, notFoundRun)
|
||
|
|
ok, reason, err = orchNotFoundPVC.TestHookStorageReady(context.Background())
|
||
|
|
if err != nil || ok || !strings.Contains(reason, "not found") {
|
||
|
|
t.Fatalf("expected pvc-not-found readiness detail, got ok=%v reason=%q err=%v", ok, reason, err)
|
||
|
|
}
|
||
|
|
|
||
|
|
notBoundRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||
|
|
command := name + " " + strings.Join(args, " ")
|
||
|
|
switch {
|
||
|
|
case name == "kubectl" && strings.Contains(command, "nodes.longhorn.io"):
|
||
|
|
return "titan-23:True:True\ntitan-24:True:True\n", nil
|
||
|
|
case name == "kubectl" && strings.Contains(command, "-n monitoring get pvc grafana-data"):
|
||
|
|
return "Pending", nil
|
||
|
|
default:
|
||
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
orchNotBound, _ := newHookOrchestrator(t, notFoundPVC, notBoundRun, notBoundRun)
|
||
|
|
ok, reason, err = orchNotBound.TestHookStorageReady(context.Background())
|
||
|
|
if err != nil || ok || !strings.Contains(reason, "phase=Pending") {
|
||
|
|
t.Fatalf("expected pvc non-bound detail, got ok=%v reason=%q err=%v", ok, reason, err)
|
||
|
|
}
|
||
|
|
})
|
||
|
|
|
||
|
|
t.Run("wait-for-storage-ready-timeout-and-cancel", func(t *testing.T) {
|
||
|
|
cfg := lifecycleConfig(t)
|
||
|
|
cfg.Startup.StorageReadyWaitSeconds = 1
|
||
|
|
cfg.Startup.StorageReadyPollSeconds = 1
|
||
|
|
cfg.Startup.StorageMinReadyNodes = 3
|
||
|
|
|
||
|
|
stuckRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||
|
|
command := name + " " + strings.Join(args, " ")
|
||
|
|
if name == "kubectl" && strings.Contains(command, "nodes.longhorn.io") {
|
||
|
|
return "titan-23:True:True\n", nil
|
||
|
|
}
|
||
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||
|
|
}
|
||
|
|
orchTimeout, _ := newHookOrchestrator(t, cfg, stuckRun, stuckRun)
|
||
|
|
err := orchTimeout.TestHookWaitForStorageReady(context.Background())
|
||
|
|
if err == nil || !strings.Contains(err.Error(), "storage readiness not satisfied") {
|
||
|
|
t.Fatalf("expected storage wait timeout, got %v", err)
|
||
|
|
}
|
||
|
|
|
||
|
|
cfg.Startup.StorageReadyWaitSeconds = 30
|
||
|
|
orchCanceled, _ := newHookOrchestrator(t, cfg, stuckRun, stuckRun)
|
||
|
|
cancelCtx, cancel := context.WithCancel(context.Background())
|
||
|
|
cancel()
|
||
|
|
err = orchCanceled.TestHookWaitForStorageReady(cancelCtx)
|
||
|
|
if !errors.Is(err, context.Canceled) {
|
||
|
|
t.Fatalf("expected context canceled while waiting for storage, got %v", err)
|
||
|
|
}
|
||
|
|
})
|
||
|
|
}
|
||
|
|
|
||
|
|
// TestHookCriticalEndpointFailureBranches runs one orchestration or CLI step.
|
||
|
|
// Signature: TestHookCriticalEndpointFailureBranches(t *testing.T).
|
||
|
|
// Why: ensures endpoint checklist behavior remains strict when services exist but
|
||
|
|
// backends are empty, missing, or recovering.
|
||
|
|
func TestHookCriticalEndpointFailureBranches(t *testing.T) {
|
||
|
|
t.Run("critical-endpoint-ready-matrix", func(t *testing.T) {
|
||
|
|
cfg := lifecycleConfig(t)
|
||
|
|
cfg.Startup.CriticalServiceEndpoints = nil
|
||
|
|
orchNone, _ := newHookOrchestrator(t, cfg, nil, nil)
|
||
|
|
ok, detail, ns, svc, err := orchNone.TestHookCriticalServiceEndpointsReady(context.Background())
|
||
|
|
if err != nil || !ok || detail == "" || ns != "" || svc != "" {
|
||
|
|
t.Fatalf("expected no-config success branch, got ok=%v detail=%q ns=%q svc=%q err=%v", ok, detail, ns, svc, err)
|
||
|
|
}
|
||
|
|
|
||
|
|
cfg.Startup.CriticalServiceEndpoints = []string{"invalid"}
|
||
|
|
orchInvalid, _ := newHookOrchestrator(t, cfg, nil, nil)
|
||
|
|
if _, _, _, _, err := orchInvalid.TestHookCriticalServiceEndpointsReady(context.Background()); err == nil {
|
||
|
|
t.Fatalf("expected invalid endpoint entry error")
|
||
|
|
}
|
||
|
|
|
||
|
|
cfg.Startup.CriticalServiceEndpoints = []string{"monitoring/grafana"}
|
||
|
|
notFoundRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||
|
|
command := name + " " + strings.Join(args, " ")
|
||
|
|
if name == "kubectl" && strings.Contains(command, "get endpoints grafana") {
|
||
|
|
return "", fmt.Errorf("Error from server (NotFound): endpoints \"grafana\" not found")
|
||
|
|
}
|
||
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||
|
|
}
|
||
|
|
orchNotFound, _ := newHookOrchestrator(t, cfg, notFoundRun, notFoundRun)
|
||
|
|
ok, detail, ns, svc, err = orchNotFound.TestHookCriticalServiceEndpointsReady(context.Background())
|
||
|
|
if err != nil || ok || ns != "monitoring" || svc != "grafana" || !strings.Contains(detail, "not found") {
|
||
|
|
t.Fatalf("expected endpoint not-found detail, got ok=%v detail=%q ns=%q svc=%q err=%v", ok, detail, ns, svc, err)
|
||
|
|
}
|
||
|
|
|
||
|
|
zeroRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||
|
|
command := name + " " + strings.Join(args, " ")
|
||
|
|
if name == "kubectl" && strings.Contains(command, "get endpoints grafana") {
|
||
|
|
return "", nil
|
||
|
|
}
|
||
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||
|
|
}
|
||
|
|
orchZero, _ := newHookOrchestrator(t, cfg, zeroRun, zeroRun)
|
||
|
|
ok, detail, ns, svc, err = orchZero.TestHookCriticalServiceEndpointsReady(context.Background())
|
||
|
|
if err != nil || ok || ns != "monitoring" || svc != "grafana" || !strings.Contains(detail, "endpoints=0") {
|
||
|
|
t.Fatalf("expected endpoint-zero detail, got ok=%v detail=%q ns=%q svc=%q err=%v", ok, detail, ns, svc, err)
|
||
|
|
}
|
||
|
|
})
|
||
|
|
|
||
|
|
t.Run("critical-endpoint-wait-timeout-with-autoheal", func(t *testing.T) {
|
||
|
|
cfg := lifecycleConfig(t)
|
||
|
|
cfg.Startup.CriticalServiceEndpointWaitSec = 1
|
||
|
|
cfg.Startup.CriticalServiceEndpointPollSec = 1
|
||
|
|
cfg.Startup.CriticalServiceEndpoints = []string{"monitoring/grafana"}
|
||
|
|
|
||
|
|
autoHealRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||
|
|
command := name + " " + strings.Join(args, " ")
|
||
|
|
switch {
|
||
|
|
case name == "kubectl" && strings.Contains(command, "get endpoints grafana"):
|
||
|
|
return "", nil
|
||
|
|
case name == "kubectl" && strings.Contains(command, "-n monitoring scale deployment grafana --replicas=1"):
|
||
|
|
return "", nil
|
||
|
|
case name == "kubectl" && strings.Contains(command, "-n monitoring rollout status deployment/grafana"):
|
||
|
|
return "", fmt.Errorf("Error from server (NotFound): deployment \"grafana\" not found")
|
||
|
|
case name == "kubectl" && strings.Contains(command, "-n monitoring scale statefulset grafana --replicas=1"):
|
||
|
|
return "", nil
|
||
|
|
case name == "kubectl" && strings.Contains(command, "-n monitoring rollout status statefulset/grafana"):
|
||
|
|
return "ready", nil
|
||
|
|
default:
|
||
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
orch, _ := newHookOrchestrator(t, cfg, autoHealRun, autoHealRun)
|
||
|
|
err := orch.TestHookWaitForCriticalServiceEndpoints(context.Background())
|
||
|
|
if err == nil || !strings.Contains(err.Error(), "critical service endpoint checklist not satisfied") {
|
||
|
|
t.Fatalf("expected critical-endpoint wait timeout, got %v", err)
|
||
|
|
}
|
||
|
|
})
|
||
|
|
}
|
||
|
|
|
||
|
|
// TestHookFluxHealthFailureBranches runs one orchestration or CLI step.
|
||
|
|
// Signature: TestHookFluxHealthFailureBranches(t *testing.T).
|
||
|
|
// Why: covers adaptive wait, convergence parsing, and immutable job self-heal error paths.
|
||
|
|
func TestHookFluxHealthFailureBranches(t *testing.T) {
|
||
|
|
t.Run("adaptive-flux-wait-branches", func(t *testing.T) {
|
||
|
|
cfg := lifecycleConfig(t)
|
||
|
|
|
||
|
|
queryErrRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||
|
|
command := name + " " + strings.Join(args, " ")
|
||
|
|
if name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json") {
|
||
|
|
return "", fmt.Errorf("query failed")
|
||
|
|
}
|
||
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||
|
|
}
|
||
|
|
orchQueryErr, _ := newHookOrchestrator(t, cfg, queryErrRun, queryErrRun)
|
||
|
|
if _, _, err := orchQueryErr.TestHookAdaptiveFluxHealthWait(context.Background(), 10*time.Minute); err == nil {
|
||
|
|
t.Fatalf("expected adaptive wait query error")
|
||
|
|
}
|
||
|
|
|
||
|
|
decodeErrRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||
|
|
command := name + " " + strings.Join(args, " ")
|
||
|
|
if name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json") {
|
||
|
|
return "{bad-json", nil
|
||
|
|
}
|
||
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||
|
|
}
|
||
|
|
orchDecodeErr, _ := newHookOrchestrator(t, cfg, decodeErrRun, decodeErrRun)
|
||
|
|
if _, _, err := orchDecodeErr.TestHookAdaptiveFluxHealthWait(context.Background(), 10*time.Minute); err == nil {
|
||
|
|
t.Fatalf("expected adaptive wait decode error")
|
||
|
|
}
|
||
|
|
|
||
|
|
noTimeoutRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||
|
|
command := name + " " + strings.Join(args, " ")
|
||
|
|
if name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json") {
|
||
|
|
return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false,"timeout":""}}]}`, nil
|
||
|
|
}
|
||
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||
|
|
}
|
||
|
|
orchNoTimeout, _ := newHookOrchestrator(t, cfg, noTimeoutRun, noTimeoutRun)
|
||
|
|
wait, reason, err := orchNoTimeout.TestHookAdaptiveFluxHealthWait(context.Background(), 10*time.Minute)
|
||
|
|
if err != nil || wait != 10*time.Minute || !strings.Contains(reason, "no explicit kustomization timeouts") {
|
||
|
|
t.Fatalf("expected no-timeout branch, got wait=%s reason=%q err=%v", wait, reason, err)
|
||
|
|
}
|
||
|
|
})
|
||
|
|
|
||
|
|
t.Run("flux-health-ready-and-immutable-job-heal-branches", func(t *testing.T) {
|
||
|
|
cfg := lifecycleConfig(t)
|
||
|
|
cfg.Startup.IgnoreFluxKustomizations = []string{"infra/ignored"}
|
||
|
|
|
||
|
|
fluxItems := map[string]any{
|
||
|
|
"items": []map[string]any{
|
||
|
|
{
|
||
|
|
"metadata": map[string]any{"namespace": "infra", "name": "ignored"},
|
||
|
|
"spec": map[string]any{"suspend": false, "timeout": "30s"},
|
||
|
|
"status": map[string]any{"conditions": []map[string]any{
|
||
|
|
{"type": "Ready", "status": "False", "reason": "Progressing", "message": "ignore-me"},
|
||
|
|
}},
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"metadata": map[string]any{"namespace": "flux-system", "name": "services"},
|
||
|
|
"spec": map[string]any{"suspend": false, "timeout": "30m"},
|
||
|
|
"status": map[string]any{"conditions": []map[string]any{
|
||
|
|
{"type": "Ready", "status": "False", "reason": "InstallFailed", "message": "job field is immutable"},
|
||
|
|
}},
|
||
|
|
},
|
||
|
|
},
|
||
|
|
}
|
||
|
|
fluxJSON, err := json.Marshal(fluxItems)
|
||
|
|
if err != nil {
|
||
|
|
t.Fatalf("marshal flux fixture: %v", err)
|
||
|
|
}
|
||
|
|
|
||
|
|
jobsJSON := `{"items":[{"metadata":{"namespace":"flux-system","name":"reconcile-services","labels":{"kustomize.toolkit.fluxcd.io/name":"services"},"ownerReferences":[]},"status":{"failed":1,"succeeded":0,"conditions":[{"type":"Failed","status":"True"}]}}]}`
|
||
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||
|
|
command := name + " " + strings.Join(args, " ")
|
||
|
|
switch {
|
||
|
|
case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
|
||
|
|
return string(fluxJSON), nil
|
||
|
|
case name == "kubectl" && strings.Contains(command, "get jobs -A -o json"):
|
||
|
|
return jobsJSON, nil
|
||
|
|
case name == "kubectl" && strings.Contains(command, "-n flux-system delete job reconcile-services --wait=false"):
|
||
|
|
return "", nil
|
||
|
|
default:
|
||
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
orch, _ := newHookOrchestrator(t, cfg, run, run)
|
||
|
|
|
||
|
|
ok, detail, err := orch.TestHookFluxHealthReady(context.Background())
|
||
|
|
if err != nil || ok || !strings.Contains(strings.ToLower(detail), "immutable") {
|
||
|
|
t.Fatalf("expected flux not-ready detail with immutable signal, got ok=%v detail=%q err=%v", ok, detail, err)
|
||
|
|
}
|
||
|
|
|
||
|
|
healed, err := orch.TestHookHealImmutableFluxJobs(context.Background())
|
||
|
|
if err != nil || !healed {
|
||
|
|
t.Fatalf("expected immutable-job cleanup success, got healed=%v err=%v", healed, err)
|
||
|
|
}
|
||
|
|
})
|
||
|
|
|
||
|
|
t.Run("wait-for-flux-health-timeout-and-cancel", func(t *testing.T) {
|
||
|
|
cfg := lifecycleConfig(t)
|
||
|
|
cfg.Startup.FluxHealthWaitSeconds = 1
|
||
|
|
cfg.Startup.FluxHealthPollSeconds = 1
|
||
|
|
|
||
|
|
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
|
||
|
|
command := name + " " + strings.Join(args, " ")
|
||
|
|
switch {
|
||
|
|
case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
|
||
|
|
return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"False","reason":"InstallFailed","message":"job field is immutable"}]}}]}`, nil
|
||
|
|
case name == "kubectl" && strings.Contains(command, "get jobs -A -o json"):
|
||
|
|
return `{"items":[]}`, nil
|
||
|
|
default:
|
||
|
|
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
orchTimeout, _ := newHookOrchestrator(t, cfg, run, run)
|
||
|
|
err := orchTimeout.TestHookWaitForFluxHealth(context.Background())
|
||
|
|
if err == nil || !strings.Contains(err.Error(), "flux convergence not satisfied") {
|
||
|
|
t.Fatalf("expected flux health timeout, got %v", err)
|
||
|
|
}
|
||
|
|
|
||
|
|
cfg.Startup.FluxHealthWaitSeconds = 30
|
||
|
|
orchCanceled, _ := newHookOrchestrator(t, cfg, run, run)
|
||
|
|
cancelCtx, cancel := context.WithCancel(context.Background())
|
||
|
|
cancel()
|
||
|
|
err = orchCanceled.TestHookWaitForFluxHealth(cancelCtx)
|
||
|
|
if !errors.Is(err, context.Canceled) {
|
||
|
|
t.Fatalf("expected context canceled from flux wait, got %v", err)
|
||
|
|
}
|
||
|
|
})
|
||
|
|
}
|