// hooks_health_failure_matrix_test.go — failure-branch matrix tests for the
// orchestrator's storage, critical-endpoint, and Flux health hooks.
package orchestrator
import (
"context"
"encoding/json"
"errors"
"fmt"
"strings"
"testing"
"time"
)
// TestHookStorageFailureBranches runs one orchestration or CLI step.
// Signature: TestHookStorageFailureBranches(t *testing.T).
// Why: validates storage readiness edge paths so startup does not mark success while
// Longhorn/PVC dependencies are still degraded.
func TestHookStorageFailureBranches(t *testing.T) {
t.Run("storage-ready-branch-matrix", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.StorageMinReadyNodes = 2
cfg.Startup.StorageCriticalPVCs = []string{}
queryErrRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "nodes.longhorn.io") {
return "", fmt.Errorf("query failed")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchQueryErr, _ := newHookOrchestrator(t, cfg, queryErrRun, queryErrRun)
if _, _, err := orchQueryErr.TestHookStorageReady(context.Background()); err == nil {
t.Fatalf("expected longhorn query error branch")
}
insufficientRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "nodes.longhorn.io") {
return "titan-23:True:True\n", nil
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchInsufficient, _ := newHookOrchestrator(t, cfg, insufficientRun, insufficientRun)
ok, reason, err := orchInsufficient.TestHookStorageReady(context.Background())
if err != nil || ok || !strings.Contains(reason, "longhorn ready+sched nodes") {
t.Fatalf("expected insufficient longhorn readiness, got ok=%v reason=%q err=%v", ok, reason, err)
}
invalidPVC := cfg
invalidPVC.Startup.StorageCriticalPVCs = []string{"invalid"}
readyNodesRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "nodes.longhorn.io") {
return "titan-23:True:True\ntitan-24:True:True\n", nil
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchInvalidPVC, _ := newHookOrchestrator(t, invalidPVC, readyNodesRun, readyNodesRun)
if _, _, err := orchInvalidPVC.TestHookStorageReady(context.Background()); err == nil {
t.Fatalf("expected invalid pvc entry error")
}
notFoundPVC := cfg
notFoundPVC.Startup.StorageCriticalPVCs = []string{"monitoring/grafana-data"}
notFoundRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "nodes.longhorn.io"):
return "titan-23:True:True\ntitan-24:True:True\n", nil
case name == "kubectl" && strings.Contains(command, "-n monitoring get pvc grafana-data"):
return "", fmt.Errorf("Error from server (NotFound): persistentvolumeclaims \"grafana-data\" not found")
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchNotFoundPVC, _ := newHookOrchestrator(t, notFoundPVC, notFoundRun, notFoundRun)
ok, reason, err = orchNotFoundPVC.TestHookStorageReady(context.Background())
if err != nil || ok || !strings.Contains(reason, "not found") {
t.Fatalf("expected pvc-not-found readiness detail, got ok=%v reason=%q err=%v", ok, reason, err)
}
notBoundRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "nodes.longhorn.io"):
return "titan-23:True:True\ntitan-24:True:True\n", nil
case name == "kubectl" && strings.Contains(command, "-n monitoring get pvc grafana-data"):
return "Pending", nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchNotBound, _ := newHookOrchestrator(t, notFoundPVC, notBoundRun, notBoundRun)
ok, reason, err = orchNotBound.TestHookStorageReady(context.Background())
if err != nil || ok || !strings.Contains(reason, "phase=Pending") {
t.Fatalf("expected pvc non-bound detail, got ok=%v reason=%q err=%v", ok, reason, err)
}
})
t.Run("wait-for-storage-ready-timeout-and-cancel", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.StorageReadyWaitSeconds = 1
cfg.Startup.StorageReadyPollSeconds = 1
cfg.Startup.StorageMinReadyNodes = 3
stuckRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "nodes.longhorn.io") {
return "titan-23:True:True\n", nil
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchTimeout, _ := newHookOrchestrator(t, cfg, stuckRun, stuckRun)
err := orchTimeout.TestHookWaitForStorageReady(context.Background())
if err == nil || !strings.Contains(err.Error(), "storage readiness not satisfied") {
t.Fatalf("expected storage wait timeout, got %v", err)
}
cfg.Startup.StorageReadyWaitSeconds = 30
orchCanceled, _ := newHookOrchestrator(t, cfg, stuckRun, stuckRun)
cancelCtx, cancel := context.WithCancel(context.Background())
cancel()
err = orchCanceled.TestHookWaitForStorageReady(cancelCtx)
if !errors.Is(err, context.Canceled) {
t.Fatalf("expected context canceled while waiting for storage, got %v", err)
}
})
}
// TestHookCriticalEndpointFailureBranches runs one orchestration or CLI step.
// Signature: TestHookCriticalEndpointFailureBranches(t *testing.T).
// Why: ensures endpoint checklist behavior remains strict when services exist but
// backends are empty, missing, or recovering.
func TestHookCriticalEndpointFailureBranches(t *testing.T) {
t.Run("critical-endpoint-ready-matrix", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.CriticalServiceEndpoints = nil
orchNone, _ := newHookOrchestrator(t, cfg, nil, nil)
ok, detail, ns, svc, err := orchNone.TestHookCriticalServiceEndpointsReady(context.Background())
if err != nil || !ok || detail == "" || ns != "" || svc != "" {
t.Fatalf("expected no-config success branch, got ok=%v detail=%q ns=%q svc=%q err=%v", ok, detail, ns, svc, err)
}
cfg.Startup.CriticalServiceEndpoints = []string{"invalid"}
orchInvalid, _ := newHookOrchestrator(t, cfg, nil, nil)
if _, _, _, _, err := orchInvalid.TestHookCriticalServiceEndpointsReady(context.Background()); err == nil {
t.Fatalf("expected invalid endpoint entry error")
}
cfg.Startup.CriticalServiceEndpoints = []string{"monitoring/grafana"}
notFoundRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "get endpoints grafana") {
return "", fmt.Errorf("Error from server (NotFound): endpoints \"grafana\" not found")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchNotFound, _ := newHookOrchestrator(t, cfg, notFoundRun, notFoundRun)
ok, detail, ns, svc, err = orchNotFound.TestHookCriticalServiceEndpointsReady(context.Background())
if err != nil || ok || ns != "monitoring" || svc != "grafana" || !strings.Contains(detail, "not found") {
t.Fatalf("expected endpoint not-found detail, got ok=%v detail=%q ns=%q svc=%q err=%v", ok, detail, ns, svc, err)
}
zeroRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "get endpoints grafana") {
return "", nil
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchZero, _ := newHookOrchestrator(t, cfg, zeroRun, zeroRun)
ok, detail, ns, svc, err = orchZero.TestHookCriticalServiceEndpointsReady(context.Background())
if err != nil || ok || ns != "monitoring" || svc != "grafana" || !strings.Contains(detail, "endpoints=0") {
t.Fatalf("expected endpoint-zero detail, got ok=%v detail=%q ns=%q svc=%q err=%v", ok, detail, ns, svc, err)
}
})
t.Run("critical-endpoint-wait-timeout-with-autoheal", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.CriticalServiceEndpointWaitSec = 1
cfg.Startup.CriticalServiceEndpointPollSec = 1
cfg.Startup.CriticalServiceEndpoints = []string{"monitoring/grafana"}
autoHealRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get endpoints grafana"):
return "", nil
case name == "kubectl" && strings.Contains(command, "-n monitoring scale deployment grafana --replicas=1"):
return "", nil
case name == "kubectl" && strings.Contains(command, "-n monitoring rollout status deployment/grafana"):
return "", fmt.Errorf("Error from server (NotFound): deployment \"grafana\" not found")
case name == "kubectl" && strings.Contains(command, "-n monitoring scale statefulset grafana --replicas=1"):
return "", nil
case name == "kubectl" && strings.Contains(command, "-n monitoring rollout status statefulset/grafana"):
return "ready", nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, autoHealRun, autoHealRun)
err := orch.TestHookWaitForCriticalServiceEndpoints(context.Background())
if err == nil || !strings.Contains(err.Error(), "critical service endpoint checklist not satisfied") {
t.Fatalf("expected critical-endpoint wait timeout, got %v", err)
}
})
}
// TestHookFluxHealthFailureBranches runs one orchestration or CLI step.
// Signature: TestHookFluxHealthFailureBranches(t *testing.T).
// Why: covers adaptive wait, convergence parsing, and immutable job self-heal error paths.
func TestHookFluxHealthFailureBranches(t *testing.T) {
t.Run("adaptive-flux-wait-branches", func(t *testing.T) {
cfg := lifecycleConfig(t)
queryErrRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json") {
return "", fmt.Errorf("query failed")
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchQueryErr, _ := newHookOrchestrator(t, cfg, queryErrRun, queryErrRun)
if _, _, err := orchQueryErr.TestHookAdaptiveFluxHealthWait(context.Background(), 10*time.Minute); err == nil {
t.Fatalf("expected adaptive wait query error")
}
decodeErrRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json") {
return "{bad-json", nil
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchDecodeErr, _ := newHookOrchestrator(t, cfg, decodeErrRun, decodeErrRun)
if _, _, err := orchDecodeErr.TestHookAdaptiveFluxHealthWait(context.Background(), 10*time.Minute); err == nil {
t.Fatalf("expected adaptive wait decode error")
}
noTimeoutRun := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
if name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json") {
return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false,"timeout":""}}]}`, nil
}
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
orchNoTimeout, _ := newHookOrchestrator(t, cfg, noTimeoutRun, noTimeoutRun)
wait, reason, err := orchNoTimeout.TestHookAdaptiveFluxHealthWait(context.Background(), 10*time.Minute)
if err != nil || wait != 10*time.Minute || !strings.Contains(reason, "no explicit kustomization timeouts") {
t.Fatalf("expected no-timeout branch, got wait=%s reason=%q err=%v", wait, reason, err)
}
})
t.Run("flux-health-ready-and-immutable-job-heal-branches", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.IgnoreFluxKustomizations = []string{"infra/ignored"}
fluxItems := map[string]any{
"items": []map[string]any{
{
"metadata": map[string]any{"namespace": "infra", "name": "ignored"},
"spec": map[string]any{"suspend": false, "timeout": "30s"},
"status": map[string]any{"conditions": []map[string]any{
{"type": "Ready", "status": "False", "reason": "Progressing", "message": "ignore-me"},
}},
},
{
"metadata": map[string]any{"namespace": "flux-system", "name": "services"},
"spec": map[string]any{"suspend": false, "timeout": "30m"},
"status": map[string]any{"conditions": []map[string]any{
{"type": "Ready", "status": "False", "reason": "InstallFailed", "message": "job field is immutable"},
}},
},
},
}
fluxJSON, err := json.Marshal(fluxItems)
if err != nil {
t.Fatalf("marshal flux fixture: %v", err)
}
jobsJSON := `{"items":[{"metadata":{"namespace":"flux-system","name":"reconcile-services","labels":{"kustomize.toolkit.fluxcd.io/name":"services"},"ownerReferences":[]},"status":{"failed":1,"succeeded":0,"conditions":[{"type":"Failed","status":"True"}]}}]}`
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
return string(fluxJSON), nil
case name == "kubectl" && strings.Contains(command, "get jobs -A -o json"):
return jobsJSON, nil
case name == "kubectl" && strings.Contains(command, "-n flux-system delete job reconcile-services --wait=false"):
return "", nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orch, _ := newHookOrchestrator(t, cfg, run, run)
ok, detail, err := orch.TestHookFluxHealthReady(context.Background())
if err != nil || ok || !strings.Contains(strings.ToLower(detail), "immutable") {
t.Fatalf("expected flux not-ready detail with immutable signal, got ok=%v detail=%q err=%v", ok, detail, err)
}
healed, err := orch.TestHookHealImmutableFluxJobs(context.Background())
if err != nil || !healed {
t.Fatalf("expected immutable-job cleanup success, got healed=%v err=%v", healed, err)
}
})
t.Run("wait-for-flux-health-timeout-and-cancel", func(t *testing.T) {
cfg := lifecycleConfig(t)
cfg.Startup.FluxHealthWaitSeconds = 1
cfg.Startup.FluxHealthPollSeconds = 1
run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
command := name + " " + strings.Join(args, " ")
switch {
case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"False","reason":"InstallFailed","message":"job field is immutable"}]}}]}`, nil
case name == "kubectl" && strings.Contains(command, "get jobs -A -o json"):
return `{"items":[]}`, nil
default:
return lifecycleDispatcher(&commandRecorder{})(ctx, timeout, name, args...)
}
}
orchTimeout, _ := newHookOrchestrator(t, cfg, run, run)
err := orchTimeout.TestHookWaitForFluxHealth(context.Background())
if err == nil || !strings.Contains(err.Error(), "flux convergence not satisfied") {
t.Fatalf("expected flux health timeout, got %v", err)
}
cfg.Startup.FluxHealthWaitSeconds = 30
orchCanceled, _ := newHookOrchestrator(t, cfg, run, run)
cancelCtx, cancel := context.WithCancel(context.Background())
cancel()
err = orchCanceled.TestHookWaitForFluxHealth(cancelCtx)
if !errors.Is(err, context.Canceled) {
t.Fatalf("expected context canceled from flux wait, got %v", err)
}
})
}