package orchestrator

import (
	"context"
	"io"
	"log"
	"net"
	"strings"
	"testing"
	"time"

	"scm.bstein.dev/bstein/ananke/internal/cluster"
	"scm.bstein.dev/bstein/ananke/internal/config"
	"scm.bstein.dev/bstein/ananke/internal/execx"
	"scm.bstein.dev/bstein/ananke/internal/state"
)

// TestHookFluxHealthAndStorageBranches runs one orchestration or CLI step.
// Signature: TestHookFluxHealthAndStorageBranches(t *testing.T).
// Why: exercises flux-health and storage readiness helpers directly for coverage and behavioral safety.
func TestHookFluxHealthAndStorageBranches(t *testing.T) {
	cfg := lifecycleConfig(t)
	// Short wait/poll windows keep the polling helpers fast in tests.
	cfg.Startup.FluxHealthWaitSeconds = 2
	cfg.Startup.FluxHealthPollSeconds = 1
	cfg.Startup.IgnoreFluxKustomizations = []string{"flux-system/skip-me"}
	cfg.Startup.StorageReadyWaitSeconds = 2
	cfg.Startup.StorageReadyPollSeconds = 1
	cfg.Startup.StorageMinReadyNodes = 1
	cfg.Startup.StorageCriticalPVCs = []string{"monitoring/grafana-data"}
	recorder := &commandRecorder{}
	base := lifecycleDispatcher(recorder)
	// fluxCalls makes the kustomization listing stateful: the first call reports
	// a not-ready condition, every later call reports ready. This drives both
	// the not-ready assertion and the eventual success of waitForFluxHealth.
	fluxCalls := 0
	run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) {
		command := name + " " + strings.Join(args, " ")
		switch {
		case name == "kubectl" && strings.Contains(command, "get kustomizations.kustomize.toolkit.fluxcd.io -A -o json"):
			recorder.record(name, args)
			fluxCalls++
			if fluxCalls <= 1 {
				return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"False","reason":"Unknown","message":"waiting"}]}}]}`, nil
			}
			return `{"items":[{"metadata":{"namespace":"flux-system","name":"services"},"spec":{"suspend":false},"status":{"conditions":[{"type":"Ready","status":"True","reason":"ReconciliationSucceeded","message":"ok"}]}}]}`, nil
		case name == "kubectl" && strings.Contains(command, "get jobs -A -o json"):
			// No jobs at all, so the immutable-job healer has nothing to act on.
			recorder.record(name, args)
			return `{"items":[]}`, nil
		case name == "kubectl" && strings.Contains(command, "nodes.longhorn.io"):
			// One Longhorn node reporting ready/schedulable satisfies StorageMinReadyNodes=1.
			recorder.record(name, args)
			return "lh-a:True:True\n", nil
		case name == "kubectl" && strings.Contains(command, "-n monitoring get pvc grafana-data"):
			// The single critical PVC is Bound, so the storage gate passes.
			recorder.record(name, args)
			return "Bound", nil
		default:
			return base(ctx, timeout, name, args...)
		}
	}
	orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0))
	orch.SetCommandOverrides(run, run)
	// First readiness probe must observe the synthetic not-ready condition.
	ok, detail, err := orch.TestHookFluxHealthReady(context.Background())
	if err != nil {
		t.Fatalf("fluxHealthReady error: %v", err)
	}
	if ok {
		t.Fatalf("expected first fluxHealthReady call to be not-ready: %s", detail)
	}
	healed, err := orch.TestHookHealImmutableFluxJobs(context.Background())
	if err != nil {
		t.Fatalf("healImmutableFluxJobs error: %v", err)
	}
	if healed {
		t.Fatalf("expected no immutable job heal action in this fixture")
	}
	// The wait loop re-polls and succeeds once fluxCalls > 1 flips the fixture to ready.
	if err := orch.TestHookWaitForFluxHealth(context.Background()); err != nil {
		t.Fatalf("waitForFluxHealth: %v", err)
	}
	ready, reason, readyErr := orch.TestHookStorageReady(context.Background())
	if readyErr != nil {
		t.Fatalf("storageReady error: %v", readyErr)
	}
	if !ready {
		t.Fatalf("expected storage ready, reason=%s", reason)
	}
	if err := orch.TestHookWaitForStorageReady(context.Background()); err != nil {
		t.Fatalf("waitForStorageReady: %v", err)
	}
}

// TestHookTimeSyncAndDatastoreBranches runs one orchestration or CLI step.
// Signature: TestHookTimeSyncAndDatastoreBranches(t *testing.T).
// Why: covers time-sync gate and datastore preflight helpers, including parser and TCP helper paths.
func TestHookTimeSyncAndDatastoreBranches(t *testing.T) { cfg := lifecycleConfig(t) cfg.ControlPlanes = []string{"titan-db", "titan-23"} cfg.Workers = []string{"titan-24"} cfg.SSHManagedNodes = []string{"titan-db", "titan-23", "titan-24"} cfg.SSHNodeHosts["titan-23"] = "titan-23" cfg.SSHNodeHosts["titan-24"] = "titan-24" cfg.Startup.TimeSyncMode = "quorum" cfg.Startup.TimeSyncQuorum = 1 cfg.Startup.TimeSyncWaitSeconds = 2 cfg.Startup.TimeSyncPollSeconds = 1 recorder := &commandRecorder{} base := lifecycleDispatcher(recorder) run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") switch { case name == "sh" && strings.Contains(command, "timedatectl show -p NTPSynchronized --value"): recorder.record(name, args) return "yes", nil case name == "ssh" && strings.Contains(command, "timedatectl show -p NTPSynchronized --value"): recorder.record(name, args) if strings.Contains(command, "titan-db") { return "yes", nil } return "no", nil case name == "ssh" && strings.Contains(command, "systemctl cat k3s"): recorder.record(name, args) return "ExecStart=/usr/local/bin/k3s server --datastore-endpoint=postgres://127.0.0.1:6543/k3s", nil default: return base(ctx, timeout, name, args...) 
} } orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0)) orch.SetCommandOverrides(run, run) if err := orch.TestHookWaitForTimeSync(context.Background(), []string{"titan-db", "titan-23"}); err != nil { t.Fatalf("waitForTimeSync: %v", err) } if got := cluster.TestHookParseDatastoreEndpoint(`ExecStart=/usr/local/bin/k3s server --datastore-endpoint="postgres://db:5432/k3s"`); !strings.Contains(got, "postgres://db:5432/k3s") { t.Fatalf("unexpected datastore endpoint parse: %q", got) } if got := orch.TestHookNodeNameForHost("titan-23"); got != "titan-23" { t.Fatalf("unexpected nodeNameForHost direct match: %q", got) } if err := orch.TestHookValidateNodeInventory(); err != nil { t.Fatalf("validateNodeInventory: %v", err) } ln, err := net.Listen("tcp", "127.0.0.1:0") if err != nil { t.Fatalf("listen for tcpReachable test: %v", err) } addr := ln.Addr().String() if !orch.TestHookTCPReachable(addr, time.Second) { t.Fatalf("expected tcpReachable=true for listener %s", addr) } _ = ln.Close() if orch.TestHookTCPReachable(addr, 100*time.Millisecond) { t.Fatalf("expected tcpReachable=false after listener close") } } // TestHookChecklistAndStabilityBranches runs one orchestration or CLI step. // Signature: TestHookChecklistAndStabilityBranches(t *testing.T). // Why: covers checklist helper methods and startup stability window internals. 
func TestHookChecklistAndStabilityBranches(t *testing.T) { cfg := lifecycleConfig(t) cfg.Startup.ServiceChecklist = []config.ServiceChecklistCheck{ { Name: "grafana", URL: "https://metrics.bstein.dev/api/health", AcceptedStatuses: []int{200}, BodyContains: `"database":"ok"`, TimeoutSeconds: 5, }, } cfg.Startup.ServiceChecklistWaitSeconds = 1 cfg.Startup.ServiceChecklistPollSeconds = 1 cfg.Startup.ServiceChecklistStabilitySec = 1 cfg.Startup.RequireWorkloadConvergence = false recorder := &commandRecorder{} base := lifecycleDispatcher(recorder) serviceCalls := 0 run := func(ctx context.Context, timeout time.Duration, name string, args ...string) (string, error) { command := name + " " + strings.Join(args, " ") switch { case name == "kubectl" && strings.Contains(command, "get pods -A -o json"): recorder.record(name, args) return `{"items":[]}`, nil case name == "kubectl" && strings.Contains(command, "get deploy,statefulset -A -o json"): recorder.record(name, args) return `{"items":[{"kind":"Deployment","metadata":{"namespace":"monitoring","name":"grafana"},"spec":{"replicas":1,"template":{"spec":{}}},"status":{"readyReplicas":1}}]}`, nil case name == "kubectl" && strings.Contains(command, "get deployment -A -o jsonpath="): recorder.record(name, args) return "monitoring\tgrafana\t1\n", nil case name == "kubectl" && strings.Contains(command, "get statefulset -A -o jsonpath="): recorder.record(name, args) return "", nil case name == "kubectl" && strings.Contains(command, "get ingress -A -o json"): recorder.record(name, args) return `{"items":[]}`, nil case name == "kubectl" && strings.Contains(command, "nodes.longhorn.io"): recorder.record(name, args) return "lh-a:True:True\n", nil case name == "curl": recorder.record(name, args) serviceCalls++ if serviceCalls == 1 { return "503", nil } return "200", nil default: return base(ctx, timeout, name, args...) 
} } orch := cluster.New(cfg, &execx.Runner{DryRun: false}, state.New(cfg.State.RunHistoryPath), log.New(io.Discard, "", 0)) orch.SetCommandOverrides(run, run) ok, detail := orch.TestHookPostStartProbesReady(context.Background()) if !ok || !strings.Contains(detail, "no probes configured") { t.Fatalf("expected no-probes ready branch, got ok=%v detail=%q", ok, detail) } code, err := orch.TestHookHTTPProbe(context.Background(), "https://example.invalid") if err != nil { t.Fatalf("unexpected HTTP probe error with recorder override: %v", err) } if code != 503 { t.Fatalf("expected first synthetic HTTP probe code=503, got %d", code) } // Direct checklist readiness path should always return a non-empty status detail. _, checkDetail := orch.TestHookServiceChecklistReady(context.Background()) if checkDetail == "" { t.Fatalf("expected service checklist detail to be populated") } // Force stability helper path through synthetic kubectl outputs. if err := orch.TestHookWaitForStabilityWindow(context.Background()); err != nil && !strings.Contains(err.Error(), "stability") { t.Fatalf("unexpected stability window error: %v", err) } }