package cluster import ( "context" "log" "net/http" "net/http/httptest" "os" "reflect" "strings" "testing" "time" "scm.bstein.dev/bstein/ananke/internal/config" "scm.bstein.dev/bstein/ananke/internal/state" ) // TestParseVaultSealed runs one orchestration or CLI step. // Signature: TestParseVaultSealed(t *testing.T). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestParseVaultSealed(t *testing.T) { sealed, err := parseVaultSealed(`{"initialized":true,"sealed":true}`) if err != nil { t.Fatalf("parse sealed=true: %v", err) } if !sealed { t.Fatalf("expected sealed=true") } sealed, err = parseVaultSealed(`{"initialized":true,"sealed":false}`) if err != nil { t.Fatalf("parse sealed=false: %v", err) } if sealed { t.Fatalf("expected sealed=false") } } // TestParseVaultSealedRejectsEmpty runs one orchestration or CLI step. // Signature: TestParseVaultSealedRejectsEmpty(t *testing.T). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestParseVaultSealedRejectsEmpty(t *testing.T) { if _, err := parseVaultSealed(" "); err == nil { t.Fatalf("expected parse error for empty status payload") } } // TestParseVaultSealedWithKubectlPreamble runs one orchestration or CLI step. // Signature: TestParseVaultSealedWithKubectlPreamble(t *testing.T). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestParseVaultSealedWithKubectlPreamble(t *testing.T) { raw := "Defaulted container \"vault\" out of: vault, setup-config (init)\n{\"sealed\":true,\"initialized\":true}\n" sealed, err := parseVaultSealed(raw) if err != nil { t.Fatalf("parse with preamble: %v", err) } if !sealed { t.Fatalf("expected sealed=true from payload with preamble") } } // TestFallbackWorkersFromInventoryUsesManagedNodes runs one orchestration or CLI step. // Signature: TestFallbackWorkersFromInventoryUsesManagedNodes(t *testing.T). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestFallbackWorkersFromInventoryUsesManagedNodes(t *testing.T) { orch := &Orchestrator{ cfg: config.Config{ ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"}, SSHManagedNodes: []string{ "titan-db", "titan-0a", "titan-15", "titan-17", }, }, log: log.New(os.Stdout, "", 0), } got := orch.fallbackWorkersFromInventory() want := []string{"titan-15", "titan-17", "titan-db"} if !reflect.DeepEqual(got, want) { t.Fatalf("fallback workers mismatch: got=%v want=%v", got, want) } } // TestFallbackWorkersFromInventoryFallsBackToHosts runs one orchestration or CLI step. // Signature: TestFallbackWorkersFromInventoryFallsBackToHosts(t *testing.T). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestFallbackWorkersFromInventoryFallsBackToHosts(t *testing.T) { orch := &Orchestrator{ cfg: config.Config{ ControlPlanes: []string{"titan-0a", "titan-0b", "titan-0c"}, SSHNodeHosts: map[string]string{ "titan-0a": "192.168.22.11", "titan-22": "192.168.22.22", "titan-24": "192.168.22.26", }, }, log: log.New(os.Stdout, "", 0), } got := orch.fallbackWorkersFromInventory() want := []string{"titan-22", "titan-24"} if !reflect.DeepEqual(got, want) { t.Fatalf("fallback workers mismatch: got=%v want=%v", got, want) } } // TestIntentFreshTreatsZeroTimestampAsFresh runs one orchestration or CLI step. // Signature: TestIntentFreshTreatsZeroTimestampAsFresh(t *testing.T). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestIntentFreshTreatsZeroTimestampAsFresh(t *testing.T) { if !intentFresh(state.Intent{}, 30*time.Second) { t.Fatalf("zero updated_at intent should be treated as fresh") } } // TestIntentFreshRespectsAge runs one orchestration or CLI step. // Signature: TestIntentFreshRespectsAge(t *testing.T). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestIntentFreshRespectsAge(t *testing.T) { stale := state.Intent{UpdatedAt: time.Now().Add(-2 * time.Minute)} fresh := state.Intent{UpdatedAt: time.Now().Add(-20 * time.Second)} if intentFresh(stale, 30*time.Second) { t.Fatalf("expected stale intent to be considered not fresh") } if !intentFresh(fresh, 30*time.Second) { t.Fatalf("expected recent intent to be considered fresh") } } // TestCoordinationPeersDedupesAndIncludesForwardHost runs one orchestration or CLI step. // Signature: TestCoordinationPeersDedupesAndIncludesForwardHost(t *testing.T). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestCoordinationPeersDedupesAndIncludesForwardHost(t *testing.T) { orch := &Orchestrator{ cfg: config.Config{ Coordination: config.Coordination{ PeerHosts: []string{"titan-24", "titan-db", "titan-24", " "}, ForwardShutdownHost: "titan-db", }, }, } got := orch.coordinationPeers() want := []string{"titan-24", "titan-db"} if !reflect.DeepEqual(got, want) { t.Fatalf("coordination peers mismatch: got=%v want=%v", got, want) } } // TestWorkloadTargetsIgnoredNodesByNodeSelector runs one orchestration or CLI step. // Signature: TestWorkloadTargetsIgnoredNodesByNodeSelector(t *testing.T). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestWorkloadTargetsIgnoredNodesByNodeSelector(t *testing.T) { spec := podSpec{ NodeSelector: map[string]string{ "kubernetes.io/hostname": "titan-22", }, } ignored := map[string]struct{}{"titan-22": {}} if !workloadTargetsIgnoredNodes(spec, ignored) { t.Fatalf("expected workload to target ignored node via nodeSelector") } } // TestParseWorkloadIgnoreRules runs one orchestration or CLI step. // Signature: TestParseWorkloadIgnoreRules(t *testing.T). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestParseWorkloadIgnoreRules(t *testing.T) { rules := parseWorkloadIgnoreRules([]string{ "maintenance/metis", "crypto/statefulset/monerod", }) if len(rules) != 2 { t.Fatalf("expected 2 ignore rules, got %d", len(rules)) } if !workloadIgnored(rules, "maintenance", "deployment", "metis") { t.Fatalf("expected namespace/name rule to match") } if !workloadIgnored(rules, "crypto", "statefulset", "monerod") { t.Fatalf("expected namespace/kind/name rule to match") } if workloadIgnored(rules, "crypto", "deployment", "monerod") { t.Fatalf("did not expect mismatched kind to match") } } // TestNamespaceCandidatesFromIgnoreKustomizations runs one orchestration or CLI step. // Signature: TestNamespaceCandidatesFromIgnoreKustomizations(t *testing.T). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestNamespaceCandidatesFromIgnoreKustomizations(t *testing.T) { got := namespaceCandidatesFromIgnoreKustomizations([]string{ "flux-system/jellyfin", "flux-system/outline", }) if _, ok := got["jellyfin"]; !ok { t.Fatalf("expected jellyfin namespace candidate") } if _, ok := got["outline"]; !ok { t.Fatalf("expected outline namespace candidate") } } // TestProbeStatusAcceptedRejects404 runs one orchestration or CLI step. // Signature: TestProbeStatusAcceptedRejects404(t *testing.T). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestProbeStatusAcceptedRejects404(t *testing.T) { if probeStatusAccepted("https://metrics.bstein.dev/login", 404) { t.Fatalf("expected 404 probe status to be rejected") } } // TestParseFluxKustomizationTimeout runs one orchestration or CLI step. // Signature: TestParseFluxKustomizationTimeout(t *testing.T). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestParseFluxKustomizationTimeout(t *testing.T) { if got := parseFluxKustomizationTimeout("30m"); got != 30*time.Minute { t.Fatalf("expected 30m duration, got %s", got) } if got := parseFluxKustomizationTimeout("5m30s"); got != 5*time.Minute+30*time.Second { t.Fatalf("expected 5m30s duration, got %s", got) } if got := parseFluxKustomizationTimeout(""); got != 0 { t.Fatalf("expected zero duration for empty timeout, got %s", got) } if got := parseFluxKustomizationTimeout("not-a-duration"); got != 0 { t.Fatalf("expected zero duration for invalid timeout, got %s", got) } } // TestServiceCheckReadyRequiresBodyContains runs one orchestration or CLI step. // Signature: TestServiceCheckReadyRequiresBodyContains(t *testing.T). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestServiceCheckReadyRequiresBodyContains(t *testing.T) { srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusOK) _, _ = w.Write([]byte(`{"database":"ok"}`)) })) defer srv.Close() orch := &Orchestrator{ log: log.New(os.Stdout, "", 0), } ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{ Name: "grafana-api", URL: srv.URL, AcceptedStatuses: []int{200}, BodyContains: `"database":"ok"`, TimeoutSeconds: 5, }) if !ok { t.Fatalf("expected service check to pass, detail=%s", detail) } } // TestServiceCheckReadyBodyContainsIgnoresWhitespace runs one orchestration or CLI step. // Signature: TestServiceCheckReadyBodyContainsIgnoresWhitespace(t *testing.T). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestServiceCheckReadyBodyContainsIgnoresWhitespace(t *testing.T) { srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusOK) _, _ = w.Write([]byte("{\n \"database\": \"ok\"\n}\n")) })) defer srv.Close() orch := &Orchestrator{ log: log.New(os.Stdout, "", 0), } ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{ Name: "grafana-api", URL: srv.URL, AcceptedStatuses: []int{200}, BodyContains: `"database":"ok"`, TimeoutSeconds: 5, }) if !ok { t.Fatalf("expected whitespace-tolerant service check to pass, detail=%s", detail) } } // TestServiceCheckReadyRequiresLocationContains runs one orchestration or CLI step. // Signature: TestServiceCheckReadyRequiresLocationContains(t *testing.T). // Why: startup checks must validate redirect targets for OIDC-gated services. func TestServiceCheckReadyRequiresLocationContains(t *testing.T) { srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { w.Header().Set("Location", "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/auth?client_id=logs") w.WriteHeader(http.StatusFound) })) defer srv.Close() orch := &Orchestrator{ log: log.New(os.Stdout, "", 0), } ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{ Name: "logging-oidc-redirect", URL: srv.URL, AcceptedStatuses: []int{302}, LocationContains: "client_id=logs", TimeoutSeconds: 5, }) if !ok { t.Fatalf("expected location-aware service check to pass, detail=%s", detail) } } // TestServiceCheckReadyRejectsMissingLocationMarker runs one orchestration or CLI step. // Signature: TestServiceCheckReadyRejectsMissingLocationMarker(t *testing.T). // Why: prevents false positives when redirects point somewhere unexpected. func TestServiceCheckReadyRejectsMissingLocationMarker(t *testing.T) { srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { w.Header().Set("Location", "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/auth?client_id=wrong") w.WriteHeader(http.StatusFound) })) defer srv.Close() orch := &Orchestrator{ log: log.New(os.Stdout, "", 0), } ok, detail := orch.serviceCheckReady(context.Background(), config.ServiceChecklistCheck{ Name: "logging-oidc-redirect", URL: srv.URL, AcceptedStatuses: []int{302}, LocationContains: "client_id=logs", TimeoutSeconds: 5, }) if ok { t.Fatalf("expected location-aware service check to fail") } if !strings.Contains(detail, "location header missing expected marker") { t.Fatalf("expected missing location marker detail, got %q", detail) } } // TestChecklistFailureHostFromIngressDetail runs one orchestration or CLI step. // Signature: TestChecklistFailureHostFromIngressDetail(t *testing.T). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestChecklistFailureHostFromIngressDetail(t *testing.T) { orch := &Orchestrator{} got := orch.checklistFailureHost("cloud.bstein.dev: unexpected status code=500") if got != "cloud.bstein.dev" { t.Fatalf("expected host cloud.bstein.dev, got %q", got) } } // TestChecklistFailureHostFromServiceCheckName runs one orchestration or CLI step. // Signature: TestChecklistFailureHostFromServiceCheckName(t *testing.T). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestChecklistFailureHostFromServiceCheckName(t *testing.T) { orch := &Orchestrator{ cfg: config.Config{ Startup: config.Startup{ ServiceChecklist: []config.ServiceChecklistCheck{ { Name: "harbor-registry", URL: "https://registry.bstein.dev/v2/", }, }, }, }, } got := orch.checklistFailureHost("harbor-registry: unexpected status code=404") if got != "registry.bstein.dev" { t.Fatalf("expected host registry.bstein.dev, got %q", got) } } // TestChecklistFailureHostUnknown runs one orchestration or CLI step. // Signature: TestChecklistFailureHostUnknown(t *testing.T). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestChecklistFailureHostUnknown(t *testing.T) { orch := &Orchestrator{ cfg: config.Config{ Startup: config.Startup{ ServiceChecklist: []config.ServiceChecklistCheck{ { Name: "grafana-api", URL: "https://metrics.bstein.dev/api/health", }, }, }, }, } if got := orch.checklistFailureHost("grafana-api: tcp timeout"); got != "metrics.bstein.dev" { t.Fatalf("expected metrics host from configured URL, got %q", got) } if got := orch.checklistFailureHost("some-unmapped-check: fail"); got != "" { t.Fatalf("expected empty host for unknown check, got %q", got) } } // TestStuckVaultInitReasonDetectsHungInit runs one orchestration or CLI step. // Signature: TestStuckVaultInitReasonDetectsHungInit(t *testing.T). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestStuckVaultInitReasonDetectsHungInit(t *testing.T) { var pod podResource pod.Status.Phase = "Pending" pod.Metadata.Annotations = map[string]string{ "vault.hashicorp.com/agent-inject": "true", } pod.Status.InitContainerStatuses = []podContainerStatus{ { Name: "vault-agent-init", State: podContainerState{ Running: &podContainerRunningState{ StartedAt: time.Now().Add(-10 * time.Minute), }, }, }, } reason := stuckVaultInitReason(pod, 3*time.Minute) if reason != "VaultInitStuck" { t.Fatalf("expected VaultInitStuck reason, got %q", reason) } } // TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods runs one orchestration or CLI step. // Signature: TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T). // Why: keeps behavior explicit so startup/shutdown workflows remain maintainable as services evolve. func TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T) { var pod podResource pod.Status.Phase = "Pending" pod.Metadata.Annotations = map[string]string{ "vault.hashicorp.com/agent-inject": "true", } pod.Status.InitContainerStatuses = []podContainerStatus{ { Name: "vault-agent-init", State: podContainerState{ Running: &podContainerRunningState{ StartedAt: time.Now().Add(-30 * time.Second), }, }, }, } if reason := stuckVaultInitReason(pod, 3*time.Minute); reason != "" { t.Fatalf("expected no reason for fresh init, got %q", reason) } pod.Metadata.Annotations["vault.hashicorp.com/agent-inject"] = "false" pod.Status.InitContainerStatuses[0].State.Running.StartedAt = time.Now().Add(-10 * time.Minute) if reason := stuckVaultInitReason(pod, 3*time.Minute); reason != "" { t.Fatalf("expected no reason for non-vault pod, got %q", reason) } }