diff --git a/README.md b/README.md index ed83dbb..4ab8923 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ Ananke does **not** stop at “Flux says Ready”. Startup only completes when a - Flux source drift guard passes (`expected_flux_source_url` + branch expectation) - Flux kustomizations are healthy - controller convergence is healthy (deployments/statefulsets/daemonsets) -- external service checklist passes (for example Gitea + Grafana health endpoints) +- external service checklist passes (Gitea, Grafana, Keycloak OIDC, Harbor registry auth challenge, Longhorn auth redirect) - stability soak window passes (no regressions, no CrashLoop/ImagePull failures) If any gate fails, startup is blocked with a concrete reason. diff --git a/configs/ananke.example.yaml b/configs/ananke.example.yaml index 9d0f4ae..2776007 100644 --- a/configs/ananke.example.yaml +++ b/configs/ananke.example.yaml @@ -89,6 +89,21 @@ startup: accepted_statuses: [200] body_contains: '"database":"ok"' timeout_seconds: 12 + - name: keycloak-oidc + url: https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration + accepted_statuses: [200] + body_contains: '"issuer":"https://sso.bstein.dev/realms/atlas"' + timeout_seconds: 12 + - name: harbor-registry + url: https://registry.bstein.dev/v2/ + accepted_statuses: [401] + body_contains: unauthorized + timeout_seconds: 12 + - name: longhorn-auth + url: https://longhorn.bstein.dev/ + accepted_statuses: [302] + body_contains: openid-connect/auth + timeout_seconds: 12 require_flux_health: true flux_health_wait_seconds: 900 flux_health_poll_seconds: 5 diff --git a/configs/ananke.tethys.yaml b/configs/ananke.tethys.yaml index 3eb52bf..6d6208c 100644 --- a/configs/ananke.tethys.yaml +++ b/configs/ananke.tethys.yaml @@ -155,6 +155,21 @@ startup: accepted_statuses: [200] body_contains: '"database":"ok"' timeout_seconds: 12 + - name: keycloak-oidc + url: https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration + accepted_statuses: [200] + body_contains: '"issuer":"https://sso.bstein.dev/realms/atlas"' + timeout_seconds: 12 + - name: harbor-registry + url: https://registry.bstein.dev/v2/ + accepted_statuses: [401] + body_contains: unauthorized + timeout_seconds: 12 + - name: longhorn-auth + url: https://longhorn.bstein.dev/ + accepted_statuses: [302] + body_contains: openid-connect/auth + timeout_seconds: 12 require_flux_health: true flux_health_wait_seconds: 900 flux_health_poll_seconds: 5 diff --git a/configs/ananke.titan-db.yaml b/configs/ananke.titan-db.yaml index c3d447c..70b7ad4 100644 --- a/configs/ananke.titan-db.yaml +++ b/configs/ananke.titan-db.yaml @@ -155,6 +155,21 @@ startup: accepted_statuses: [200] body_contains: '"database":"ok"' timeout_seconds: 12 + - name: keycloak-oidc + url: https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration + accepted_statuses: [200] + body_contains: '"issuer":"https://sso.bstein.dev/realms/atlas"' + timeout_seconds: 12 + - name: harbor-registry + url: https://registry.bstein.dev/v2/ + accepted_statuses: [401] + body_contains: unauthorized + timeout_seconds: 12 + - name: longhorn-auth + url: https://longhorn.bstein.dev/ + accepted_statuses: [302] + body_contains: openid-connect/auth + timeout_seconds: 12 require_flux_health: true flux_health_wait_seconds: 900 flux_health_poll_seconds: 5 diff --git a/internal/cluster/orchestrator.go b/internal/cluster/orchestrator.go index faf5840..eda3103 100644 --- a/internal/cluster/orchestrator.go +++ b/internal/cluster/orchestrator.go @@ -2010,10 +2010,11 @@ type podList struct { type podResource struct { Metadata struct { - Namespace string `json:"namespace"` - Name string `json:"name"` - CreationTimestamp time.Time `json:"creationTimestamp"` - OwnerReferences []ownerReference `json:"ownerReferences"` + Namespace string `json:"namespace"` + Name string `json:"name"` + Annotations map[string]string `json:"annotations"` + CreationTimestamp time.Time `json:"creationTimestamp"` + OwnerReferences []ownerReference `json:"ownerReferences"` } `json:"metadata"` Spec struct { NodeName string `json:"nodeName"` @@ -2031,17 +2032,23 @@ type ownerReference struct { } type podContainerStatus struct { + Name string `json:"name"` State podContainerState `json:"state"` } type podContainerState struct { Waiting *podContainerWaitingState `json:"waiting"` + Running *podContainerRunningState `json:"running"` } type podContainerWaitingState struct { Reason string `json:"reason"` } +type podContainerRunningState struct { + StartedAt time.Time `json:"startedAt"` +} + type podSpec struct { NodeSelector map[string]string `json:"nodeSelector"` Affinity *podAffinity `json:"affinity"` @@ -2159,7 +2166,9 @@ func (o *Orchestrator) waitForServiceChecklist(ctx context.Context) error { deadline := time.Now().Add(wait) lastFailure := "unknown" lastLogged := time.Time{} + lastRecycleAttempt := time.Time{} for { + o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt) prevFailure := lastFailure ready, detail := o.serviceChecklistReady(ctx) lastFailure = detail @@ -2310,8 +2319,10 @@ func (o *Orchestrator) waitForStabilityWindow(ctx context.Context) error { } deadline := time.Now().Add(window) lastStatus := time.Time{} + lastRecycleAttempt := time.Time{} for { + o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt) if err := o.startupStabilityHealthy(ctx); err != nil { return fmt.Errorf("startup stability window failed: %w", err) } @@ -2383,7 +2394,9 @@ func (o *Orchestrator) waitForFluxHealth(ctx context.Context) error { lastFailure := "unknown" lastLogged := time.Time{} lastImmutableHealAttempt := time.Time{} + lastRecycleAttempt := time.Time{} for { + o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt) prevFailure := lastFailure ready, detail, err := o.fluxHealthReady(ctx) if err != nil { @@ -2556,11 +2569,10 @@ func (o *Orchestrator) waitForWorkloadConvergence(ctx context.Context) error { deadline := time.Now().Add(wait) lastFailure := "unknown" lastLogged := time.Time{} + lastRecycleAttempt := time.Time{} for { prevFailure := lastFailure - if o.cfg.Startup.AutoRecycleStuckPods { - o.bestEffort("recycle stuck controller pods", func() error { return o.recycleStuckControllerPods(ctx) }) - } + o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt) ready, detail, err := o.workloadConvergenceReady(ctx) if err != nil { lastFailure = err.Error() @@ -2708,6 +2720,9 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error { continue } reason := stuckContainerReason(pod, stuckReasons) + if reason == "" { + reason = stuckVaultInitReason(pod, grace) + } if reason == "" { continue } @@ -2757,6 +2772,43 @@ func stuckContainerReason(p podResource, reasons map[string]struct{}) string { return check(p.Status.ContainerStatuses) } +func stuckVaultInitReason(p podResource, grace time.Duration) string { + if !strings.EqualFold(strings.TrimSpace(p.Status.Phase), "Pending") { + return "" + } + if !strings.EqualFold(strings.TrimSpace(p.Metadata.Annotations["vault.hashicorp.com/agent-inject"]), "true") { + return "" + } + for _, st := range p.Status.InitContainerStatuses { + if strings.TrimSpace(st.Name) != "vault-agent-init" || st.State.Running == nil { + continue + } + startedAt := st.State.Running.StartedAt + if startedAt.IsZero() { + continue + } + if time.Since(startedAt) < grace { + return "" + } + return "VaultInitStuck" + } + return "" +} + +func (o *Orchestrator) maybeAutoRecycleStuckPods(ctx context.Context, lastAttempt *time.Time) { + if o.runner.DryRun || !o.cfg.Startup.AutoRecycleStuckPods { + return + } + now := time.Now() + if lastAttempt != nil && !lastAttempt.IsZero() && now.Sub(*lastAttempt) < 30*time.Second { + return + } + if lastAttempt != nil { + *lastAttempt = now + } + o.bestEffort("recycle stuck controller pods", func() error { return o.recycleStuckControllerPods(ctx) }) +} + func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error) { out, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json") if err != nil { @@ -2777,6 +2829,10 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error) "CreateContainerError": {}, "RunContainerError": {}, } + grace := time.Duration(o.cfg.Startup.StuckPodGraceSeconds) * time.Second + if grace <= 0 { + grace = 180 * time.Second + } failures := []string{} for _, pod := range list.Items { @@ -2792,6 +2848,9 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error) continue } reason := stuckContainerReason(pod, stuckReasons) + if reason == "" { + reason = stuckVaultInitReason(pod, grace) + } if reason == "" { continue } diff --git a/internal/cluster/orchestrator_test.go b/internal/cluster/orchestrator_test.go index 72a9fca..b4422c9 100644 --- a/internal/cluster/orchestrator_test.go +++ b/internal/cluster/orchestrator_test.go @@ -214,3 +214,53 @@ func TestServiceCheckReadyBodyContainsIgnoresWhitespace(t *testing.T) { t.Fatalf("expected whitespace-tolerant service check to pass, detail=%s", detail) } } + +func TestStuckVaultInitReasonDetectsHungInit(t *testing.T) { + var pod podResource + pod.Status.Phase = "Pending" + pod.Metadata.Annotations = map[string]string{ + "vault.hashicorp.com/agent-inject": "true", + } + pod.Status.InitContainerStatuses = []podContainerStatus{ + { + Name: "vault-agent-init", + State: podContainerState{ + Running: &podContainerRunningState{ + StartedAt: time.Now().Add(-10 * time.Minute), + }, + }, + }, + } + + reason := stuckVaultInitReason(pod, 3*time.Minute) + if reason != "VaultInitStuck" { + t.Fatalf("expected VaultInitStuck reason, got %q", reason) + } +} + +func TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T) { + var pod podResource + pod.Status.Phase = "Pending" + pod.Metadata.Annotations = map[string]string{ + "vault.hashicorp.com/agent-inject": "true", + } + pod.Status.InitContainerStatuses = []podContainerStatus{ + { + Name: "vault-agent-init", + State: podContainerState{ + Running: &podContainerRunningState{ + StartedAt: time.Now().Add(-30 * time.Second), + }, + }, + }, + } + if reason := stuckVaultInitReason(pod, 3*time.Minute); reason != "" { + t.Fatalf("expected no reason for fresh init, got %q", reason) + } + + pod.Metadata.Annotations["vault.hashicorp.com/agent-inject"] = "false" + pod.Status.InitContainerStatuses[0].State.Running.StartedAt = time.Now().Add(-10 * time.Minute) + if reason := stuckVaultInitReason(pod, 3*time.Minute); reason != "" { + t.Fatalf("expected no reason for non-vault pod, got %q", reason) + } +} diff --git a/internal/config/config.go b/internal/config/config.go index d452fe1..a539498 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -492,6 +492,27 @@ func defaults() Config { BodyContains: "\"database\":\"ok\"", TimeoutSeconds: 12, }, + { + Name: "keycloak-oidc", + URL: "https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration", + AcceptedStatuses: []int{200}, + BodyContains: "\"issuer\":\"https://sso.bstein.dev/realms/atlas\"", + TimeoutSeconds: 12, + }, + { + Name: "harbor-registry", + URL: "https://registry.bstein.dev/v2/", + AcceptedStatuses: []int{401}, + BodyContains: "unauthorized", + TimeoutSeconds: 12, + }, + { + Name: "longhorn-auth", + URL: "https://longhorn.bstein.dev/", + AcceptedStatuses: []int{302}, + BodyContains: "openid-connect/auth", + TimeoutSeconds: 12, + }, }, RequireFluxHealth: true, FluxHealthWaitSeconds: 900,