startup: auto-heal stuck vault-init and broaden external checks

This commit is contained in:
Brad Stein 2026-04-07 14:19:48 -03:00
parent 78faf9a123
commit 00a2528908
7 changed files with 183 additions and 8 deletions

View File

@ -28,7 +28,7 @@ Ananke does **not** stop at “Flux says Ready”. Startup only completes when a
- Flux source drift guard passes (`expected_flux_source_url` + branch expectation)
- Flux kustomizations are healthy
- controller convergence is healthy (deployments/statefulsets/daemonsets)
- external service checklist passes (for example Gitea + Grafana health endpoints)
- external service checklist passes (Gitea, Grafana, Keycloak OIDC, Harbor registry auth challenge, Longhorn auth redirect)
- stability soak window passes (no regressions, no CrashLoop/ImagePull failures)
If any gate fails, startup is blocked with a concrete reason.

View File

@ -89,6 +89,21 @@ startup:
accepted_statuses: [200]
body_contains: '"database":"ok"'
timeout_seconds: 12
- name: keycloak-oidc
url: https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration
accepted_statuses: [200]
body_contains: '"issuer":"https://sso.bstein.dev/realms/atlas"'
timeout_seconds: 12
- name: harbor-registry
url: https://registry.bstein.dev/v2/
accepted_statuses: [401]
body_contains: unauthorized
timeout_seconds: 12
- name: longhorn-auth
url: https://longhorn.bstein.dev/
accepted_statuses: [302]
body_contains: openid-connect/auth
timeout_seconds: 12
require_flux_health: true
flux_health_wait_seconds: 900
flux_health_poll_seconds: 5

View File

@ -155,6 +155,21 @@ startup:
accepted_statuses: [200]
body_contains: '"database":"ok"'
timeout_seconds: 12
- name: keycloak-oidc
url: https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration
accepted_statuses: [200]
body_contains: '"issuer":"https://sso.bstein.dev/realms/atlas"'
timeout_seconds: 12
- name: harbor-registry
url: https://registry.bstein.dev/v2/
accepted_statuses: [401]
body_contains: unauthorized
timeout_seconds: 12
- name: longhorn-auth
url: https://longhorn.bstein.dev/
accepted_statuses: [302]
body_contains: openid-connect/auth
timeout_seconds: 12
require_flux_health: true
flux_health_wait_seconds: 900
flux_health_poll_seconds: 5

View File

@ -155,6 +155,21 @@ startup:
accepted_statuses: [200]
body_contains: '"database":"ok"'
timeout_seconds: 12
- name: keycloak-oidc
url: https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration
accepted_statuses: [200]
body_contains: '"issuer":"https://sso.bstein.dev/realms/atlas"'
timeout_seconds: 12
- name: harbor-registry
url: https://registry.bstein.dev/v2/
accepted_statuses: [401]
body_contains: unauthorized
timeout_seconds: 12
- name: longhorn-auth
url: https://longhorn.bstein.dev/
accepted_statuses: [302]
body_contains: openid-connect/auth
timeout_seconds: 12
require_flux_health: true
flux_health_wait_seconds: 900
flux_health_poll_seconds: 5

View File

@ -2010,10 +2010,11 @@ type podList struct {
type podResource struct {
Metadata struct {
Namespace string `json:"namespace"`
Name string `json:"name"`
CreationTimestamp time.Time `json:"creationTimestamp"`
OwnerReferences []ownerReference `json:"ownerReferences"`
Namespace string `json:"namespace"`
Name string `json:"name"`
Annotations map[string]string `json:"annotations"`
CreationTimestamp time.Time `json:"creationTimestamp"`
OwnerReferences []ownerReference `json:"ownerReferences"`
} `json:"metadata"`
Spec struct {
NodeName string `json:"nodeName"`
@ -2031,17 +2032,23 @@ type ownerReference struct {
}
// podContainerStatus is one container status entry decoded from pod JSON.
// Only the container name and its current state are captured here.
type podContainerStatus struct {
// Name is the container name (JSON key "name").
Name string `json:"name"`
// State holds the container's waiting/running details (JSON key "state").
State podContainerState `json:"state"`
}
// podContainerState captures the "waiting" and "running" members of a
// container state object. Both fields are pointers, so a nil value means the
// corresponding key was absent (or null) in the decoded JSON.
type podContainerState struct {
// Waiting is non-nil when the state object carried a "waiting" entry.
Waiting *podContainerWaitingState `json:"waiting"`
// Running is non-nil when the state object carried a "running" entry;
// stuckVaultInitReason skips statuses where this is nil.
Running *podContainerRunningState `json:"running"`
}
// podContainerWaitingState is the decoded "waiting" member of a container state.
type podContainerWaitingState struct {
// Reason is the value of the "reason" key.
// NOTE(review): presumably compared against the stuck-reason set (e.g.
// "CreateContainerError") by stuckContainerReason — confirm against that helper.
Reason string `json:"reason"`
}
// podContainerRunningState is the decoded "running" member of a container state.
type podContainerRunningState struct {
// StartedAt is the value of the "startedAt" key; stuckVaultInitReason uses
// it to measure how long the vault-agent-init container has been running.
StartedAt time.Time `json:"startedAt"`
}
type podSpec struct {
NodeSelector map[string]string `json:"nodeSelector"`
Affinity *podAffinity `json:"affinity"`
@ -2159,7 +2166,9 @@ func (o *Orchestrator) waitForServiceChecklist(ctx context.Context) error {
deadline := time.Now().Add(wait)
lastFailure := "unknown"
lastLogged := time.Time{}
lastRecycleAttempt := time.Time{}
for {
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
prevFailure := lastFailure
ready, detail := o.serviceChecklistReady(ctx)
lastFailure = detail
@ -2310,8 +2319,10 @@ func (o *Orchestrator) waitForStabilityWindow(ctx context.Context) error {
}
deadline := time.Now().Add(window)
lastStatus := time.Time{}
lastRecycleAttempt := time.Time{}
for {
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
if err := o.startupStabilityHealthy(ctx); err != nil {
return fmt.Errorf("startup stability window failed: %w", err)
}
@ -2383,7 +2394,9 @@ func (o *Orchestrator) waitForFluxHealth(ctx context.Context) error {
lastFailure := "unknown"
lastLogged := time.Time{}
lastImmutableHealAttempt := time.Time{}
lastRecycleAttempt := time.Time{}
for {
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
prevFailure := lastFailure
ready, detail, err := o.fluxHealthReady(ctx)
if err != nil {
@ -2556,11 +2569,10 @@ func (o *Orchestrator) waitForWorkloadConvergence(ctx context.Context) error {
deadline := time.Now().Add(wait)
lastFailure := "unknown"
lastLogged := time.Time{}
lastRecycleAttempt := time.Time{}
for {
prevFailure := lastFailure
if o.cfg.Startup.AutoRecycleStuckPods {
o.bestEffort("recycle stuck controller pods", func() error { return o.recycleStuckControllerPods(ctx) })
}
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
ready, detail, err := o.workloadConvergenceReady(ctx)
if err != nil {
lastFailure = err.Error()
@ -2708,6 +2720,9 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
continue
}
reason := stuckContainerReason(pod, stuckReasons)
if reason == "" {
reason = stuckVaultInitReason(pod, grace)
}
if reason == "" {
continue
}
@ -2757,6 +2772,43 @@ func stuckContainerReason(p podResource, reasons map[string]struct{}) string {
return check(p.Status.ContainerStatuses)
}
// stuckVaultInitReason returns "VaultInitStuck" when pod p looks wedged in its
// Vault agent init container, and "" otherwise.
//
// A pod qualifies only if all of the following hold:
//   - its phase is "Pending" (compared case-insensitively after trimming),
//   - its "vault.hashicorp.com/agent-inject" annotation is "true" (same
//     trimming and case-insensitive comparison),
//   - an init container named "vault-agent-init" reports a running state with
//     a non-zero start time that is at least grace in the past.
//
// Init container statuses that are not running, or whose start time is zero,
// are skipped. The first usable vault-agent-init status decides the result.
func stuckVaultInitReason(p podResource, grace time.Duration) string {
	const (
		injectAnnotation  = "vault.hashicorp.com/agent-inject"
		initContainerName = "vault-agent-init"
	)

	phase := strings.TrimSpace(p.Status.Phase)
	inject := strings.TrimSpace(p.Metadata.Annotations[injectAnnotation])
	if !strings.EqualFold(phase, "Pending") || !strings.EqualFold(inject, "true") {
		return ""
	}

	statuses := p.Status.InitContainerStatuses
	for i := range statuses {
		running := statuses[i].State.Running
		if running == nil || strings.TrimSpace(statuses[i].Name) != initContainerName {
			continue
		}
		if running.StartedAt.IsZero() {
			// No usable start time; keep looking at the remaining statuses.
			continue
		}
		if time.Since(running.StartedAt) >= grace {
			return "VaultInitStuck"
		}
		// Still within the grace period: not considered stuck yet.
		return ""
	}
	return ""
}
// maybeAutoRecycleStuckPods triggers a best-effort recycle of stuck controller
// pods, throttled so that a given lastAttempt tracker fires at most once every
// 30 seconds.
//
// It does nothing when the runner is in dry-run mode or when
// Startup.AutoRecycleStuckPods is disabled. lastAttempt records when the
// previous attempt was started; a zero value means "never attempted". It is
// updated before the recycle runs. A nil lastAttempt disables throttling, so
// every call attempts a recycle.
func (o *Orchestrator) maybeAutoRecycleStuckPods(ctx context.Context, lastAttempt *time.Time) {
	const minInterval = 30 * time.Second

	if o.runner.DryRun {
		return
	}
	if !o.cfg.Startup.AutoRecycleStuckPods {
		return
	}

	recycle := func() error { return o.recycleStuckControllerPods(ctx) }

	if lastAttempt == nil {
		// No tracker supplied: nothing to throttle against.
		o.bestEffort("recycle stuck controller pods", recycle)
		return
	}

	attemptAt := time.Now()
	if !lastAttempt.IsZero() && attemptAt.Sub(*lastAttempt) < minInterval {
		return
	}
	*lastAttempt = attemptAt
	o.bestEffort("recycle stuck controller pods", recycle)
}
func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error) {
out, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
if err != nil {
@ -2777,6 +2829,10 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error)
"CreateContainerError": {},
"RunContainerError": {},
}
grace := time.Duration(o.cfg.Startup.StuckPodGraceSeconds) * time.Second
if grace <= 0 {
grace = 180 * time.Second
}
failures := []string{}
for _, pod := range list.Items {
@ -2792,6 +2848,9 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error)
continue
}
reason := stuckContainerReason(pod, stuckReasons)
if reason == "" {
reason = stuckVaultInitReason(pod, grace)
}
if reason == "" {
continue
}

View File

@ -214,3 +214,53 @@ func TestServiceCheckReadyBodyContainsIgnoresWhitespace(t *testing.T) {
t.Fatalf("expected whitespace-tolerant service check to pass, detail=%s", detail)
}
}
// TestStuckVaultInitReasonDetectsHungInit checks that a Pending pod with Vault
// agent injection enabled, whose vault-agent-init container has been running
// for longer than the grace period, is reported as VaultInitStuck.
func TestStuckVaultInitReasonDetectsHungInit(t *testing.T) {
	// Started ten minutes ago, well beyond the three-minute grace used below.
	hungSince := time.Now().Add(-10 * time.Minute)

	var pod podResource
	pod.Metadata.Annotations = map[string]string{"vault.hashicorp.com/agent-inject": "true"}
	pod.Status.Phase = "Pending"
	pod.Status.InitContainerStatuses = []podContainerStatus{{
		Name:  "vault-agent-init",
		State: podContainerState{Running: &podContainerRunningState{StartedAt: hungSince}},
	}}

	if got := stuckVaultInitReason(pod, 3*time.Minute); got != "VaultInitStuck" {
		t.Fatalf("expected VaultInitStuck reason, got %q", got)
	}
}
func TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T) {
var pod podResource
pod.Status.Phase = "Pending"
pod.Metadata.Annotations = map[string]string{
"vault.hashicorp.com/agent-inject": "true",
}
pod.Status.InitContainerStatuses = []podContainerStatus{
{
Name: "vault-agent-init",
State: podContainerState{
Running: &podContainerRunningState{
StartedAt: time.Now().Add(-30 * time.Second),
},
},
},
}
if reason := stuckVaultInitReason(pod, 3*time.Minute); reason != "" {
t.Fatalf("expected no reason for fresh init, got %q", reason)
}
pod.Metadata.Annotations["vault.hashicorp.com/agent-inject"] = "false"
pod.Status.InitContainerStatuses[0].State.Running.StartedAt = time.Now().Add(-10 * time.Minute)
if reason := stuckVaultInitReason(pod, 3*time.Minute); reason != "" {
t.Fatalf("expected no reason for non-vault pod, got %q", reason)
}
}

View File

@ -492,6 +492,27 @@ func defaults() Config {
BodyContains: "\"database\":\"ok\"",
TimeoutSeconds: 12,
},
{
Name: "keycloak-oidc",
URL: "https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
AcceptedStatuses: []int{200},
BodyContains: "\"issuer\":\"https://sso.bstein.dev/realms/atlas\"",
TimeoutSeconds: 12,
},
{
Name: "harbor-registry",
URL: "https://registry.bstein.dev/v2/",
AcceptedStatuses: []int{401},
BodyContains: "unauthorized",
TimeoutSeconds: 12,
},
{
Name: "longhorn-auth",
URL: "https://longhorn.bstein.dev/",
AcceptedStatuses: []int{302},
BodyContains: "openid-connect/auth",
TimeoutSeconds: 12,
},
},
RequireFluxHealth: true,
FluxHealthWaitSeconds: 900,