startup: auto-heal stuck vault-init and broaden external checks
This commit is contained in:
parent
78faf9a123
commit
00a2528908
@ -28,7 +28,7 @@ Ananke does **not** stop at “Flux says Ready”. Startup only completes when a
|
||||
- Flux source drift guard passes (`expected_flux_source_url` + branch expectation)
|
||||
- Flux kustomizations are healthy
|
||||
- controller convergence is healthy (deployments/statefulsets/daemonsets)
|
||||
- external service checklist passes (for example Gitea + Grafana health endpoints)
|
||||
- external service checklist passes (Gitea, Grafana, Keycloak OIDC, Harbor registry auth challenge, Longhorn auth redirect)
|
||||
- stability soak window passes (no regressions, no CrashLoop/ImagePull failures)
|
||||
|
||||
If any gate fails, startup is blocked with a concrete reason.
|
||||
|
||||
@ -89,6 +89,21 @@ startup:
|
||||
accepted_statuses: [200]
|
||||
body_contains: '"database":"ok"'
|
||||
timeout_seconds: 12
|
||||
- name: keycloak-oidc
|
||||
url: https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration
|
||||
accepted_statuses: [200]
|
||||
body_contains: '"issuer":"https://sso.bstein.dev/realms/atlas"'
|
||||
timeout_seconds: 12
|
||||
- name: harbor-registry
|
||||
url: https://registry.bstein.dev/v2/
|
||||
accepted_statuses: [401]
|
||||
body_contains: unauthorized
|
||||
timeout_seconds: 12
|
||||
- name: longhorn-auth
|
||||
url: https://longhorn.bstein.dev/
|
||||
accepted_statuses: [302]
|
||||
body_contains: openid-connect/auth
|
||||
timeout_seconds: 12
|
||||
require_flux_health: true
|
||||
flux_health_wait_seconds: 900
|
||||
flux_health_poll_seconds: 5
|
||||
|
||||
@ -155,6 +155,21 @@ startup:
|
||||
accepted_statuses: [200]
|
||||
body_contains: '"database":"ok"'
|
||||
timeout_seconds: 12
|
||||
- name: keycloak-oidc
|
||||
url: https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration
|
||||
accepted_statuses: [200]
|
||||
body_contains: '"issuer":"https://sso.bstein.dev/realms/atlas"'
|
||||
timeout_seconds: 12
|
||||
- name: harbor-registry
|
||||
url: https://registry.bstein.dev/v2/
|
||||
accepted_statuses: [401]
|
||||
body_contains: unauthorized
|
||||
timeout_seconds: 12
|
||||
- name: longhorn-auth
|
||||
url: https://longhorn.bstein.dev/
|
||||
accepted_statuses: [302]
|
||||
body_contains: openid-connect/auth
|
||||
timeout_seconds: 12
|
||||
require_flux_health: true
|
||||
flux_health_wait_seconds: 900
|
||||
flux_health_poll_seconds: 5
|
||||
|
||||
@ -155,6 +155,21 @@ startup:
|
||||
accepted_statuses: [200]
|
||||
body_contains: '"database":"ok"'
|
||||
timeout_seconds: 12
|
||||
- name: keycloak-oidc
|
||||
url: https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration
|
||||
accepted_statuses: [200]
|
||||
body_contains: '"issuer":"https://sso.bstein.dev/realms/atlas"'
|
||||
timeout_seconds: 12
|
||||
- name: harbor-registry
|
||||
url: https://registry.bstein.dev/v2/
|
||||
accepted_statuses: [401]
|
||||
body_contains: unauthorized
|
||||
timeout_seconds: 12
|
||||
- name: longhorn-auth
|
||||
url: https://longhorn.bstein.dev/
|
||||
accepted_statuses: [302]
|
||||
body_contains: openid-connect/auth
|
||||
timeout_seconds: 12
|
||||
require_flux_health: true
|
||||
flux_health_wait_seconds: 900
|
||||
flux_health_poll_seconds: 5
|
||||
|
||||
@ -2010,10 +2010,11 @@ type podList struct {
|
||||
|
||||
type podResource struct {
|
||||
Metadata struct {
|
||||
Namespace string `json:"namespace"`
|
||||
Name string `json:"name"`
|
||||
CreationTimestamp time.Time `json:"creationTimestamp"`
|
||||
OwnerReferences []ownerReference `json:"ownerReferences"`
|
||||
Namespace string `json:"namespace"`
|
||||
Name string `json:"name"`
|
||||
Annotations map[string]string `json:"annotations"`
|
||||
CreationTimestamp time.Time `json:"creationTimestamp"`
|
||||
OwnerReferences []ownerReference `json:"ownerReferences"`
|
||||
} `json:"metadata"`
|
||||
Spec struct {
|
||||
NodeName string `json:"nodeName"`
|
||||
@ -2031,17 +2032,23 @@ type ownerReference struct {
|
||||
}
|
||||
|
||||
// Container status types. Each field carries a JSON tag so the values can be
// decoded from a pod's status document.
type (
	// podContainerStatus pairs a container name with its current state.
	podContainerStatus struct {
		Name  string            `json:"name"`
		State podContainerState `json:"state"`
	}

	// podContainerState holds the optional waiting and running sub-states.
	// A nil pointer means the corresponding key was absent.
	podContainerState struct {
		Waiting *podContainerWaitingState `json:"waiting"`
		Running *podContainerRunningState `json:"running"`
	}

	// podContainerWaitingState carries the reason a container is waiting.
	podContainerWaitingState struct {
		Reason string `json:"reason"`
	}

	// podContainerRunningState records when a running container started.
	podContainerRunningState struct {
		StartedAt time.Time `json:"startedAt"`
	}
)
|
||||
|
||||
type podSpec struct {
|
||||
NodeSelector map[string]string `json:"nodeSelector"`
|
||||
Affinity *podAffinity `json:"affinity"`
|
||||
@ -2159,7 +2166,9 @@ func (o *Orchestrator) waitForServiceChecklist(ctx context.Context) error {
|
||||
deadline := time.Now().Add(wait)
|
||||
lastFailure := "unknown"
|
||||
lastLogged := time.Time{}
|
||||
lastRecycleAttempt := time.Time{}
|
||||
for {
|
||||
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
|
||||
prevFailure := lastFailure
|
||||
ready, detail := o.serviceChecklistReady(ctx)
|
||||
lastFailure = detail
|
||||
@ -2310,8 +2319,10 @@ func (o *Orchestrator) waitForStabilityWindow(ctx context.Context) error {
|
||||
}
|
||||
deadline := time.Now().Add(window)
|
||||
lastStatus := time.Time{}
|
||||
lastRecycleAttempt := time.Time{}
|
||||
|
||||
for {
|
||||
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
|
||||
if err := o.startupStabilityHealthy(ctx); err != nil {
|
||||
return fmt.Errorf("startup stability window failed: %w", err)
|
||||
}
|
||||
@ -2383,7 +2394,9 @@ func (o *Orchestrator) waitForFluxHealth(ctx context.Context) error {
|
||||
lastFailure := "unknown"
|
||||
lastLogged := time.Time{}
|
||||
lastImmutableHealAttempt := time.Time{}
|
||||
lastRecycleAttempt := time.Time{}
|
||||
for {
|
||||
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
|
||||
prevFailure := lastFailure
|
||||
ready, detail, err := o.fluxHealthReady(ctx)
|
||||
if err != nil {
|
||||
@ -2556,11 +2569,10 @@ func (o *Orchestrator) waitForWorkloadConvergence(ctx context.Context) error {
|
||||
deadline := time.Now().Add(wait)
|
||||
lastFailure := "unknown"
|
||||
lastLogged := time.Time{}
|
||||
lastRecycleAttempt := time.Time{}
|
||||
for {
|
||||
prevFailure := lastFailure
|
||||
if o.cfg.Startup.AutoRecycleStuckPods {
|
||||
o.bestEffort("recycle stuck controller pods", func() error { return o.recycleStuckControllerPods(ctx) })
|
||||
}
|
||||
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
|
||||
ready, detail, err := o.workloadConvergenceReady(ctx)
|
||||
if err != nil {
|
||||
lastFailure = err.Error()
|
||||
@ -2708,6 +2720,9 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
|
||||
continue
|
||||
}
|
||||
reason := stuckContainerReason(pod, stuckReasons)
|
||||
if reason == "" {
|
||||
reason = stuckVaultInitReason(pod, grace)
|
||||
}
|
||||
if reason == "" {
|
||||
continue
|
||||
}
|
||||
@ -2757,6 +2772,43 @@ func stuckContainerReason(p podResource, reasons map[string]struct{}) string {
|
||||
return check(p.Status.ContainerStatuses)
|
||||
}
|
||||
|
||||
func stuckVaultInitReason(p podResource, grace time.Duration) string {
|
||||
if !strings.EqualFold(strings.TrimSpace(p.Status.Phase), "Pending") {
|
||||
return ""
|
||||
}
|
||||
if !strings.EqualFold(strings.TrimSpace(p.Metadata.Annotations["vault.hashicorp.com/agent-inject"]), "true") {
|
||||
return ""
|
||||
}
|
||||
for _, st := range p.Status.InitContainerStatuses {
|
||||
if strings.TrimSpace(st.Name) != "vault-agent-init" || st.State.Running == nil {
|
||||
continue
|
||||
}
|
||||
startedAt := st.State.Running.StartedAt
|
||||
if startedAt.IsZero() {
|
||||
continue
|
||||
}
|
||||
if time.Since(startedAt) < grace {
|
||||
return ""
|
||||
}
|
||||
return "VaultInitStuck"
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (o *Orchestrator) maybeAutoRecycleStuckPods(ctx context.Context, lastAttempt *time.Time) {
|
||||
if o.runner.DryRun || !o.cfg.Startup.AutoRecycleStuckPods {
|
||||
return
|
||||
}
|
||||
now := time.Now()
|
||||
if lastAttempt != nil && !lastAttempt.IsZero() && now.Sub(*lastAttempt) < 30*time.Second {
|
||||
return
|
||||
}
|
||||
if lastAttempt != nil {
|
||||
*lastAttempt = now
|
||||
}
|
||||
o.bestEffort("recycle stuck controller pods", func() error { return o.recycleStuckControllerPods(ctx) })
|
||||
}
|
||||
|
||||
func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error) {
|
||||
out, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
|
||||
if err != nil {
|
||||
@ -2777,6 +2829,10 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error)
|
||||
"CreateContainerError": {},
|
||||
"RunContainerError": {},
|
||||
}
|
||||
grace := time.Duration(o.cfg.Startup.StuckPodGraceSeconds) * time.Second
|
||||
if grace <= 0 {
|
||||
grace = 180 * time.Second
|
||||
}
|
||||
|
||||
failures := []string{}
|
||||
for _, pod := range list.Items {
|
||||
@ -2792,6 +2848,9 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error)
|
||||
continue
|
||||
}
|
||||
reason := stuckContainerReason(pod, stuckReasons)
|
||||
if reason == "" {
|
||||
reason = stuckVaultInitReason(pod, grace)
|
||||
}
|
||||
if reason == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
@ -214,3 +214,53 @@ func TestServiceCheckReadyBodyContainsIgnoresWhitespace(t *testing.T) {
|
||||
t.Fatalf("expected whitespace-tolerant service check to pass, detail=%s", detail)
|
||||
}
|
||||
}
|
||||
|
||||
func TestStuckVaultInitReasonDetectsHungInit(t *testing.T) {
|
||||
var pod podResource
|
||||
pod.Status.Phase = "Pending"
|
||||
pod.Metadata.Annotations = map[string]string{
|
||||
"vault.hashicorp.com/agent-inject": "true",
|
||||
}
|
||||
pod.Status.InitContainerStatuses = []podContainerStatus{
|
||||
{
|
||||
Name: "vault-agent-init",
|
||||
State: podContainerState{
|
||||
Running: &podContainerRunningState{
|
||||
StartedAt: time.Now().Add(-10 * time.Minute),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
reason := stuckVaultInitReason(pod, 3*time.Minute)
|
||||
if reason != "VaultInitStuck" {
|
||||
t.Fatalf("expected VaultInitStuck reason, got %q", reason)
|
||||
}
|
||||
}
|
||||
|
||||
func TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T) {
|
||||
var pod podResource
|
||||
pod.Status.Phase = "Pending"
|
||||
pod.Metadata.Annotations = map[string]string{
|
||||
"vault.hashicorp.com/agent-inject": "true",
|
||||
}
|
||||
pod.Status.InitContainerStatuses = []podContainerStatus{
|
||||
{
|
||||
Name: "vault-agent-init",
|
||||
State: podContainerState{
|
||||
Running: &podContainerRunningState{
|
||||
StartedAt: time.Now().Add(-30 * time.Second),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
if reason := stuckVaultInitReason(pod, 3*time.Minute); reason != "" {
|
||||
t.Fatalf("expected no reason for fresh init, got %q", reason)
|
||||
}
|
||||
|
||||
pod.Metadata.Annotations["vault.hashicorp.com/agent-inject"] = "false"
|
||||
pod.Status.InitContainerStatuses[0].State.Running.StartedAt = time.Now().Add(-10 * time.Minute)
|
||||
if reason := stuckVaultInitReason(pod, 3*time.Minute); reason != "" {
|
||||
t.Fatalf("expected no reason for non-vault pod, got %q", reason)
|
||||
}
|
||||
}
|
||||
|
||||
@ -492,6 +492,27 @@ func defaults() Config {
|
||||
BodyContains: "\"database\":\"ok\"",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "keycloak-oidc",
|
||||
URL: "https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
|
||||
AcceptedStatuses: []int{200},
|
||||
BodyContains: "\"issuer\":\"https://sso.bstein.dev/realms/atlas\"",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "harbor-registry",
|
||||
URL: "https://registry.bstein.dev/v2/",
|
||||
AcceptedStatuses: []int{401},
|
||||
BodyContains: "unauthorized",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
{
|
||||
Name: "longhorn-auth",
|
||||
URL: "https://longhorn.bstein.dev/",
|
||||
AcceptedStatuses: []int{302},
|
||||
BodyContains: "openid-connect/auth",
|
||||
TimeoutSeconds: 12,
|
||||
},
|
||||
},
|
||||
RequireFluxHealth: true,
|
||||
FluxHealthWaitSeconds: 900,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user