startup: auto-heal stuck vault-init and broaden external checks
This commit is contained in:
parent
78faf9a123
commit
00a2528908
@ -28,7 +28,7 @@ Ananke does **not** stop at “Flux says Ready”. Startup only completes when a
|
|||||||
- Flux source drift guard passes (`expected_flux_source_url` + branch expectation)
|
- Flux source drift guard passes (`expected_flux_source_url` + branch expectation)
|
||||||
- Flux kustomizations are healthy
|
- Flux kustomizations are healthy
|
||||||
- controller convergence is healthy (deployments/statefulsets/daemonsets)
|
- controller convergence is healthy (deployments/statefulsets/daemonsets)
|
||||||
- external service checklist passes (for example Gitea + Grafana health endpoints)
|
- external service checklist passes (Gitea, Grafana, Keycloak OIDC, Harbor registry auth challenge, Longhorn auth redirect)
|
||||||
- stability soak window passes (no regressions, no CrashLoop/ImagePull failures)
|
- stability soak window passes (no regressions, no CrashLoop/ImagePull failures)
|
||||||
|
|
||||||
If any gate fails, startup is blocked with a concrete reason.
|
If any gate fails, startup is blocked with a concrete reason.
|
||||||
|
|||||||
@ -89,6 +89,21 @@ startup:
|
|||||||
accepted_statuses: [200]
|
accepted_statuses: [200]
|
||||||
body_contains: '"database":"ok"'
|
body_contains: '"database":"ok"'
|
||||||
timeout_seconds: 12
|
timeout_seconds: 12
|
||||||
|
- name: keycloak-oidc
|
||||||
|
url: https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration
|
||||||
|
accepted_statuses: [200]
|
||||||
|
body_contains: '"issuer":"https://sso.bstein.dev/realms/atlas"'
|
||||||
|
timeout_seconds: 12
|
||||||
|
- name: harbor-registry
|
||||||
|
url: https://registry.bstein.dev/v2/
|
||||||
|
accepted_statuses: [401]
|
||||||
|
body_contains: unauthorized
|
||||||
|
timeout_seconds: 12
|
||||||
|
- name: longhorn-auth
|
||||||
|
url: https://longhorn.bstein.dev/
|
||||||
|
accepted_statuses: [302]
|
||||||
|
body_contains: openid-connect/auth
|
||||||
|
timeout_seconds: 12
|
||||||
require_flux_health: true
|
require_flux_health: true
|
||||||
flux_health_wait_seconds: 900
|
flux_health_wait_seconds: 900
|
||||||
flux_health_poll_seconds: 5
|
flux_health_poll_seconds: 5
|
||||||
|
|||||||
@ -155,6 +155,21 @@ startup:
|
|||||||
accepted_statuses: [200]
|
accepted_statuses: [200]
|
||||||
body_contains: '"database":"ok"'
|
body_contains: '"database":"ok"'
|
||||||
timeout_seconds: 12
|
timeout_seconds: 12
|
||||||
|
- name: keycloak-oidc
|
||||||
|
url: https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration
|
||||||
|
accepted_statuses: [200]
|
||||||
|
body_contains: '"issuer":"https://sso.bstein.dev/realms/atlas"'
|
||||||
|
timeout_seconds: 12
|
||||||
|
- name: harbor-registry
|
||||||
|
url: https://registry.bstein.dev/v2/
|
||||||
|
accepted_statuses: [401]
|
||||||
|
body_contains: unauthorized
|
||||||
|
timeout_seconds: 12
|
||||||
|
- name: longhorn-auth
|
||||||
|
url: https://longhorn.bstein.dev/
|
||||||
|
accepted_statuses: [302]
|
||||||
|
body_contains: openid-connect/auth
|
||||||
|
timeout_seconds: 12
|
||||||
require_flux_health: true
|
require_flux_health: true
|
||||||
flux_health_wait_seconds: 900
|
flux_health_wait_seconds: 900
|
||||||
flux_health_poll_seconds: 5
|
flux_health_poll_seconds: 5
|
||||||
|
|||||||
@ -155,6 +155,21 @@ startup:
|
|||||||
accepted_statuses: [200]
|
accepted_statuses: [200]
|
||||||
body_contains: '"database":"ok"'
|
body_contains: '"database":"ok"'
|
||||||
timeout_seconds: 12
|
timeout_seconds: 12
|
||||||
|
- name: keycloak-oidc
|
||||||
|
url: https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration
|
||||||
|
accepted_statuses: [200]
|
||||||
|
body_contains: '"issuer":"https://sso.bstein.dev/realms/atlas"'
|
||||||
|
timeout_seconds: 12
|
||||||
|
- name: harbor-registry
|
||||||
|
url: https://registry.bstein.dev/v2/
|
||||||
|
accepted_statuses: [401]
|
||||||
|
body_contains: unauthorized
|
||||||
|
timeout_seconds: 12
|
||||||
|
- name: longhorn-auth
|
||||||
|
url: https://longhorn.bstein.dev/
|
||||||
|
accepted_statuses: [302]
|
||||||
|
body_contains: openid-connect/auth
|
||||||
|
timeout_seconds: 12
|
||||||
require_flux_health: true
|
require_flux_health: true
|
||||||
flux_health_wait_seconds: 900
|
flux_health_wait_seconds: 900
|
||||||
flux_health_poll_seconds: 5
|
flux_health_poll_seconds: 5
|
||||||
|
|||||||
@ -2010,10 +2010,11 @@ type podList struct {
|
|||||||
|
|
||||||
type podResource struct {
|
type podResource struct {
|
||||||
Metadata struct {
|
Metadata struct {
|
||||||
Namespace string `json:"namespace"`
|
Namespace string `json:"namespace"`
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
CreationTimestamp time.Time `json:"creationTimestamp"`
|
Annotations map[string]string `json:"annotations"`
|
||||||
OwnerReferences []ownerReference `json:"ownerReferences"`
|
CreationTimestamp time.Time `json:"creationTimestamp"`
|
||||||
|
OwnerReferences []ownerReference `json:"ownerReferences"`
|
||||||
} `json:"metadata"`
|
} `json:"metadata"`
|
||||||
Spec struct {
|
Spec struct {
|
||||||
NodeName string `json:"nodeName"`
|
NodeName string `json:"nodeName"`
|
||||||
@ -2031,17 +2032,23 @@ type ownerReference struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// podContainerStatus is the subset of a pod's per-container status that is
// decoded from the pod JSON: the container name and its current state.
// In this diff view the Name field appears only on the new side.
type podContainerStatus struct {
|
type podContainerStatus struct {
|
||||||
|
// Name is compared against "vault-agent-init" by stuckVaultInitReason.
Name string `json:"name"`
|
||||||
State podContainerState `json:"state"`
|
State podContainerState `json:"state"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// podContainerState holds the decoded "waiting" and "running" sub-states of a
// container. Both are pointers; stuckVaultInitReason skips a status whose
// Running pointer is nil.
type podContainerState struct {
|
type podContainerState struct {
|
||||||
Waiting *podContainerWaitingState `json:"waiting"`
|
Waiting *podContainerWaitingState `json:"waiting"`
|
||||||
|
// Running appears only on the new side of this diff view.
Running *podContainerRunningState `json:"running"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// podContainerWaitingState carries the "reason" string of a waiting container.
// NOTE(review): presumably values such as CreateContainerError or
// RunContainerError, as matched by stuckContainerReason — confirm against the
// full file.
type podContainerWaitingState struct {
|
type podContainerWaitingState struct {
|
||||||
Reason string `json:"reason"`
|
Reason string `json:"reason"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// podContainerRunningState carries the "startedAt" timestamp of a running
// container. stuckVaultInitReason measures elapsed time from StartedAt and
// ignores a zero value.
type podContainerRunningState struct {
|
||||||
|
StartedAt time.Time `json:"startedAt"`
|
||||||
|
}
|
||||||
|
|
||||||
type podSpec struct {
|
type podSpec struct {
|
||||||
NodeSelector map[string]string `json:"nodeSelector"`
|
NodeSelector map[string]string `json:"nodeSelector"`
|
||||||
Affinity *podAffinity `json:"affinity"`
|
Affinity *podAffinity `json:"affinity"`
|
||||||
@ -2159,7 +2166,9 @@ func (o *Orchestrator) waitForServiceChecklist(ctx context.Context) error {
|
|||||||
deadline := time.Now().Add(wait)
|
deadline := time.Now().Add(wait)
|
||||||
lastFailure := "unknown"
|
lastFailure := "unknown"
|
||||||
lastLogged := time.Time{}
|
lastLogged := time.Time{}
|
||||||
|
lastRecycleAttempt := time.Time{}
|
||||||
for {
|
for {
|
||||||
|
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
|
||||||
prevFailure := lastFailure
|
prevFailure := lastFailure
|
||||||
ready, detail := o.serviceChecklistReady(ctx)
|
ready, detail := o.serviceChecklistReady(ctx)
|
||||||
lastFailure = detail
|
lastFailure = detail
|
||||||
@ -2310,8 +2319,10 @@ func (o *Orchestrator) waitForStabilityWindow(ctx context.Context) error {
|
|||||||
}
|
}
|
||||||
deadline := time.Now().Add(window)
|
deadline := time.Now().Add(window)
|
||||||
lastStatus := time.Time{}
|
lastStatus := time.Time{}
|
||||||
|
lastRecycleAttempt := time.Time{}
|
||||||
|
|
||||||
for {
|
for {
|
||||||
|
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
|
||||||
if err := o.startupStabilityHealthy(ctx); err != nil {
|
if err := o.startupStabilityHealthy(ctx); err != nil {
|
||||||
return fmt.Errorf("startup stability window failed: %w", err)
|
return fmt.Errorf("startup stability window failed: %w", err)
|
||||||
}
|
}
|
||||||
@ -2383,7 +2394,9 @@ func (o *Orchestrator) waitForFluxHealth(ctx context.Context) error {
|
|||||||
lastFailure := "unknown"
|
lastFailure := "unknown"
|
||||||
lastLogged := time.Time{}
|
lastLogged := time.Time{}
|
||||||
lastImmutableHealAttempt := time.Time{}
|
lastImmutableHealAttempt := time.Time{}
|
||||||
|
lastRecycleAttempt := time.Time{}
|
||||||
for {
|
for {
|
||||||
|
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
|
||||||
prevFailure := lastFailure
|
prevFailure := lastFailure
|
||||||
ready, detail, err := o.fluxHealthReady(ctx)
|
ready, detail, err := o.fluxHealthReady(ctx)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -2556,11 +2569,10 @@ func (o *Orchestrator) waitForWorkloadConvergence(ctx context.Context) error {
|
|||||||
deadline := time.Now().Add(wait)
|
deadline := time.Now().Add(wait)
|
||||||
lastFailure := "unknown"
|
lastFailure := "unknown"
|
||||||
lastLogged := time.Time{}
|
lastLogged := time.Time{}
|
||||||
|
lastRecycleAttempt := time.Time{}
|
||||||
for {
|
for {
|
||||||
prevFailure := lastFailure
|
prevFailure := lastFailure
|
||||||
if o.cfg.Startup.AutoRecycleStuckPods {
|
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
|
||||||
o.bestEffort("recycle stuck controller pods", func() error { return o.recycleStuckControllerPods(ctx) })
|
|
||||||
}
|
|
||||||
ready, detail, err := o.workloadConvergenceReady(ctx)
|
ready, detail, err := o.workloadConvergenceReady(ctx)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
lastFailure = err.Error()
|
lastFailure = err.Error()
|
||||||
@ -2708,6 +2720,9 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
reason := stuckContainerReason(pod, stuckReasons)
|
reason := stuckContainerReason(pod, stuckReasons)
|
||||||
|
if reason == "" {
|
||||||
|
reason = stuckVaultInitReason(pod, grace)
|
||||||
|
}
|
||||||
if reason == "" {
|
if reason == "" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@ -2757,6 +2772,43 @@ func stuckContainerReason(p podResource, reasons map[string]struct{}) string {
|
|||||||
return check(p.Status.ContainerStatuses)
|
return check(p.Status.ContainerStatuses)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// stuckVaultInitReason reports whether a pod looks hung in its Vault agent
// init container.
//
// It returns "VaultInitStuck" only when all of the following hold:
//   - the pod phase is "Pending" (trimmed, case-insensitive);
//   - the annotation "vault.hashicorp.com/agent-inject" is "true" (trimmed,
//     case-insensitive);
//   - an init container status named "vault-agent-init" is in the Running
//     state with a non-zero StartedAt that is at least grace in the past.
//
// In every other case it returns the empty string. The first init container
// status that passes the name/running/non-zero filters decides the result.
func stuckVaultInitReason(p podResource, grace time.Duration) string {
|
||||||
|
// Only pods still in the Pending phase are considered.
if !strings.EqualFold(strings.TrimSpace(p.Status.Phase), "Pending") {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
// Only pods annotated for Vault agent injection are considered; a missing
// annotation reads as "" from the map and is rejected here.
if !strings.EqualFold(strings.TrimSpace(p.Metadata.Annotations["vault.hashicorp.com/agent-inject"]), "true") {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
for _, st := range p.Status.InitContainerStatuses {
|
||||||
|
// Skip other init containers and a vault-agent-init that is not running.
if strings.TrimSpace(st.Name) != "vault-agent-init" || st.State.Running == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
startedAt := st.State.Running.StartedAt
|
||||||
|
// Without a start time the elapsed duration cannot be measured.
if startedAt.IsZero() {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Still inside the grace period: not considered stuck.
if time.Since(startedAt) < grace {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return "VaultInitStuck"
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// maybeAutoRecycleStuckPods runs recycleStuckControllerPods at most once per
// 30 seconds per caller-owned timestamp.
//
// It does nothing when the runner is in dry-run mode or when
// cfg.Startup.AutoRecycleStuckPods is disabled. lastAttempt is the caller's
// throttle state: a zero value means "never attempted", and it is updated to
// the current time before the recycle runs. A nil lastAttempt disables the
// throttle, so the recycle runs on every call.
//
// The recycle is invoked through o.bestEffort and this function returns
// nothing. NOTE(review): bestEffort presumably logs and swallows the error —
// confirm against its definition.
func (o *Orchestrator) maybeAutoRecycleStuckPods(ctx context.Context, lastAttempt *time.Time) {
|
||||||
|
if o.runner.DryRun || !o.cfg.Startup.AutoRecycleStuckPods {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
now := time.Now()
|
||||||
|
// Throttle: skip if a previous attempt was recorded under 30 seconds ago.
if lastAttempt != nil && !lastAttempt.IsZero() && now.Sub(*lastAttempt) < 30*time.Second {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Record the attempt before running it, so the throttle applies whatever
// the outcome of the recycle.
if lastAttempt != nil {
|
||||||
|
*lastAttempt = now
|
||||||
|
}
|
||||||
|
o.bestEffort("recycle stuck controller pods", func() error { return o.recycleStuckControllerPods(ctx) })
|
||||||
|
}
|
||||||
|
|
||||||
func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error) {
|
func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error) {
|
||||||
out, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
|
out, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -2777,6 +2829,10 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error)
|
|||||||
"CreateContainerError": {},
|
"CreateContainerError": {},
|
||||||
"RunContainerError": {},
|
"RunContainerError": {},
|
||||||
}
|
}
|
||||||
|
grace := time.Duration(o.cfg.Startup.StuckPodGraceSeconds) * time.Second
|
||||||
|
if grace <= 0 {
|
||||||
|
grace = 180 * time.Second
|
||||||
|
}
|
||||||
|
|
||||||
failures := []string{}
|
failures := []string{}
|
||||||
for _, pod := range list.Items {
|
for _, pod := range list.Items {
|
||||||
@ -2792,6 +2848,9 @@ func (o *Orchestrator) startupFailurePods(ctx context.Context) ([]string, error)
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
reason := stuckContainerReason(pod, stuckReasons)
|
reason := stuckContainerReason(pod, stuckReasons)
|
||||||
|
if reason == "" {
|
||||||
|
reason = stuckVaultInitReason(pod, grace)
|
||||||
|
}
|
||||||
if reason == "" {
|
if reason == "" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
@ -214,3 +214,53 @@ func TestServiceCheckReadyBodyContainsIgnoresWhitespace(t *testing.T) {
|
|||||||
t.Fatalf("expected whitespace-tolerant service check to pass, detail=%s", detail)
|
t.Fatalf("expected whitespace-tolerant service check to pass, detail=%s", detail)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestStuckVaultInitReasonDetectsHungInit builds a Pending pod annotated for
// Vault agent injection whose vault-agent-init container has been running for
// 10 minutes, and expects stuckVaultInitReason to return "VaultInitStuck" with
// a 3-minute grace period.
func TestStuckVaultInitReasonDetectsHungInit(t *testing.T) {
|
||||||
|
var pod podResource
|
||||||
|
pod.Status.Phase = "Pending"
|
||||||
|
pod.Metadata.Annotations = map[string]string{
|
||||||
|
"vault.hashicorp.com/agent-inject": "true",
|
||||||
|
}
|
||||||
|
pod.Status.InitContainerStatuses = []podContainerStatus{
|
||||||
|
{
|
||||||
|
Name: "vault-agent-init",
|
||||||
|
State: podContainerState{
|
||||||
|
Running: &podContainerRunningState{
|
||||||
|
// Started well before the 3-minute grace period used below.
StartedAt: time.Now().Add(-10 * time.Minute),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
reason := stuckVaultInitReason(pod, 3*time.Minute)
|
||||||
|
if reason != "VaultInitStuck" {
|
||||||
|
t.Fatalf("expected VaultInitStuck reason, got %q", reason)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods checks the two negative
// cases of stuckVaultInitReason with a 3-minute grace period:
//  1. an injected pod whose vault-agent-init started 30 seconds ago;
//  2. the same pod with the inject annotation set to "false" and the start
//     time moved back to 10 minutes ago.
//
// Both must yield an empty reason.
func TestStuckVaultInitReasonIgnoresFreshOrNonVaultPods(t *testing.T) {
|
||||||
|
var pod podResource
|
||||||
|
pod.Status.Phase = "Pending"
|
||||||
|
pod.Metadata.Annotations = map[string]string{
|
||||||
|
"vault.hashicorp.com/agent-inject": "true",
|
||||||
|
}
|
||||||
|
pod.Status.InitContainerStatuses = []podContainerStatus{
|
||||||
|
{
|
||||||
|
Name: "vault-agent-init",
|
||||||
|
State: podContainerState{
|
||||||
|
Running: &podContainerRunningState{
|
||||||
|
// Inside the 3-minute grace period used below.
StartedAt: time.Now().Add(-30 * time.Second),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
// Case 1: the init container is still within the grace period.
if reason := stuckVaultInitReason(pod, 3*time.Minute); reason != "" {
|
||||||
|
t.Fatalf("expected no reason for fresh init, got %q", reason)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Case 2: long-running init, but the pod is not marked for injection.
pod.Metadata.Annotations["vault.hashicorp.com/agent-inject"] = "false"
|
||||||
|
pod.Status.InitContainerStatuses[0].State.Running.StartedAt = time.Now().Add(-10 * time.Minute)
|
||||||
|
if reason := stuckVaultInitReason(pod, 3*time.Minute); reason != "" {
|
||||||
|
t.Fatalf("expected no reason for non-vault pod, got %q", reason)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@ -492,6 +492,27 @@ func defaults() Config {
|
|||||||
BodyContains: "\"database\":\"ok\"",
|
BodyContains: "\"database\":\"ok\"",
|
||||||
TimeoutSeconds: 12,
|
TimeoutSeconds: 12,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
Name: "keycloak-oidc",
|
||||||
|
URL: "https://sso.bstein.dev/realms/atlas/.well-known/openid-configuration",
|
||||||
|
AcceptedStatuses: []int{200},
|
||||||
|
BodyContains: "\"issuer\":\"https://sso.bstein.dev/realms/atlas\"",
|
||||||
|
TimeoutSeconds: 12,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "harbor-registry",
|
||||||
|
URL: "https://registry.bstein.dev/v2/",
|
||||||
|
AcceptedStatuses: []int{401},
|
||||||
|
BodyContains: "unauthorized",
|
||||||
|
TimeoutSeconds: 12,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "longhorn-auth",
|
||||||
|
URL: "https://longhorn.bstein.dev/",
|
||||||
|
AcceptedStatuses: []int{302},
|
||||||
|
BodyContains: "openid-connect/auth",
|
||||||
|
TimeoutSeconds: 12,
|
||||||
|
},
|
||||||
},
|
},
|
||||||
RequireFluxHealth: true,
|
RequireFluxHealth: true,
|
||||||
FluxHealthWaitSeconds: 900,
|
FluxHealthWaitSeconds: 900,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user