recovery: tolerate transient startup soak checks

This commit is contained in:
codex 2026-06-18 23:33:11 -03:00
parent 3e337043d5
commit c8ccc970e6

View File

@@ -341,8 +341,14 @@ func (o *Orchestrator) waitForStabilityWindow(ctx context.Context) error {
if poll <= 0 { if poll <= 0 {
poll = 5 * time.Second poll = 5 * time.Second
} }
deadline := time.Now().Add(window) maxWait := window * 3
if maxWait < window+poll {
maxWait = window + poll
}
deadline := time.Now().Add(maxWait)
stableSince := time.Time{}
lastStatus := time.Time{} lastStatus := time.Time{}
lastFailure := ""
lastRecycleAttempt := time.Time{} lastRecycleAttempt := time.Time{}
lastReplicaHeal := time.Time{} lastReplicaHeal := time.Time{}
@@ -350,20 +356,37 @@ func (o *Orchestrator) waitForStabilityWindow(ctx context.Context) error {
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt) o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal) o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal)
if err := o.startupStabilityHealthy(ctx); err != nil { if err := o.startupStabilityHealthy(ctx); err != nil {
return fmt.Errorf("startup stability window failed: %w", err) lastFailure = err.Error()
} stableSince = time.Time{}
if time.Now().After(deadline) { if time.Now().After(deadline) {
o.log.Printf("startup stability window passed (%s)", window) return fmt.Errorf("startup stability window failed after %s: %s", maxWait, lastFailure)
return nil
} }
if time.Since(lastStatus) >= 30*time.Second { if time.Since(lastStatus) >= 30*time.Second {
remaining := time.Until(deadline).Round(time.Second) remaining := time.Until(deadline).Round(time.Second)
if remaining < 0 { if remaining < 0 {
remaining = 0 remaining = 0
} }
o.log.Printf("startup stability soak reset (%s remaining): %s", remaining, lastFailure)
lastStatus = time.Now()
}
} else {
if stableSince.IsZero() {
stableSince = time.Now()
}
stableFor := time.Since(stableSince)
if stableFor >= window {
o.log.Printf("startup stability window passed (%s)", window)
return nil
}
if time.Since(lastStatus) >= 30*time.Second {
remaining := (window - stableFor).Round(time.Second)
if remaining < 0 {
remaining = 0
}
o.log.Printf("startup stability soak in progress (%s remaining)", remaining) o.log.Printf("startup stability soak in progress (%s remaining)", remaining)
lastStatus = time.Now() lastStatus = time.Now()
} }
}
select { select {
case <-ctx.Done(): case <-ctx.Done():
return ctx.Err() return ctx.Err()