recovery: tolerate transient startup soak checks

This commit is contained in:
codex 2026-06-18 23:33:11 -03:00
parent 3e337043d5
commit c8ccc970e6

View File

@@ -341,8 +341,14 @@ func (o *Orchestrator) waitForStabilityWindow(ctx context.Context) error {
if poll <= 0 {
poll = 5 * time.Second
}
deadline := time.Now().Add(window)
maxWait := window * 3
if maxWait < window+poll {
maxWait = window + poll
}
deadline := time.Now().Add(maxWait)
stableSince := time.Time{}
lastStatus := time.Time{}
lastFailure := ""
lastRecycleAttempt := time.Time{}
lastReplicaHeal := time.Time{}
@@ -350,20 +356,37 @@ func (o *Orchestrator) waitForStabilityWindow(ctx context.Context) error {
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal)
if err := o.startupStabilityHealthy(ctx); err != nil {
return fmt.Errorf("startup stability window failed: %w", err)
}
lastFailure = err.Error()
stableSince = time.Time{}
if time.Now().After(deadline) {
o.log.Printf("startup stability window passed (%s)", window)
return nil
return fmt.Errorf("startup stability window failed after %s: %s", maxWait, lastFailure)
}
if time.Since(lastStatus) >= 30*time.Second {
remaining := time.Until(deadline).Round(time.Second)
if remaining < 0 {
remaining = 0
}
o.log.Printf("startup stability soak reset (%s remaining): %s", remaining, lastFailure)
lastStatus = time.Now()
}
} else {
if stableSince.IsZero() {
stableSince = time.Now()
}
stableFor := time.Since(stableSince)
if stableFor >= window {
o.log.Printf("startup stability window passed (%s)", window)
return nil
}
if time.Since(lastStatus) >= 30*time.Second {
remaining := (window - stableFor).Round(time.Second)
if remaining < 0 {
remaining = 0
}
o.log.Printf("startup stability soak in progress (%s remaining)", remaining)
lastStatus = time.Now()
}
}
select {
case <-ctx.Done():
return ctx.Err()