diff --git a/internal/cluster/orchestrator_service_stability.go b/internal/cluster/orchestrator_service_stability.go index cc5d017..9102d9d 100644 --- a/internal/cluster/orchestrator_service_stability.go +++ b/internal/cluster/orchestrator_service_stability.go @@ -341,8 +341,14 @@ func (o *Orchestrator) waitForStabilityWindow(ctx context.Context) error { if poll <= 0 { poll = 5 * time.Second } - deadline := time.Now().Add(window) + maxWait := window * 3 + if maxWait < window+poll { + maxWait = window + poll + } + deadline := time.Now().Add(maxWait) + stableSince := time.Time{} lastStatus := time.Time{} + lastFailure := "" lastRecycleAttempt := time.Time{} lastReplicaHeal := time.Time{} @@ -350,19 +356,36 @@ func (o *Orchestrator) waitForStabilityWindow(ctx context.Context) error { o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt) o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal) if err := o.startupStabilityHealthy(ctx); err != nil { - return fmt.Errorf("startup stability window failed: %w", err) - } - if time.Now().After(deadline) { - o.log.Printf("startup stability window passed (%s)", window) - return nil - } - if time.Since(lastStatus) >= 30*time.Second { - remaining := time.Until(deadline).Round(time.Second) - if remaining < 0 { - remaining = 0 + lastFailure = err.Error() + stableSince = time.Time{} + if time.Now().After(deadline) { + return fmt.Errorf("startup stability window failed after %s: %s", maxWait, lastFailure) + } + if time.Since(lastStatus) >= 30*time.Second { + remaining := time.Until(deadline).Round(time.Second) + if remaining < 0 { + remaining = 0 + } + o.log.Printf("startup stability soak reset (%s remaining): %s", remaining, lastFailure) + lastStatus = time.Now() + } + } else { + if stableSince.IsZero() { + stableSince = time.Now() + } + stableFor := time.Since(stableSince) + if stableFor >= window { + o.log.Printf("startup stability window passed (%s)", window) + return nil + } + if time.Since(lastStatus) >= 30*time.Second { + remaining := (window - stableFor).Round(time.Second) + if remaining < 0 { + remaining = 0 + } + o.log.Printf("startup stability soak in progress (%s remaining)", remaining) + lastStatus = time.Now() } - o.log.Printf("startup stability soak in progress (%s remaining)", remaining) - lastStatus = time.Now() } select { case <-ctx.Done():