recovery: tolerate transient startup soak checks
This commit is contained in:
parent
3e337043d5
commit
c8ccc970e6
@ -341,8 +341,14 @@ func (o *Orchestrator) waitForStabilityWindow(ctx context.Context) error {
|
||||
if poll <= 0 {
|
||||
poll = 5 * time.Second
|
||||
}
|
||||
deadline := time.Now().Add(window)
|
||||
maxWait := window * 3
|
||||
if maxWait < window+poll {
|
||||
maxWait = window + poll
|
||||
}
|
||||
deadline := time.Now().Add(maxWait)
|
||||
stableSince := time.Time{}
|
||||
lastStatus := time.Time{}
|
||||
lastFailure := ""
|
||||
lastRecycleAttempt := time.Time{}
|
||||
lastReplicaHeal := time.Time{}
|
||||
|
||||
@ -350,19 +356,36 @@ func (o *Orchestrator) waitForStabilityWindow(ctx context.Context) error {
|
||||
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
|
||||
o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal)
|
||||
if err := o.startupStabilityHealthy(ctx); err != nil {
|
||||
return fmt.Errorf("startup stability window failed: %w", err)
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
o.log.Printf("startup stability window passed (%s)", window)
|
||||
return nil
|
||||
}
|
||||
if time.Since(lastStatus) >= 30*time.Second {
|
||||
remaining := time.Until(deadline).Round(time.Second)
|
||||
if remaining < 0 {
|
||||
remaining = 0
|
||||
lastFailure = err.Error()
|
||||
stableSince = time.Time{}
|
||||
if time.Now().After(deadline) {
|
||||
return fmt.Errorf("startup stability window failed after %s: %s", maxWait, lastFailure)
|
||||
}
|
||||
if time.Since(lastStatus) >= 30*time.Second {
|
||||
remaining := time.Until(deadline).Round(time.Second)
|
||||
if remaining < 0 {
|
||||
remaining = 0
|
||||
}
|
||||
o.log.Printf("startup stability soak reset (%s remaining): %s", remaining, lastFailure)
|
||||
lastStatus = time.Now()
|
||||
}
|
||||
} else {
|
||||
if stableSince.IsZero() {
|
||||
stableSince = time.Now()
|
||||
}
|
||||
stableFor := time.Since(stableSince)
|
||||
if stableFor >= window {
|
||||
o.log.Printf("startup stability window passed (%s)", window)
|
||||
return nil
|
||||
}
|
||||
if time.Since(lastStatus) >= 30*time.Second {
|
||||
remaining := (window - stableFor).Round(time.Second)
|
||||
if remaining < 0 {
|
||||
remaining = 0
|
||||
}
|
||||
o.log.Printf("startup stability soak in progress (%s remaining)", remaining)
|
||||
lastStatus = time.Now()
|
||||
}
|
||||
o.log.Printf("startup stability soak in progress (%s remaining)", remaining)
|
||||
lastStatus = time.Now()
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user