recovery: tolerate transient startup soak check failures
This commit is contained in:
parent
3e337043d5
commit
c8ccc970e6
@ -341,8 +341,14 @@ func (o *Orchestrator) waitForStabilityWindow(ctx context.Context) error {
|
|||||||
if poll <= 0 {
|
if poll <= 0 {
|
||||||
poll = 5 * time.Second
|
poll = 5 * time.Second
|
||||||
}
|
}
|
||||||
deadline := time.Now().Add(window)
|
maxWait := window * 3
|
||||||
|
if maxWait < window+poll {
|
||||||
|
maxWait = window + poll
|
||||||
|
}
|
||||||
|
deadline := time.Now().Add(maxWait)
|
||||||
|
stableSince := time.Time{}
|
||||||
lastStatus := time.Time{}
|
lastStatus := time.Time{}
|
||||||
|
lastFailure := ""
|
||||||
lastRecycleAttempt := time.Time{}
|
lastRecycleAttempt := time.Time{}
|
||||||
lastReplicaHeal := time.Time{}
|
lastReplicaHeal := time.Time{}
|
||||||
|
|
||||||
@ -350,20 +356,37 @@ func (o *Orchestrator) waitForStabilityWindow(ctx context.Context) error {
|
|||||||
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
|
o.maybeAutoRecycleStuckPods(ctx, &lastRecycleAttempt)
|
||||||
o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal)
|
o.maybeAutoHealCriticalWorkloadReplicas(ctx, &lastReplicaHeal)
|
||||||
if err := o.startupStabilityHealthy(ctx); err != nil {
|
if err := o.startupStabilityHealthy(ctx); err != nil {
|
||||||
return fmt.Errorf("startup stability window failed: %w", err)
|
lastFailure = err.Error()
|
||||||
}
|
stableSince = time.Time{}
|
||||||
if time.Now().After(deadline) {
|
if time.Now().After(deadline) {
|
||||||
o.log.Printf("startup stability window passed (%s)", window)
|
return fmt.Errorf("startup stability window failed after %s: %s", maxWait, lastFailure)
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
if time.Since(lastStatus) >= 30*time.Second {
|
if time.Since(lastStatus) >= 30*time.Second {
|
||||||
remaining := time.Until(deadline).Round(time.Second)
|
remaining := time.Until(deadline).Round(time.Second)
|
||||||
if remaining < 0 {
|
if remaining < 0 {
|
||||||
remaining = 0
|
remaining = 0
|
||||||
}
|
}
|
||||||
|
o.log.Printf("startup stability soak reset (%s remaining): %s", remaining, lastFailure)
|
||||||
|
lastStatus = time.Now()
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if stableSince.IsZero() {
|
||||||
|
stableSince = time.Now()
|
||||||
|
}
|
||||||
|
stableFor := time.Since(stableSince)
|
||||||
|
if stableFor >= window {
|
||||||
|
o.log.Printf("startup stability window passed (%s)", window)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if time.Since(lastStatus) >= 30*time.Second {
|
||||||
|
remaining := (window - stableFor).Round(time.Second)
|
||||||
|
if remaining < 0 {
|
||||||
|
remaining = 0
|
||||||
|
}
|
||||||
o.log.Printf("startup stability soak in progress (%s remaining)", remaining)
|
o.log.Printf("startup stability soak in progress (%s remaining)", remaining)
|
||||||
lastStatus = time.Now()
|
lastStatus = time.Now()
|
||||||
}
|
}
|
||||||
|
}
|
||||||
select {
|
select {
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
return ctx.Err()
|
return ctx.Err()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user