recovery: avoid repeated manual cordon alerts
This commit is contained in:
parent
57610c623a
commit
83d987f43a
@ -144,8 +144,10 @@ func (o *Orchestrator) enforceRecoveryCordonLeases(ctx context.Context) (int, er
|
||||
cordonTime := nodeUnschedulableSince(node)
|
||||
if !cordonTime.IsZero() && now.Sub(cordonTime) > maxAge {
|
||||
message := fmt.Sprintf("manual action required: node %s has an unowned cordon older than %s; Ananke will not silently leave resources stranded or override a non-Ananke cordon", name, maxAge)
|
||||
o.markManualActionRequired(ctx, name, message)
|
||||
errs = append(errs, message)
|
||||
if !manualActionAlreadyMarked(ann, message) {
|
||||
o.markManualActionRequired(ctx, name, message)
|
||||
errs = append(errs, message)
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
@ -161,8 +163,10 @@ func (o *Orchestrator) enforceRecoveryCordonLeases(ctx context.Context) (int, er
|
||||
expired, deadlineText := cordonLeaseExpired(ann, now)
|
||||
if expired {
|
||||
message := fmt.Sprintf("manual action required: node %s remains cordoned after Ananke repair lease expired at %s: %v", name, deadlineText, recoverErr)
|
||||
o.markManualActionRequired(ctx, name, message)
|
||||
errs = append(errs, message)
|
||||
if !manualActionAlreadyMarked(ann, message) {
|
||||
o.markManualActionRequired(ctx, name, message)
|
||||
errs = append(errs, message)
|
||||
}
|
||||
} else {
|
||||
o.log.Printf("leased cordon on %s still pending repair before %s: %v", name, deadlineText, recoverErr)
|
||||
}
|
||||
@ -286,6 +290,17 @@ func (o *Orchestrator) markManualActionRequired(ctx context.Context, node string
|
||||
}
|
||||
}
|
||||
|
||||
// manualActionAlreadyMarked reports whether a node already carries this action.
|
||||
// Signature: manualActionAlreadyMarked(annotations map[string]string, message string) bool.
|
||||
// Why: repeated recovery scans should not keep rewriting the same node annotation
|
||||
// or turning an already-visible manual task into fresh log noise.
|
||||
func manualActionAlreadyMarked(annotations map[string]string, message string) bool {
|
||||
if annotations == nil {
|
||||
return false
|
||||
}
|
||||
return annotations[anankeCordonManualActionAnnotation] == sanitizeCordonAnnotationValue(message)
|
||||
}
|
||||
|
||||
// recoveryCordonMaxDuration returns the maximum allowed automatic cordon lease.
|
||||
// Signature: (o *Orchestrator) recoveryCordonMaxDuration() time.Duration.
|
||||
// Why: all recovery cordon decisions should share the same operator promise.
|
||||
|
||||
@ -90,6 +90,38 @@ func TestEnforceRecoveryCordonLeasesReportsStaleUnownedCordon(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestEnforceRecoveryCordonLeasesDoesNotRepeatExistingManualAction runs one orchestration or CLI step.
|
||||
// Signature: TestEnforceRecoveryCordonLeasesDoesNotRepeatExistingManualAction(t *testing.T).
|
||||
// Why: once a manual action is visible on the node, later scans should not keep
|
||||
// rewriting the same annotation or producing fresh alert noise.
|
||||
func TestEnforceRecoveryCordonLeasesDoesNotRepeatExistingManualAction(t *testing.T) {
|
||||
old := time.Now().UTC().Add(-2 * time.Hour).Format(time.RFC3339)
|
||||
message := "manual action required: node titan-18 has an unowned cordon older than 1h0m0s; Ananke will not silently leave resources stranded or override a non-Ananke cordon"
|
||||
nodeJSON := `{"items":[{"metadata":{"name":"titan-18","annotations":{"` +
|
||||
anankeCordonManualActionAnnotation + `":"` + message + `","` +
|
||||
anankeCordonManualAtAnnotation + `":"` + old + `"}},"spec":{"unschedulable":true,"taints":[{"key":"node.kubernetes.io/unschedulable","timeAdded":"` +
|
||||
old + `"}]},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{
|
||||
Startup: config.Startup{RecoveryCordonMaxSeconds: 3600},
|
||||
}, []commandStub{
|
||||
{match: matchContains("kubectl", "get", "nodes", "-o", "json"), out: nodeJSON},
|
||||
{
|
||||
match: func(name string, args []string) bool {
|
||||
if !matchContains("kubectl", "annotate", "node", "titan-18")(name, args) {
|
||||
return false
|
||||
}
|
||||
t.Fatalf("manual-action annotation should not be rewritten")
|
||||
return true
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
released, err := orch.enforceRecoveryCordonLeases(context.Background())
|
||||
if released != 0 || err != nil {
|
||||
t.Fatalf("expected quiet existing manual action, released=%d err=%v", released, err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestEnforceRecoveryCordonLeasesEscalatesExpiredCryptsetupRepair runs one orchestration or CLI step.
|
||||
// Signature: TestEnforceRecoveryCordonLeasesEscalatesExpiredCryptsetupRepair(t *testing.T).
|
||||
// Why: after the lease expires, a failed automatic repair must become a clear
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user