recovery: avoid repeated manual cordon alerts

This commit is contained in:
codex 2026-06-19 15:59:00 -03:00
parent 57610c623a
commit 83d987f43a
2 changed files with 51 additions and 4 deletions

View File

@ -144,9 +144,11 @@ func (o *Orchestrator) enforceRecoveryCordonLeases(ctx context.Context) (int, er
cordonTime := nodeUnschedulableSince(node) cordonTime := nodeUnschedulableSince(node)
if !cordonTime.IsZero() && now.Sub(cordonTime) > maxAge { if !cordonTime.IsZero() && now.Sub(cordonTime) > maxAge {
message := fmt.Sprintf("manual action required: node %s has an unowned cordon older than %s; Ananke will not silently leave resources stranded or override a non-Ananke cordon", name, maxAge) message := fmt.Sprintf("manual action required: node %s has an unowned cordon older than %s; Ananke will not silently leave resources stranded or override a non-Ananke cordon", name, maxAge)
if !manualActionAlreadyMarked(ann, message) {
o.markManualActionRequired(ctx, name, message) o.markManualActionRequired(ctx, name, message)
errs = append(errs, message) errs = append(errs, message)
} }
}
continue continue
} }
@ -161,8 +163,10 @@ func (o *Orchestrator) enforceRecoveryCordonLeases(ctx context.Context) (int, er
expired, deadlineText := cordonLeaseExpired(ann, now) expired, deadlineText := cordonLeaseExpired(ann, now)
if expired { if expired {
message := fmt.Sprintf("manual action required: node %s remains cordoned after Ananke repair lease expired at %s: %v", name, deadlineText, recoverErr) message := fmt.Sprintf("manual action required: node %s remains cordoned after Ananke repair lease expired at %s: %v", name, deadlineText, recoverErr)
if !manualActionAlreadyMarked(ann, message) {
o.markManualActionRequired(ctx, name, message) o.markManualActionRequired(ctx, name, message)
errs = append(errs, message) errs = append(errs, message)
}
} else { } else {
o.log.Printf("leased cordon on %s still pending repair before %s: %v", name, deadlineText, recoverErr) o.log.Printf("leased cordon on %s still pending repair before %s: %v", name, deadlineText, recoverErr)
} }
@ -286,6 +290,17 @@ func (o *Orchestrator) markManualActionRequired(ctx context.Context, node string
} }
} }
// manualActionAlreadyMarked reports whether a node already carries this action.
// Signature: manualActionAlreadyMarked(annotations map[string]string, message string) bool.
// Why: repeated recovery scans should not keep rewriting the same node annotation
// or turning an already-visible manual task into fresh log noise.
// A node that lacks the manual-action annotation is never reported as marked,
// even when message sanitizes to the empty string.
func manualActionAlreadyMarked(annotations map[string]string, message string) bool {
	// Indexing a nil map is safe in Go and yields ok == false, so a nil
	// annotation set and a set without the key are handled identically.
	recorded, ok := annotations[anankeCordonManualActionAnnotation]
	if !ok {
		return false
	}
	// Compare against the sanitized message rather than the raw one.
	// NOTE(review): presumably this is the form markManualActionRequired
	// stores on the node — confirm against that function.
	return recorded == sanitizeCordonAnnotationValue(message)
}
// recoveryCordonMaxDuration returns the maximum allowed automatic cordon lease. // recoveryCordonMaxDuration returns the maximum allowed automatic cordon lease.
// Signature: (o *Orchestrator) recoveryCordonMaxDuration() time.Duration. // Signature: (o *Orchestrator) recoveryCordonMaxDuration() time.Duration.
// Why: all recovery cordon decisions should share the same operator promise. // Why: all recovery cordon decisions should share the same operator promise.

View File

@ -90,6 +90,38 @@ func TestEnforceRecoveryCordonLeasesReportsStaleUnownedCordon(t *testing.T) {
} }
} }
// TestEnforceRecoveryCordonLeasesDoesNotRepeatExistingManualAction checks that a
// stale unowned cordon whose manual-action annotation already matches is left alone.
// Signature: TestEnforceRecoveryCordonLeasesDoesNotRepeatExistingManualAction(t *testing.T).
// Why: once a manual action is visible on the node, later scans should not keep
// rewriting the same annotation or producing fresh alert noise.
func TestEnforceRecoveryCordonLeasesDoesNotRepeatExistingManualAction(t *testing.T) {
	// Two hours ago, against a one-hour maximum configured below.
	staleStamp := time.Now().UTC().Add(-2 * time.Hour).Format(time.RFC3339)
	existing := "manual action required: node titan-18 has an unowned cordon older than 1h0m0s; Ananke will not silently leave resources stranded or override a non-Ananke cordon"

	// Assemble the node list fixture piece by piece; the node already carries
	// the manual-action annotation with the expected message text.
	annotationsJSON := `{"` + anankeCordonManualActionAnnotation + `":"` + existing + `","` +
		anankeCordonManualAtAnnotation + `":"` + staleStamp + `"}`
	specJSON := `{"unschedulable":true,"taints":[{"key":"node.kubernetes.io/unschedulable","timeAdded":"` +
		staleStamp + `"}]}`
	nodeJSON := `{"items":[{"metadata":{"name":"titan-18","annotations":` + annotationsJSON +
		`},"spec":` + specJSON +
		`,"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`

	// Any attempt to annotate titan-18 fails the test on the spot.
	forbidAnnotate := func(name string, args []string) bool {
		if matchContains("kubectl", "annotate", "node", "titan-18")(name, args) {
			t.Fatalf("manual-action annotation should not be rewritten")
			return true
		}
		return false
	}

	cfg := config.Config{
		Startup: config.Startup{RecoveryCordonMaxSeconds: 3600},
	}
	stubs := []commandStub{
		{match: matchContains("kubectl", "get", "nodes", "-o", "json"), out: nodeJSON},
		{match: forbidAnnotate},
	}
	orch := buildOrchestratorWithStubs(t, cfg, stubs)

	released, err := orch.enforceRecoveryCordonLeases(context.Background())
	if released == 0 && err == nil {
		return
	}
	t.Fatalf("expected quiet existing manual action, released=%d err=%v", released, err)
}
// TestEnforceRecoveryCordonLeasesEscalatesExpiredCryptsetupRepair runs one orchestration or CLI step. // TestEnforceRecoveryCordonLeasesEscalatesExpiredCryptsetupRepair runs one orchestration or CLI step.
// Signature: TestEnforceRecoveryCordonLeasesEscalatesExpiredCryptsetupRepair(t *testing.T). // Signature: TestEnforceRecoveryCordonLeasesEscalatesExpiredCryptsetupRepair(t *testing.T).
// Why: after the lease expires, a failed automatic repair must become a clear // Why: after the lease expires, a failed automatic repair must become a clear