diff --git a/internal/cluster/orchestrator_cordon_lease.go b/internal/cluster/orchestrator_cordon_lease.go index f01afab..8ba8a5f 100644 --- a/internal/cluster/orchestrator_cordon_lease.go +++ b/internal/cluster/orchestrator_cordon_lease.go @@ -144,8 +144,10 @@ func (o *Orchestrator) enforceRecoveryCordonLeases(ctx context.Context) (int, er cordonTime := nodeUnschedulableSince(node) if !cordonTime.IsZero() && now.Sub(cordonTime) > maxAge { message := fmt.Sprintf("manual action required: node %s has an unowned cordon older than %s; Ananke will not silently leave resources stranded or override a non-Ananke cordon", name, maxAge) - o.markManualActionRequired(ctx, name, message) - errs = append(errs, message) + if !manualActionAlreadyMarked(ann, message) { + o.markManualActionRequired(ctx, name, message) + errs = append(errs, message) + } } continue } @@ -161,8 +163,10 @@ func (o *Orchestrator) enforceRecoveryCordonLeases(ctx context.Context) (int, er expired, deadlineText := cordonLeaseExpired(ann, now) if expired { message := fmt.Sprintf("manual action required: node %s remains cordoned after Ananke repair lease expired at %s: %v", name, deadlineText, recoverErr) - o.markManualActionRequired(ctx, name, message) - errs = append(errs, message) + if !manualActionAlreadyMarked(ann, message) { + o.markManualActionRequired(ctx, name, message) + errs = append(errs, message) + } } else { o.log.Printf("leased cordon on %s still pending repair before %s: %v", name, deadlineText, recoverErr) } @@ -286,6 +290,17 @@ func (o *Orchestrator) markManualActionRequired(ctx context.Context, node string } } +// manualActionAlreadyMarked reports whether a node already carries this action. +// Signature: manualActionAlreadyMarked(annotations map[string]string, message string) bool. +// Why: repeated recovery scans should not keep rewriting the same node annotation +// or turning an already-visible manual task into fresh log noise. 
+func manualActionAlreadyMarked(annotations map[string]string, message string) bool { + if annotations == nil { + return false + } + return annotations[anankeCordonManualActionAnnotation] == sanitizeCordonAnnotationValue(message) +} + // recoveryCordonMaxDuration returns the maximum allowed automatic cordon lease. // Signature: (o *Orchestrator) recoveryCordonMaxDuration() time.Duration. // Why: all recovery cordon decisions should share the same operator promise. diff --git a/internal/cluster/orchestrator_cordon_lease_test.go b/internal/cluster/orchestrator_cordon_lease_test.go index 81d2795..51a966e 100644 --- a/internal/cluster/orchestrator_cordon_lease_test.go +++ b/internal/cluster/orchestrator_cordon_lease_test.go @@ -90,6 +90,38 @@ func TestEnforceRecoveryCordonLeasesReportsStaleUnownedCordon(t *testing.T) { } } +// TestEnforceRecoveryCordonLeasesDoesNotRepeatExistingManualAction checks that a node already annotated with the identical manual-action message is neither re-annotated nor reported as an error. +// Signature: TestEnforceRecoveryCordonLeasesDoesNotRepeatExistingManualAction(t *testing.T). +// Why: once a manual action is visible on the node, later scans should not keep +// rewriting the same annotation or producing fresh alert noise. 
+func TestEnforceRecoveryCordonLeasesDoesNotRepeatExistingManualAction(t *testing.T) { + old := time.Now().UTC().Add(-2 * time.Hour).Format(time.RFC3339) + message := "manual action required: node titan-18 has an unowned cordon older than 1h0m0s; Ananke will not silently leave resources stranded or override a non-Ananke cordon" + nodeJSON := `{"items":[{"metadata":{"name":"titan-18","annotations":{"` + + anankeCordonManualActionAnnotation + `":"` + message + `","` + + anankeCordonManualAtAnnotation + `":"` + old + `"}},"spec":{"unschedulable":true,"taints":[{"key":"node.kubernetes.io/unschedulable","timeAdded":"` + + old + `"}]},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}` + orch := buildOrchestratorWithStubs(t, config.Config{ + Startup: config.Startup{RecoveryCordonMaxSeconds: 3600}, + }, []commandStub{ + {match: matchContains("kubectl", "get", "nodes", "-o", "json"), out: nodeJSON}, + { + match: func(name string, args []string) bool { + if !matchContains("kubectl", "annotate", "node", "titan-18")(name, args) { + return false + } + t.Fatalf("manual-action annotation should not be rewritten") + return true + }, + }, + }) + + released, err := orch.enforceRecoveryCordonLeases(context.Background()) + if released != 0 || err != nil { + t.Fatalf("expected quiet existing manual action, released=%d err=%v", released, err) + } +} + // TestEnforceRecoveryCordonLeasesEscalatesExpiredCryptsetupRepair runs one orchestration or CLI step. // Signature: TestEnforceRecoveryCordonLeasesEscalatesExpiredCryptsetupRepair(t *testing.T). // Why: after the lease expires, a failed automatic repair must become a clear