From 83723d03589abd1447aa3f0c59dbbde4b9f8eb38 Mon Sep 17 00:00:00 2001 From: codex Date: Thu, 18 Jun 2026 22:55:44 -0300 Subject: [PATCH] recovery: clean failed stale controller pods --- .../orchestrator_unit_additional_test.go | 25 ++++++++++++----- .../orchestrator_workload_convergence.go | 27 ++++++++++--------- 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/internal/cluster/orchestrator_unit_additional_test.go b/internal/cluster/orchestrator_unit_additional_test.go index 070a9ef..1a20865 100644 --- a/internal/cluster/orchestrator_unit_additional_test.go +++ b/internal/cluster/orchestrator_unit_additional_test.go @@ -283,15 +283,17 @@ func TestRecycleStuckControllerPodsCordonsEncryptedVolumeNodeWhenRepairFails(t * } } -// TestRecycleStuckControllerPodsHandlesUnknownPodsOnReadyNodes runs one orchestration or CLI step. -// Signature: TestRecycleStuckControllerPodsHandlesUnknownPodsOnReadyNodes(t *testing.T). -// Why: post-outage controller pods can remain Unknown after their node recovers; -// normal deletion clears stale status without force-deleting or touching storage. -func TestRecycleStuckControllerPodsHandlesUnknownPodsOnReadyNodes(t *testing.T) { +// TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes runs one orchestration or CLI step. +// Signature: TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T). +// Why: post-outage controller pods can remain Unknown or Failed after their +// node recovers; normal deletion clears stale status without force-deleting or +// touching storage. +func TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T) { old := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339) recent := time.Now().Add(-30 * time.Second).UTC().Format(time.RFC3339) pods := `{"items":[` + `{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-old","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}},` + + `{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-failed","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12"},"status":{"phase":"Failed"}},` + `{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-fresh","creationTimestamp":"` + recent + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}},` + `{"metadata":{"namespace":"maintenance","name":"stale-on-bad-node","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"maintenance"}]},"spec":{"nodeName":"titan-22"},"status":{"phase":"Unknown"}},` + `{"metadata":{"namespace":"default","name":"bare-pod","creationTimestamp":"` + old + `"},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}}]}` @@ -313,13 +315,22 @@ func TestRecycleStuckControllerPodsHandlesUnknownPodsOnReadyNodes(t *testing.T) return true }, }, + { + match: func(name string, args []string) bool { + if !matchContains("kubectl", "-n", "longhorn-system", "delete", "pod", "longhorn-vault-sync-failed", "--wait=false")(name, args) { + return false + } + deleted = append(deleted, "longhorn-vault-sync-failed") + return true + }, + }, }) if err := orch.recycleStuckControllerPods(context.Background()); err != nil { t.Fatalf("recycleStuckControllerPods failed: %v", err) } - if len(deleted) != 1 || deleted[0] != "longhorn-vault-sync-old" { - t.Fatalf("expected only old Unknown controller pod on Ready node to be recycled, got %#v", deleted) + if strings.Join(deleted, ",") != "longhorn-vault-sync-old,longhorn-vault-sync-failed" { + t.Fatalf("expected only stale controller pods on Ready node to be recycled, got %#v", deleted) } } diff --git a/internal/cluster/orchestrator_workload_convergence.go b/internal/cluster/orchestrator_workload_convergence.go index 6b6b9a4..bef7faf 100644 --- a/internal/cluster/orchestrator_workload_convergence.go +++ b/internal/cluster/orchestrator_workload_convergence.go @@ -182,11 +182,11 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error { } else { encryptedMountReasons = reasons } - unknownPhaseReasons := map[string]string{} - if reasons, scanErr := o.unknownControllerPodReasons(ctx, list, grace); scanErr != nil { - o.log.Printf("warning: unknown controller pod scan failed: %v", scanErr) + stalePhaseReasons := map[string]string{} + if reasons, scanErr := o.staleControllerPodReasons(ctx, list, grace); scanErr != nil { + o.log.Printf("warning: stale controller pod scan failed: %v", scanErr) } else { - unknownPhaseReasons = reasons + stalePhaseReasons = reasons } recycled := []string{} for _, pod := range list.Items { @@ -222,7 +222,7 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error { reason = encryptedMountReasons[ns+"/"+name] } if reason == "" { - reason = unknownPhaseReasons[ns+"/"+name] + reason = stalePhaseReasons[ns+"/"+name] } if reason == "" { continue @@ -242,13 +242,13 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error { return nil } -// unknownControllerPodReasons runs one orchestration or CLI step. -// Signature: (o *Orchestrator) unknownControllerPodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error). +// staleControllerPodReasons runs one orchestration or CLI step. +// Signature: (o *Orchestrator) staleControllerPodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error). // Why: after node or kubelet recovery, controller-owned pods can stay in -// Unknown even though the node is Ready and a replacement may already be -// healthy. A normal pod delete lets Kubernetes clean the stale status without -// touching storage objects or forcing deletion on a partitioned node. -func (o *Orchestrator) unknownControllerPodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error) { +// terminal or unknown status even though the node is Ready and a replacement may +// already be healthy. A normal pod delete lets Kubernetes clean the stale status +// without touching storage objects or forcing deletion on a partitioned node. +func (o *Orchestrator) staleControllerPodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error) { unavailable, err := o.unavailableNodeSet(ctx) if err != nil { return nil, err @@ -261,7 +261,8 @@ func (o *Orchestrator) unknownControllerPodReasons(ctx context.Context, pods pod if ns == "" || name == "" || node == "" { continue } - if !strings.EqualFold(strings.TrimSpace(pod.Status.Phase), "Unknown") { + phase := strings.TrimSpace(pod.Status.Phase) + if !strings.EqualFold(phase, "Unknown") && !strings.EqualFold(phase, "Failed") { continue } if _, badNode := unavailable[node]; badNode { @@ -273,7 +274,7 @@ func (o *Orchestrator) unknownControllerPodReasons(ctx context.Context, pods pod if !pod.Metadata.CreationTimestamp.IsZero() && time.Since(pod.Metadata.CreationTimestamp) < grace { continue } - reasons[ns+"/"+name] = "UnknownControllerPodOnReadyNode:" + node + reasons[ns+"/"+name] = "StaleControllerPodOnReadyNode:" + node + ":" + phase } return reasons, nil }