diff --git a/internal/cluster/orchestrator_unit_additional_test.go b/internal/cluster/orchestrator_unit_additional_test.go index 05ff6ed..1aafade 100644 --- a/internal/cluster/orchestrator_unit_additional_test.go +++ b/internal/cluster/orchestrator_unit_additional_test.go @@ -286,14 +286,15 @@ func TestRecycleStuckControllerPodsCordonsEncryptedVolumeNodeWhenRepairFails(t * // TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes runs one orchestration or CLI step. // Signature: TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T). // Why: post-outage controller pods can remain Unknown or Failed after their -// node recovers; normal deletion clears stale status without force-deleting or -// touching storage. +// node recovers; deletion clears stale status while force deletion stays away +// from PVC-backed storage. func TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T) { old := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339) recent := time.Now().Add(-30 * time.Second).UTC().Format(time.RFC3339) pods := `{"items":[` + `{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-old","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}},` + `{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-failed","creationTimestamp":"` + old + `","deletionTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12","volumes":[{"name":"secret"}]},"status":{"phase":"Failed"}},` + + `{"metadata":{"namespace":"logging","name":"oauth2-proxy-terminating","creationTimestamp":"` + old + `","deletionTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"oauth2-proxy-logs"}]},"spec":{"nodeName":"titan-18","volumes":[{"name":"secret"}]},"status":{"phase":"Running"}},` + `{"metadata":{"namespace":"longhorn-system","name":"pvc-backed-failed","creationTimestamp":"` + old + `","deletionTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"pvc-backed"}]},"spec":{"nodeName":"titan-12","volumes":[{"name":"data","persistentVolumeClaim":{"claimName":"data"}}]},"status":{"phase":"Failed"}},` + `{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-fresh","creationTimestamp":"` + recent + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}},` + `{"metadata":{"namespace":"maintenance","name":"stale-on-bad-node","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"maintenance"}]},"spec":{"nodeName":"titan-22"},"status":{"phase":"Unknown"}},` + @@ -325,6 +326,15 @@ func TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T) { return true }, }, + { + match: func(name string, args []string) bool { + if !matchContains("kubectl", "-n", "logging", "delete", "pod", "oauth2-proxy-terminating", "--wait=false", "--grace-period=0", "--force")(name, args) { + return false + } + deleted = append(deleted, "oauth2-proxy-terminating") + return true + }, + }, { match: func(name string, args []string) bool { if !matchContains("kubectl", "-n", "longhorn-system", "delete", "pod", "pvc-backed-failed", "--wait=false")(name, args) { @@ -342,7 +352,7 @@ func TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T) { if err := orch.recycleStuckControllerPods(context.Background()); err != nil { t.Fatalf("recycleStuckControllerPods failed: %v", err) } - if strings.Join(deleted, ",") != "longhorn-vault-sync-old,longhorn-vault-sync-failed,pvc-backed-failed" { + if strings.Join(deleted, ",") != "longhorn-vault-sync-old,longhorn-vault-sync-failed,oauth2-proxy-terminating,pvc-backed-failed" { t.Fatalf("expected only stale controller pods on Ready node to be recycled, got %#v", deleted) } } diff --git a/internal/cluster/orchestrator_workload_convergence.go b/internal/cluster/orchestrator_workload_convergence.go index c24cbee..d6943f8 100644 --- a/internal/cluster/orchestrator_workload_convergence.go +++ b/internal/cluster/orchestrator_workload_convergence.go @@ -224,6 +224,9 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error { if reason == "" { reason = stalePhaseReasons[ns+"/"+name] } + if reason == "" && staleControllerPodForceDeleteSafe(pod, grace) { + reason = "StaleDeletingControllerPod" + } if reason == "" { continue }