recovery: force clear safe deleting pods

This commit is contained in:
codex 2026-06-19 04:15:59 -03:00
parent 9031e09f4e
commit 707458cfc5
2 changed files with 16 additions and 3 deletions

View File

@@ -286,14 +286,15 @@ func TestRecycleStuckControllerPodsCordonsEncryptedVolumeNodeWhenRepairFails(t *
// TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes runs one orchestration or CLI step. // TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes runs one orchestration or CLI step.
// Signature: TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T). // Signature: TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T).
// Why: post-outage controller pods can remain Unknown or Failed after their // Why: post-outage controller pods can remain Unknown or Failed after their
// node recovers; normal deletion clears stale status without force-deleting or // node recovers; deletion clears stale status while force deletion stays away
// touching storage. // from PVC-backed storage.
func TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T) { func TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T) {
old := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339) old := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
recent := time.Now().Add(-30 * time.Second).UTC().Format(time.RFC3339) recent := time.Now().Add(-30 * time.Second).UTC().Format(time.RFC3339)
pods := `{"items":[` + pods := `{"items":[` +
`{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-old","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}},` + `{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-old","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}},` +
`{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-failed","creationTimestamp":"` + old + `","deletionTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12","volumes":[{"name":"secret"}]},"status":{"phase":"Failed"}},` + `{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-failed","creationTimestamp":"` + old + `","deletionTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12","volumes":[{"name":"secret"}]},"status":{"phase":"Failed"}},` +
`{"metadata":{"namespace":"logging","name":"oauth2-proxy-terminating","creationTimestamp":"` + old + `","deletionTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"oauth2-proxy-logs"}]},"spec":{"nodeName":"titan-18","volumes":[{"name":"secret"}]},"status":{"phase":"Running"}},` +
`{"metadata":{"namespace":"longhorn-system","name":"pvc-backed-failed","creationTimestamp":"` + old + `","deletionTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"pvc-backed"}]},"spec":{"nodeName":"titan-12","volumes":[{"name":"data","persistentVolumeClaim":{"claimName":"data"}}]},"status":{"phase":"Failed"}},` + `{"metadata":{"namespace":"longhorn-system","name":"pvc-backed-failed","creationTimestamp":"` + old + `","deletionTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"pvc-backed"}]},"spec":{"nodeName":"titan-12","volumes":[{"name":"data","persistentVolumeClaim":{"claimName":"data"}}]},"status":{"phase":"Failed"}},` +
`{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-fresh","creationTimestamp":"` + recent + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}},` + `{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-fresh","creationTimestamp":"` + recent + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}},` +
`{"metadata":{"namespace":"maintenance","name":"stale-on-bad-node","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"maintenance"}]},"spec":{"nodeName":"titan-22"},"status":{"phase":"Unknown"}},` + `{"metadata":{"namespace":"maintenance","name":"stale-on-bad-node","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"maintenance"}]},"spec":{"nodeName":"titan-22"},"status":{"phase":"Unknown"}},` +
@@ -325,6 +326,15 @@ func TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T) {
return true return true
}, },
}, },
{
match: func(name string, args []string) bool {
if !matchContains("kubectl", "-n", "logging", "delete", "pod", "oauth2-proxy-terminating", "--wait=false", "--grace-period=0", "--force")(name, args) {
return false
}
deleted = append(deleted, "oauth2-proxy-terminating")
return true
},
},
{ {
match: func(name string, args []string) bool { match: func(name string, args []string) bool {
if !matchContains("kubectl", "-n", "longhorn-system", "delete", "pod", "pvc-backed-failed", "--wait=false")(name, args) { if !matchContains("kubectl", "-n", "longhorn-system", "delete", "pod", "pvc-backed-failed", "--wait=false")(name, args) {
@@ -342,7 +352,7 @@ func TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T) {
if err := orch.recycleStuckControllerPods(context.Background()); err != nil { if err := orch.recycleStuckControllerPods(context.Background()); err != nil {
t.Fatalf("recycleStuckControllerPods failed: %v", err) t.Fatalf("recycleStuckControllerPods failed: %v", err)
} }
if strings.Join(deleted, ",") != "longhorn-vault-sync-old,longhorn-vault-sync-failed,pvc-backed-failed" { if strings.Join(deleted, ",") != "longhorn-vault-sync-old,longhorn-vault-sync-failed,oauth2-proxy-terminating,pvc-backed-failed" {
t.Fatalf("expected only stale controller pods on Ready node to be recycled, got %#v", deleted) t.Fatalf("expected only stale controller pods on Ready node to be recycled, got %#v", deleted)
} }
} }

View File

@@ -224,6 +224,9 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
if reason == "" { if reason == "" {
reason = stalePhaseReasons[ns+"/"+name] reason = stalePhaseReasons[ns+"/"+name]
} }
if reason == "" && staleControllerPodForceDeleteSafe(pod, grace) {
reason = "StaleDeletingControllerPod"
}
if reason == "" { if reason == "" {
continue continue
} }