recovery: force clear safe deleting pods
This commit is contained in:
parent
9031e09f4e
commit
707458cfc5
@ -286,14 +286,15 @@ func TestRecycleStuckControllerPodsCordonsEncryptedVolumeNodeWhenRepairFails(t *
|
|||||||
// TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes runs one orchestration or CLI step.
|
// TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes runs one orchestration or CLI step.
|
||||||
// Signature: TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T).
|
// Signature: TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T).
|
||||||
// Why: post-outage controller pods can remain Unknown or Failed after their
|
// Why: post-outage controller pods can remain Unknown or Failed after their
|
||||||
// node recovers; normal deletion clears stale status without force-deleting or
|
// node recovers; deletion clears stale status while force deletion stays away
|
||||||
// touching storage.
|
// from PVC-backed storage.
|
||||||
func TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T) {
|
func TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T) {
|
||||||
old := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
|
old := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
|
||||||
recent := time.Now().Add(-30 * time.Second).UTC().Format(time.RFC3339)
|
recent := time.Now().Add(-30 * time.Second).UTC().Format(time.RFC3339)
|
||||||
pods := `{"items":[` +
|
pods := `{"items":[` +
|
||||||
`{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-old","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}},` +
|
`{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-old","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}},` +
|
||||||
`{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-failed","creationTimestamp":"` + old + `","deletionTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12","volumes":[{"name":"secret"}]},"status":{"phase":"Failed"}},` +
|
`{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-failed","creationTimestamp":"` + old + `","deletionTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12","volumes":[{"name":"secret"}]},"status":{"phase":"Failed"}},` +
|
||||||
|
`{"metadata":{"namespace":"logging","name":"oauth2-proxy-terminating","creationTimestamp":"` + old + `","deletionTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"oauth2-proxy-logs"}]},"spec":{"nodeName":"titan-18","volumes":[{"name":"secret"}]},"status":{"phase":"Running"}},` +
|
||||||
`{"metadata":{"namespace":"longhorn-system","name":"pvc-backed-failed","creationTimestamp":"` + old + `","deletionTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"pvc-backed"}]},"spec":{"nodeName":"titan-12","volumes":[{"name":"data","persistentVolumeClaim":{"claimName":"data"}}]},"status":{"phase":"Failed"}},` +
|
`{"metadata":{"namespace":"longhorn-system","name":"pvc-backed-failed","creationTimestamp":"` + old + `","deletionTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"pvc-backed"}]},"spec":{"nodeName":"titan-12","volumes":[{"name":"data","persistentVolumeClaim":{"claimName":"data"}}]},"status":{"phase":"Failed"}},` +
|
||||||
`{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-fresh","creationTimestamp":"` + recent + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}},` +
|
`{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-fresh","creationTimestamp":"` + recent + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}},` +
|
||||||
`{"metadata":{"namespace":"maintenance","name":"stale-on-bad-node","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"maintenance"}]},"spec":{"nodeName":"titan-22"},"status":{"phase":"Unknown"}},` +
|
`{"metadata":{"namespace":"maintenance","name":"stale-on-bad-node","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"maintenance"}]},"spec":{"nodeName":"titan-22"},"status":{"phase":"Unknown"}},` +
|
||||||
@ -325,6 +326,15 @@ func TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T) {
|
|||||||
return true
|
return true
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
match: func(name string, args []string) bool {
|
||||||
|
if !matchContains("kubectl", "-n", "logging", "delete", "pod", "oauth2-proxy-terminating", "--wait=false", "--grace-period=0", "--force")(name, args) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
deleted = append(deleted, "oauth2-proxy-terminating")
|
||||||
|
return true
|
||||||
|
},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
match: func(name string, args []string) bool {
|
match: func(name string, args []string) bool {
|
||||||
if !matchContains("kubectl", "-n", "longhorn-system", "delete", "pod", "pvc-backed-failed", "--wait=false")(name, args) {
|
if !matchContains("kubectl", "-n", "longhorn-system", "delete", "pod", "pvc-backed-failed", "--wait=false")(name, args) {
|
||||||
@ -342,7 +352,7 @@ func TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T) {
|
|||||||
if err := orch.recycleStuckControllerPods(context.Background()); err != nil {
|
if err := orch.recycleStuckControllerPods(context.Background()); err != nil {
|
||||||
t.Fatalf("recycleStuckControllerPods failed: %v", err)
|
t.Fatalf("recycleStuckControllerPods failed: %v", err)
|
||||||
}
|
}
|
||||||
if strings.Join(deleted, ",") != "longhorn-vault-sync-old,longhorn-vault-sync-failed,pvc-backed-failed" {
|
if strings.Join(deleted, ",") != "longhorn-vault-sync-old,longhorn-vault-sync-failed,oauth2-proxy-terminating,pvc-backed-failed" {
|
||||||
t.Fatalf("expected only stale controller pods on Ready node to be recycled, got %#v", deleted)
|
t.Fatalf("expected only stale controller pods on Ready node to be recycled, got %#v", deleted)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -224,6 +224,9 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
|
|||||||
if reason == "" {
|
if reason == "" {
|
||||||
reason = stalePhaseReasons[ns+"/"+name]
|
reason = stalePhaseReasons[ns+"/"+name]
|
||||||
}
|
}
|
||||||
|
if reason == "" && staleControllerPodForceDeleteSafe(pod, grace) {
|
||||||
|
reason = "StaleDeletingControllerPod"
|
||||||
|
}
|
||||||
if reason == "" {
|
if reason == "" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user