recovery: clean failed stale controller pods
This commit is contained in:
parent
7f3a9c1428
commit
83723d0358
@ -283,15 +283,17 @@ func TestRecycleStuckControllerPodsCordonsEncryptedVolumeNodeWhenRepairFails(t *
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TestRecycleStuckControllerPodsHandlesUnknownPodsOnReadyNodes runs one orchestration or CLI step.
|
// TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes runs one orchestration or CLI step.
|
||||||
// Signature: TestRecycleStuckControllerPodsHandlesUnknownPodsOnReadyNodes(t *testing.T).
|
// Signature: TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T).
|
||||||
// Why: post-outage controller pods can remain Unknown after their node recovers;
|
// Why: post-outage controller pods can remain Unknown or Failed after their
|
||||||
// normal deletion clears stale status without force-deleting or touching storage.
|
// node recovers; normal deletion clears stale status without force-deleting or
|
||||||
func TestRecycleStuckControllerPodsHandlesUnknownPodsOnReadyNodes(t *testing.T) {
|
// touching storage.
|
||||||
|
func TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T) {
|
||||||
old := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
|
old := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
|
||||||
recent := time.Now().Add(-30 * time.Second).UTC().Format(time.RFC3339)
|
recent := time.Now().Add(-30 * time.Second).UTC().Format(time.RFC3339)
|
||||||
pods := `{"items":[` +
|
pods := `{"items":[` +
|
||||||
`{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-old","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}},` +
|
`{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-old","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}},` +
|
||||||
|
`{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-failed","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12"},"status":{"phase":"Failed"}},` +
|
||||||
`{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-fresh","creationTimestamp":"` + recent + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}},` +
|
`{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-fresh","creationTimestamp":"` + recent + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}},` +
|
||||||
`{"metadata":{"namespace":"maintenance","name":"stale-on-bad-node","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"maintenance"}]},"spec":{"nodeName":"titan-22"},"status":{"phase":"Unknown"}},` +
|
`{"metadata":{"namespace":"maintenance","name":"stale-on-bad-node","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"maintenance"}]},"spec":{"nodeName":"titan-22"},"status":{"phase":"Unknown"}},` +
|
||||||
`{"metadata":{"namespace":"default","name":"bare-pod","creationTimestamp":"` + old + `"},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}}]}`
|
`{"metadata":{"namespace":"default","name":"bare-pod","creationTimestamp":"` + old + `"},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}}]}`
|
||||||
@ -313,13 +315,22 @@ func TestRecycleStuckControllerPodsHandlesUnknownPodsOnReadyNodes(t *testing.T)
|
|||||||
return true
|
return true
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
match: func(name string, args []string) bool {
|
||||||
|
if !matchContains("kubectl", "-n", "longhorn-system", "delete", "pod", "longhorn-vault-sync-failed", "--wait=false")(name, args) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
deleted = append(deleted, "longhorn-vault-sync-failed")
|
||||||
|
return true
|
||||||
|
},
|
||||||
|
},
|
||||||
})
|
})
|
||||||
|
|
||||||
if err := orch.recycleStuckControllerPods(context.Background()); err != nil {
|
if err := orch.recycleStuckControllerPods(context.Background()); err != nil {
|
||||||
t.Fatalf("recycleStuckControllerPods failed: %v", err)
|
t.Fatalf("recycleStuckControllerPods failed: %v", err)
|
||||||
}
|
}
|
||||||
if len(deleted) != 1 || deleted[0] != "longhorn-vault-sync-old" {
|
if strings.Join(deleted, ",") != "longhorn-vault-sync-old,longhorn-vault-sync-failed" {
|
||||||
t.Fatalf("expected only old Unknown controller pod on Ready node to be recycled, got %#v", deleted)
|
t.Fatalf("expected only stale controller pods on Ready node to be recycled, got %#v", deleted)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -182,11 +182,11 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
|
|||||||
} else {
|
} else {
|
||||||
encryptedMountReasons = reasons
|
encryptedMountReasons = reasons
|
||||||
}
|
}
|
||||||
unknownPhaseReasons := map[string]string{}
|
stalePhaseReasons := map[string]string{}
|
||||||
if reasons, scanErr := o.unknownControllerPodReasons(ctx, list, grace); scanErr != nil {
|
if reasons, scanErr := o.staleControllerPodReasons(ctx, list, grace); scanErr != nil {
|
||||||
o.log.Printf("warning: unknown controller pod scan failed: %v", scanErr)
|
o.log.Printf("warning: stale controller pod scan failed: %v", scanErr)
|
||||||
} else {
|
} else {
|
||||||
unknownPhaseReasons = reasons
|
stalePhaseReasons = reasons
|
||||||
}
|
}
|
||||||
recycled := []string{}
|
recycled := []string{}
|
||||||
for _, pod := range list.Items {
|
for _, pod := range list.Items {
|
||||||
@ -222,7 +222,7 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
|
|||||||
reason = encryptedMountReasons[ns+"/"+name]
|
reason = encryptedMountReasons[ns+"/"+name]
|
||||||
}
|
}
|
||||||
if reason == "" {
|
if reason == "" {
|
||||||
reason = unknownPhaseReasons[ns+"/"+name]
|
reason = stalePhaseReasons[ns+"/"+name]
|
||||||
}
|
}
|
||||||
if reason == "" {
|
if reason == "" {
|
||||||
continue
|
continue
|
||||||
@ -242,13 +242,13 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// unknownControllerPodReasons runs one orchestration or CLI step.
|
// staleControllerPodReasons runs one orchestration or CLI step.
|
||||||
// Signature: (o *Orchestrator) unknownControllerPodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error).
|
// Signature: (o *Orchestrator) staleControllerPodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error).
|
||||||
// Why: after node or kubelet recovery, controller-owned pods can stay in
|
// Why: after node or kubelet recovery, controller-owned pods can stay in
|
||||||
// Unknown even though the node is Ready and a replacement may already be
|
// terminal or unknown status even though the node is Ready and a replacement may
|
||||||
// healthy. A normal pod delete lets Kubernetes clean the stale status without
|
// already be healthy. A normal pod delete lets Kubernetes clean the stale status
|
||||||
// touching storage objects or forcing deletion on a partitioned node.
|
// without touching storage objects or forcing deletion on a partitioned node.
|
||||||
func (o *Orchestrator) unknownControllerPodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error) {
|
func (o *Orchestrator) staleControllerPodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error) {
|
||||||
unavailable, err := o.unavailableNodeSet(ctx)
|
unavailable, err := o.unavailableNodeSet(ctx)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
@ -261,7 +261,8 @@ func (o *Orchestrator) unknownControllerPodReasons(ctx context.Context, pods pod
|
|||||||
if ns == "" || name == "" || node == "" {
|
if ns == "" || name == "" || node == "" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if !strings.EqualFold(strings.TrimSpace(pod.Status.Phase), "Unknown") {
|
phase := strings.TrimSpace(pod.Status.Phase)
|
||||||
|
if !strings.EqualFold(phase, "Unknown") && !strings.EqualFold(phase, "Failed") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if _, badNode := unavailable[node]; badNode {
|
if _, badNode := unavailable[node]; badNode {
|
||||||
@ -273,7 +274,7 @@ func (o *Orchestrator) unknownControllerPodReasons(ctx context.Context, pods pod
|
|||||||
if !pod.Metadata.CreationTimestamp.IsZero() && time.Since(pod.Metadata.CreationTimestamp) < grace {
|
if !pod.Metadata.CreationTimestamp.IsZero() && time.Since(pod.Metadata.CreationTimestamp) < grace {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
reasons[ns+"/"+name] = "UnknownControllerPodOnReadyNode:" + node
|
reasons[ns+"/"+name] = "StaleControllerPodOnReadyNode:" + node + ":" + phase
|
||||||
}
|
}
|
||||||
return reasons, nil
|
return reasons, nil
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user