recovery: recycle stale unknown controller pods
This commit is contained in:
parent
4151254ba1
commit
566765696b
@ -158,6 +158,7 @@ func TestRecycleStuckControllerPodsHandlesLonghornAttachBlockedPods(t *testing.T
|
||||
{match: matchContains("kubectl", "get", "pods", "-A", "-o", "json"), out: pods},
|
||||
{match: matchContains("kubectl", "-n", "longhorn-system", "get", "nodes.longhorn.io"), out: "titan-0b\tFalse\n"},
|
||||
{match: matchContains("kubectl", "get", "events", "-A", "-o", "json"), out: events},
|
||||
{match: matchContains("kubectl", "get", "nodes", "-o", "json"), out: `{"items":[{"metadata":{"name":"titan-0b"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`},
|
||||
{
|
||||
match: func(name string, args []string) bool {
|
||||
if !matchContains("kubectl", "-n", "monitoring", "delete", "pod", "victoria-metrics-single-server-0", "--wait=false")(name, args) {
|
||||
@ -195,6 +196,7 @@ func TestRecycleStuckControllerPodsRepairsEncryptedVolumeCryptsetup(t *testing.T
|
||||
{match: matchContains("kubectl", "get", "pods", "-A", "-o", "json"), out: pods},
|
||||
{match: matchContains("kubectl", "-n", "longhorn-system", "get", "nodes.longhorn.io"), out: "titan-19\tTrue\n"},
|
||||
{match: matchContains("kubectl", "get", "events", "-A", "-o", "json"), out: events},
|
||||
{match: matchContains("kubectl", "get", "nodes", "-o", "json"), out: `{"items":[{"metadata":{"name":"titan-19"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`},
|
||||
{
|
||||
match: func(name string, args []string) bool {
|
||||
if name != "ssh" || !strings.Contains(strings.Join(args, " "), "apt-get install -y --no-install-recommends cryptsetup-bin") {
|
||||
@ -245,6 +247,7 @@ func TestRecycleStuckControllerPodsCordonsEncryptedVolumeNodeWhenRepairFails(t *
|
||||
{match: matchContains("kubectl", "get", "pods", "-A", "-o", "json"), out: pods},
|
||||
{match: matchContains("kubectl", "-n", "longhorn-system", "get", "nodes.longhorn.io"), out: "titan-19\tTrue\n"},
|
||||
{match: matchContains("kubectl", "get", "events", "-A", "-o", "json"), out: events},
|
||||
{match: matchContains("kubectl", "get", "nodes", "-o", "json"), out: `{"items":[{"metadata":{"name":"titan-19"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`},
|
||||
{
|
||||
match: matchContains("ssh", "apt-get install -y --no-install-recommends cryptsetup-bin"),
|
||||
err: errors.New("sudo: a password is required"),
|
||||
@ -280,6 +283,46 @@ func TestRecycleStuckControllerPodsCordonsEncryptedVolumeNodeWhenRepairFails(t *
|
||||
}
|
||||
}
|
||||
|
||||
// TestRecycleStuckControllerPodsHandlesUnknownPodsOnReadyNodes runs one orchestration or CLI step.
|
||||
// Signature: TestRecycleStuckControllerPodsHandlesUnknownPodsOnReadyNodes(t *testing.T).
|
||||
// Why: post-outage controller pods can remain Unknown after their node recovers;
|
||||
// normal deletion clears stale status without force-deleting or touching storage.
|
||||
func TestRecycleStuckControllerPodsHandlesUnknownPodsOnReadyNodes(t *testing.T) {
|
||||
old := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
|
||||
recent := time.Now().Add(-30 * time.Second).UTC().Format(time.RFC3339)
|
||||
pods := `{"items":[` +
|
||||
`{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-old","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}},` +
|
||||
`{"metadata":{"namespace":"longhorn-system","name":"longhorn-vault-sync-fresh","creationTimestamp":"` + recent + `","ownerReferences":[{"kind":"ReplicaSet","name":"longhorn-vault-sync"}]},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}},` +
|
||||
`{"metadata":{"namespace":"maintenance","name":"stale-on-bad-node","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"maintenance"}]},"spec":{"nodeName":"titan-22"},"status":{"phase":"Unknown"}},` +
|
||||
`{"metadata":{"namespace":"default","name":"bare-pod","creationTimestamp":"` + old + `"},"spec":{"nodeName":"titan-12"},"status":{"phase":"Unknown"}}]}`
|
||||
|
||||
deleted := []string{}
|
||||
orch := buildOrchestratorWithStubs(t, config.Config{
|
||||
Startup: config.Startup{StuckPodGraceSeconds: 180},
|
||||
}, []commandStub{
|
||||
{match: matchContains("kubectl", "get", "pods", "-A", "-o", "json"), out: pods},
|
||||
{match: matchContains("kubectl", "-n", "longhorn-system", "get", "nodes.longhorn.io"), out: "titan-12\tTrue\ntitan-22\tTrue\n"},
|
||||
{match: matchContains("kubectl", "get", "events", "-A", "-o", "json"), out: `{"items":[]}`},
|
||||
{match: matchContains("kubectl", "get", "nodes", "-o", "json"), out: `{"items":[{"metadata":{"name":"titan-12"},"status":{"conditions":[{"type":"Ready","status":"True"}]}},{"metadata":{"name":"titan-22"},"status":{"conditions":[{"type":"Ready","status":"False"}]}}]}`},
|
||||
{
|
||||
match: func(name string, args []string) bool {
|
||||
if !matchContains("kubectl", "-n", "longhorn-system", "delete", "pod", "longhorn-vault-sync-old", "--wait=false")(name, args) {
|
||||
return false
|
||||
}
|
||||
deleted = append(deleted, "longhorn-vault-sync-old")
|
||||
return true
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
if err := orch.recycleStuckControllerPods(context.Background()); err != nil {
|
||||
t.Fatalf("recycleStuckControllerPods failed: %v", err)
|
||||
}
|
||||
if len(deleted) != 1 || deleted[0] != "longhorn-vault-sync-old" {
|
||||
t.Fatalf("expected only old Unknown controller pod on Ready node to be recycled, got %#v", deleted)
|
||||
}
|
||||
}
|
||||
|
||||
// TestNewConstructsOrchestrator runs one orchestration or CLI step.
|
||||
// Signature: TestNewConstructsOrchestrator(t *testing.T).
|
||||
// Why: covers constructor path in orchestrator core module.
|
||||
|
||||
@ -182,6 +182,12 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
|
||||
} else {
|
||||
encryptedMountReasons = reasons
|
||||
}
|
||||
unknownPhaseReasons := map[string]string{}
|
||||
if reasons, scanErr := o.unknownControllerPodReasons(ctx, list, grace); scanErr != nil {
|
||||
o.log.Printf("warning: unknown controller pod scan failed: %v", scanErr)
|
||||
} else {
|
||||
unknownPhaseReasons = reasons
|
||||
}
|
||||
recycled := []string{}
|
||||
for _, pod := range list.Items {
|
||||
ns := strings.TrimSpace(pod.Metadata.Namespace)
|
||||
@ -215,6 +221,9 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
|
||||
if reason == "" {
|
||||
reason = encryptedMountReasons[ns+"/"+name]
|
||||
}
|
||||
if reason == "" {
|
||||
reason = unknownPhaseReasons[ns+"/"+name]
|
||||
}
|
||||
if reason == "" {
|
||||
continue
|
||||
}
|
||||
@ -233,6 +242,42 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// unknownControllerPodReasons runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) unknownControllerPodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error).
|
||||
// Why: after node or kubelet recovery, controller-owned pods can stay in
|
||||
// Unknown even though the node is Ready and a replacement may already be
|
||||
// healthy. A normal pod delete lets Kubernetes clean the stale status without
|
||||
// touching storage objects or forcing deletion on a partitioned node.
|
||||
func (o *Orchestrator) unknownControllerPodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error) {
|
||||
unavailable, err := o.unavailableNodeSet(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
reasons := map[string]string{}
|
||||
for _, pod := range pods.Items {
|
||||
ns := strings.TrimSpace(pod.Metadata.Namespace)
|
||||
name := strings.TrimSpace(pod.Metadata.Name)
|
||||
node := strings.TrimSpace(pod.Spec.NodeName)
|
||||
if ns == "" || name == "" || node == "" {
|
||||
continue
|
||||
}
|
||||
if !strings.EqualFold(strings.TrimSpace(pod.Status.Phase), "Unknown") {
|
||||
continue
|
||||
}
|
||||
if _, badNode := unavailable[node]; badNode {
|
||||
continue
|
||||
}
|
||||
if !podControllerOwned(pod) {
|
||||
continue
|
||||
}
|
||||
if !pod.Metadata.CreationTimestamp.IsZero() && time.Since(pod.Metadata.CreationTimestamp) < grace {
|
||||
continue
|
||||
}
|
||||
reasons[ns+"/"+name] = "UnknownControllerPodOnReadyNode:" + node
|
||||
}
|
||||
return reasons, nil
|
||||
}
|
||||
|
||||
// repairEncryptedVolumeMountPrereqs runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) repairEncryptedVolumeMountPrereqs(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error).
|
||||
// Why: encrypted Longhorn volume mounts depend on host cryptsetup. After node
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user