recovery: quarantine container runtime wedge nodes

This commit is contained in:
codex 2026-06-19 04:25:39 -03:00
parent 707458cfc5
commit e22a9150e9
2 changed files with 216 additions and 1 deletions

View File

@ -357,6 +357,71 @@ func TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T) {
} }
} }
// TestRecycleStuckControllerPodsCordonsContainerRuntimeWedgeNode runs one orchestration or CLI step.
// Signature: TestRecycleStuckControllerPodsCordonsContainerRuntimeWedgeNode(t *testing.T).
// Why: a Ready node with a wedged container runtime can trap replacement pods
// indefinitely; startup should cordon that scheduler target without draining it
// or touching Longhorn data-plane objects.
func TestRecycleStuckControllerPodsCordonsContainerRuntimeWedgeNode(t *testing.T) {
old := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
lastSeen := time.Now().UTC().Format(time.RFC3339)
pods := `{"items":[` +
`{"metadata":{"namespace":"logging","name":"oauth2-proxy-bad","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"oauth2-proxy"}]},"spec":{"nodeName":"titan-18","volumes":[{"name":"scratch"}]},"status":{"phase":"Pending","containerStatuses":[{"name":"oauth2-proxy","state":{"waiting":{"reason":"CreateContainerError"}}}]}},` +
`{"metadata":{"namespace":"monitoring","name":"suite-probe-bad","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"Job","name":"suite-probe"}]},"spec":{"nodeName":"titan-18","volumes":[{"name":"scratch"}]},"status":{"phase":"Pending","containerStatuses":[{"name":"probe","state":{"waiting":{"reason":"CreateContainerError"}}}]}},` +
`{"metadata":{"namespace":"sso","name":"secret-ensure-bad","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"Job","name":"secret-ensure"}]},"spec":{"nodeName":"titan-18","volumes":[{"name":"scratch"}]},"status":{"phase":"Pending","initContainerStatuses":[{"name":"init","state":{"waiting":{"reason":"CreateContainerError"}}}]}},` +
`{"metadata":{"namespace":"finance","name":"single-node-bad","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"single"}]},"spec":{"nodeName":"titan-19","volumes":[{"name":"scratch"}]},"status":{"phase":"Pending","containerStatuses":[{"name":"app","state":{"waiting":{"reason":"CreateContainerError"}}}]}}]}`
events := `{"items":[` +
`{"metadata":{"namespace":"logging","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"logging","name":"oauth2-proxy-bad"},"type":"Warning","reason":"Failed","message":"spec.containers{oauth2-proxy}: Error: failed to reserve container name oauth2-proxy_logging","lastTimestamp":"` + lastSeen + `"},` +
`{"metadata":{"namespace":"monitoring","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"suite-probe-bad"},"type":"Warning","reason":"Failed","message":"spec.containers{probe}: Error: context deadline exceeded","lastTimestamp":"` + lastSeen + `"},` +
`{"metadata":{"namespace":"sso","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"sso","name":"secret-ensure-bad"},"type":"Warning","reason":"Failed","message":"spec.initContainers{init}: Error: failed to reserve container name init_sso","lastTimestamp":"` + lastSeen + `"},` +
`{"metadata":{"namespace":"finance","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"finance","name":"single-node-bad"},"type":"Warning","reason":"Failed","message":"spec.containers{app}: Error: failed to reserve container name app_finance","lastTimestamp":"` + lastSeen + `"}]}`
cordoned := []string{}
deleted := []string{}
orch := buildOrchestratorWithStubs(t, config.Config{
Startup: config.Startup{StuckPodGraceSeconds: 180},
}, []commandStub{
{match: matchContains("kubectl", "get", "pods", "-A", "-o", "json"), out: pods},
{match: matchContains("kubectl", "-n", "longhorn-system", "get", "nodes.longhorn.io"), out: ""},
{match: matchContains("kubectl", "get", "events", "-A", "-o", "json"), out: events},
{match: matchContains("kubectl", "get", "nodes", "-o", "json"), out: `{"items":[{"metadata":{"name":"titan-18"},"status":{"conditions":[{"type":"Ready","status":"True"}]}},{"metadata":{"name":"titan-19"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`},
{
match: func(name string, args []string) bool {
if !matchContains("kubectl", "cordon")(name, args) {
return false
}
cordoned = append(cordoned, args[len(args)-1])
return true
},
},
{
match: func(name string, args []string) bool {
if !matchContains("kubectl", "delete", "pod", "--wait=false")(name, args) {
return false
}
joined := strings.Join(args, " ")
if strings.Contains(joined, "--force") {
t.Fatalf("container-runtime wedge recycle must not force-delete fresh pods")
}
if len(args) >= 5 {
deleted = append(deleted, args[4])
}
return true
},
},
})
if err := orch.recycleStuckControllerPods(context.Background()); err != nil {
t.Fatalf("recycleStuckControllerPods failed: %v", err)
}
if strings.Join(cordoned, ",") != "titan-18" {
t.Fatalf("expected only titan-18 to be cordoned, got %#v", cordoned)
}
if strings.Join(deleted, ",") != "oauth2-proxy-bad,suite-probe-bad,secret-ensure-bad,single-node-bad" {
t.Fatalf("expected runtime-wedged pods to be recycled, got %#v", deleted)
}
}
// TestEffectiveWorkersFiltersIgnoredUnavailableNodes runs one orchestration or CLI step. // TestEffectiveWorkersFiltersIgnoredUnavailableNodes runs one orchestration or CLI step.
// Signature: TestEffectiveWorkersFiltersIgnoredUnavailableNodes(t *testing.T). // Signature: TestEffectiveWorkersFiltersIgnoredUnavailableNodes(t *testing.T).
// Why: ignored unavailable nodes should be excluded before startup tries SSH, // Why: ignored unavailable nodes should be excluded before startup tries SSH,

View File

@ -188,6 +188,13 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
} else { } else {
stalePhaseReasons = reasons stalePhaseReasons = reasons
} }
containerRuntimeWedgeReasons := map[string]string{}
if reasons, scanErr := o.containerRuntimeWedgePodReasons(ctx, list, grace); scanErr != nil {
o.log.Printf("warning: container runtime wedge scan failed: %v", scanErr)
} else {
containerRuntimeWedgeReasons = reasons
o.quarantineContainerRuntimeWedgeNodes(ctx, list, reasons, grace, ignoredNamespaces, ignoredNodes, ignoreRules)
}
recycled := []string{} recycled := []string{}
for _, pod := range list.Items { for _, pod := range list.Items {
ns := strings.TrimSpace(pod.Metadata.Namespace) ns := strings.TrimSpace(pod.Metadata.Namespace)
@ -224,6 +231,9 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
if reason == "" { if reason == "" {
reason = stalePhaseReasons[ns+"/"+name] reason = stalePhaseReasons[ns+"/"+name]
} }
if runtimeReason := containerRuntimeWedgeReasons[ns+"/"+name]; runtimeReason != "" {
reason = runtimeReason
}
if reason == "" && staleControllerPodForceDeleteSafe(pod, grace) { if reason == "" && staleControllerPodForceDeleteSafe(pod, grace) {
reason = "StaleDeletingControllerPod" reason = "StaleDeletingControllerPod"
} }
@ -254,6 +264,146 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
return nil return nil
} }
// containerRuntimeWedgePodReasons maps "namespace/name" pod keys to a
// "ContainerRuntimeWedge:<node>" reason for pods whose failed starts carry the
// event signature of a wedged node-local container runtime.
// Signature: (o *Orchestrator) containerRuntimeWedgePodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error).
// Why: after a power event, a node-local container runtime can reserve names and
// fail every new container start while Kubernetes still reports the node Ready.
// Detecting the runtime symptom lets startup move work elsewhere without
// restarting the node or touching storage objects.
//
// A pod is a candidate when it has a namespace, name and node, is Pending, is
// controller-owned, is not younger than grace (a zero creation timestamp is
// not treated as young), and stuckContainerReason reports a match against
// CreateContainerError or RunContainerError. A candidate is flagged when a
// Warning/Failed Pod event that is not older than the pod mentions a reserved
// container name or "context deadline exceeded".
//
// Returns an empty, non-nil map when nothing is flagged. Returns an error only
// when candidates exist and the events query or its JSON decoding fails.
func (o *Orchestrator) containerRuntimeWedgePodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error) {
	runtimeReasons := map[string]struct{}{
		"CreateContainerError": {},
		"RunContainerError":    {},
	}
	podsByKey := map[string]podResource{}
	for _, pod := range pods.Items {
		ns := strings.TrimSpace(pod.Metadata.Namespace)
		name := strings.TrimSpace(pod.Metadata.Name)
		node := strings.TrimSpace(pod.Spec.NodeName)
		if ns == "" || name == "" || node == "" {
			continue
		}
		if !strings.EqualFold(strings.TrimSpace(pod.Status.Phase), "Pending") {
			continue
		}
		if !podControllerOwned(pod) {
			continue
		}
		if !pod.Metadata.CreationTimestamp.IsZero() && time.Since(pod.Metadata.CreationTimestamp) < grace {
			continue
		}
		if stuckContainerReason(pod, runtimeReasons) == "" {
			continue
		}
		podsByKey[ns+"/"+name] = pod
	}
	// Skip the cluster-wide events query entirely when no pod is a candidate:
	// a healthy cluster should not pay for it, nor log a scan warning when it
	// fails.
	if len(podsByKey) == 0 {
		return map[string]string{}, nil
	}

	eventsOut, err := o.kubectl(ctx, 30*time.Second, "get", "events", "-A", "-o", "json")
	if err != nil {
		return nil, fmt.Errorf("query events for container runtime wedge scan: %w", err)
	}
	var events eventList
	if err := json.Unmarshal([]byte(eventsOut), &events); err != nil {
		return nil, fmt.Errorf("decode events for container runtime wedge scan: %w", err)
	}

	reasons := map[string]string{}
	for _, event := range events.Items {
		if !strings.EqualFold(strings.TrimSpace(event.Type), "Warning") {
			continue
		}
		if strings.TrimSpace(event.Reason) != "Failed" {
			continue
		}
		if !strings.EqualFold(strings.TrimSpace(event.InvolvedObject.Kind), "Pod") {
			continue
		}
		key := strings.TrimSpace(event.InvolvedObject.Namespace) + "/" + strings.TrimSpace(event.InvolvedObject.Name)
		pod, ok := podsByKey[key]
		if !ok {
			continue
		}
		// Ignore events last observed before this pod was created; they
		// cannot describe this pod's own start attempts.
		lastSeen := eventLastObservedAt(event)
		if !lastSeen.IsZero() && !pod.Metadata.CreationTimestamp.IsZero() && lastSeen.Before(pod.Metadata.CreationTimestamp) {
			continue
		}
		message := strings.ToLower(strings.TrimSpace(event.Message))
		if !strings.Contains(message, "failed to reserve container name") &&
			!strings.Contains(message, " is reserved for ") &&
			!strings.Contains(message, "context deadline exceeded") {
			continue
		}
		reasons[key] = "ContainerRuntimeWedge:" + strings.TrimSpace(pod.Spec.NodeName)
	}
	return reasons, nil
}
// quarantineContainerRuntimeWedgeNodes cordons every node that hosts at least
// two qualifying runtime-wedged pods and records one auto-heal note listing
// the nodes it cordoned.
// Signature: (o *Orchestrator) quarantineContainerRuntimeWedgeNodes(ctx context.Context, pods podList, reasons map[string]string, grace time.Duration, ignoredNamespaces map[string]struct{}, ignoredNodes map[string]struct{}, ignoreRules []workloadIgnoreRule).
// Why: cordoning a proven-bad start node is scheduler-only; it prevents fresh
// non-storage pods from being trapped while leaving running workloads and
// Longhorn data-plane state alone.
//
// A pod counts toward its node when it has a non-empty entry in reasons, is not
// excluded by ignoredNamespaces, ignoreRules or ignoredNodes, is
// controller-owned, is not younger than grace, and does not use a persistent
// volume claim. Nodes are cordoned in sorted name order so the issued commands
// and log lines are deterministic. A failed cordon is logged and skipped; it
// does not stop the remaining nodes from being processed.
func (o *Orchestrator) quarantineContainerRuntimeWedgeNodes(ctx context.Context, pods podList, reasons map[string]string, grace time.Duration, ignoredNamespaces map[string]struct{}, ignoredNodes map[string]struct{}, ignoreRules []workloadIgnoreRule) {
	if len(reasons) == 0 {
		return
	}
	// A single failing pod is not enough evidence against the node itself.
	const minRuntimeWedgePodsPerNode = 2
	byNode := map[string][]string{}
	for _, pod := range pods.Items {
		ns := strings.TrimSpace(pod.Metadata.Namespace)
		name := strings.TrimSpace(pod.Metadata.Name)
		node := strings.TrimSpace(pod.Spec.NodeName)
		if ns == "" || name == "" || node == "" {
			continue
		}
		key := ns + "/" + name
		if reasons[key] == "" {
			continue
		}
		if _, ok := ignoredNamespaces[ns]; ok {
			continue
		}
		if workloadIgnored(ignoreRules, ns, "", name) {
			continue
		}
		if podTargetsIgnoredNode(pod, ignoredNodes) {
			continue
		}
		if !podControllerOwned(pod) {
			continue
		}
		if !pod.Metadata.CreationTimestamp.IsZero() && time.Since(pod.Metadata.CreationTimestamp) < grace {
			continue
		}
		if podUsesPersistentVolumeClaim(pod) {
			continue
		}
		byNode[node] = append(byNode[node], key)
	}

	// Map iteration order is random; collect and sort the qualifying nodes so
	// cordon commands and warnings are issued in a stable order.
	nodes := make([]string, 0, len(byNode))
	for node, keys := range byNode {
		if len(keys) >= minRuntimeWedgePodsPerNode {
			nodes = append(nodes, node)
		}
	}
	sort.Strings(nodes)

	quarantined := []string{}
	for _, node := range nodes {
		keys := byNode[node]
		sort.Strings(keys)
		if _, err := o.kubectl(ctx, 30*time.Second, "cordon", node); err != nil {
			o.log.Printf("warning: cordon container-runtime-wedged node %s failed: %v", node, err)
			continue
		}
		o.log.Printf("warning: cordoned node %s after repeated container runtime start failures: %s", node, joinLimited(keys, 8))
		quarantined = append(quarantined, fmt.Sprintf("%s pods=%d", node, len(keys)))
	}
	if len(quarantined) == 0 {
		return
	}
	sort.Strings(quarantined)
	o.noteStartupAutoHeal(fmt.Sprintf("cordoned container-runtime-wedged node(s): %s", joinLimited(quarantined, 8)))
}
// staleControllerPodReasons runs one orchestration or CLI step. // staleControllerPodReasons runs one orchestration or CLI step.
// Signature: (o *Orchestrator) staleControllerPodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error). // Signature: (o *Orchestrator) staleControllerPodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error).
// Why: after node or kubelet recovery, controller-owned pods can stay in // Why: after node or kubelet recovery, controller-owned pods can stay in
@ -573,7 +723,7 @@ func (o *Orchestrator) longhornUnreadyNodes(ctx context.Context) (map[string]str
func podControllerOwned(p podResource) bool { func podControllerOwned(p podResource) bool {
for _, owner := range p.Metadata.OwnerReferences { for _, owner := range p.Metadata.OwnerReferences {
switch strings.TrimSpace(owner.Kind) { switch strings.TrimSpace(owner.Kind) {
case "ReplicaSet", "StatefulSet", "DaemonSet": case "ReplicaSet", "StatefulSet", "DaemonSet", "Job":
return true return true
} }
} }