recovery: quarantine container runtime wedge nodes

2026-06-19 04:25:39 -03:00 · 2026-06-19 04:25:39 -03:00 · e22a9150e9
commit e22a9150e9
parent 707458cfc5
2 changed files with 216 additions and 1 deletions
--- a/internal/cluster/orchestrator_unit_additional_test.go
+++ b/internal/cluster/orchestrator_unit_additional_test.go
@ -357,6 +357,71 @@ func TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T) {
 	}
 }
 // TestRecycleStuckControllerPodsCordonsContainerRuntimeWedgeNode runs one orchestration or CLI step.
 // Signature: TestRecycleStuckControllerPodsCordonsContainerRuntimeWedgeNode(t *testing.T).
 // Why: a Ready node with a wedged container runtime can trap replacement pods
 // indefinitely; startup should cordon that scheduler target without draining it
 // or touching Longhorn data-plane objects.
 func TestRecycleStuckControllerPodsCordonsContainerRuntimeWedgeNode(t *testing.T) {
 	old := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
 	lastSeen := time.Now().UTC().Format(time.RFC3339)
 	pods := `{"items":[` +
 		`{"metadata":{"namespace":"logging","name":"oauth2-proxy-bad","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"oauth2-proxy"}]},"spec":{"nodeName":"titan-18","volumes":[{"name":"scratch"}]},"status":{"phase":"Pending","containerStatuses":[{"name":"oauth2-proxy","state":{"waiting":{"reason":"CreateContainerError"}}}]}},` +
 		`{"metadata":{"namespace":"monitoring","name":"suite-probe-bad","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"Job","name":"suite-probe"}]},"spec":{"nodeName":"titan-18","volumes":[{"name":"scratch"}]},"status":{"phase":"Pending","containerStatuses":[{"name":"probe","state":{"waiting":{"reason":"CreateContainerError"}}}]}},` +
 		`{"metadata":{"namespace":"sso","name":"secret-ensure-bad","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"Job","name":"secret-ensure"}]},"spec":{"nodeName":"titan-18","volumes":[{"name":"scratch"}]},"status":{"phase":"Pending","initContainerStatuses":[{"name":"init","state":{"waiting":{"reason":"CreateContainerError"}}}]}},` +
 		`{"metadata":{"namespace":"finance","name":"single-node-bad","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"single"}]},"spec":{"nodeName":"titan-19","volumes":[{"name":"scratch"}]},"status":{"phase":"Pending","containerStatuses":[{"name":"app","state":{"waiting":{"reason":"CreateContainerError"}}}]}}]}`
 	events := `{"items":[` +
 		`{"metadata":{"namespace":"logging","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"logging","name":"oauth2-proxy-bad"},"type":"Warning","reason":"Failed","message":"spec.containers{oauth2-proxy}: Error: failed to reserve container name oauth2-proxy_logging","lastTimestamp":"` + lastSeen + `"},` +
 		`{"metadata":{"namespace":"monitoring","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"suite-probe-bad"},"type":"Warning","reason":"Failed","message":"spec.containers{probe}: Error: context deadline exceeded","lastTimestamp":"` + lastSeen + `"},` +
 		`{"metadata":{"namespace":"sso","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"sso","name":"secret-ensure-bad"},"type":"Warning","reason":"Failed","message":"spec.initContainers{init}: Error: failed to reserve container name init_sso","lastTimestamp":"` + lastSeen + `"},` +
 		`{"metadata":{"namespace":"finance","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"finance","name":"single-node-bad"},"type":"Warning","reason":"Failed","message":"spec.containers{app}: Error: failed to reserve container name app_finance","lastTimestamp":"` + lastSeen + `"}]}`
 	cordoned := []string{}
 	deleted := []string{}
 	orch := buildOrchestratorWithStubs(t, config.Config{
 		Startup: config.Startup{StuckPodGraceSeconds: 180},
 	}, []commandStub{
 		{match: matchContains("kubectl", "get", "pods", "-A", "-o", "json"), out: pods},
 		{match: matchContains("kubectl", "-n", "longhorn-system", "get", "nodes.longhorn.io"), out: ""},
 		{match: matchContains("kubectl", "get", "events", "-A", "-o", "json"), out: events},
 		{match: matchContains("kubectl", "get", "nodes", "-o", "json"), out: `{"items":[{"metadata":{"name":"titan-18"},"status":{"conditions":[{"type":"Ready","status":"True"}]}},{"metadata":{"name":"titan-19"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`},
 		{
 			match: func(name string, args []string) bool {
 				if !matchContains("kubectl", "cordon")(name, args) {
 					return false
 				}
 				cordoned = append(cordoned, args[len(args)-1])
 				return true
 			},
 		},
 		{
 			match: func(name string, args []string) bool {
 				if !matchContains("kubectl", "delete", "pod", "--wait=false")(name, args) {
 					return false
 				}
 				joined := strings.Join(args, " ")
 				if strings.Contains(joined, "--force") {
 					t.Fatalf("container-runtime wedge recycle must not force-delete fresh pods")
 				}
 				if len(args) >= 5 {
 					deleted = append(deleted, args[4])
 				}
 				return true
 			},
 		},
 	})
 	if err := orch.recycleStuckControllerPods(context.Background()); err != nil {
 		t.Fatalf("recycleStuckControllerPods failed: %v", err)
 	}
 	if strings.Join(cordoned, ",") != "titan-18" {
 		t.Fatalf("expected only titan-18 to be cordoned, got %#v", cordoned)
 	}
 	if strings.Join(deleted, ",") != "oauth2-proxy-bad,suite-probe-bad,secret-ensure-bad,single-node-bad" {
 		t.Fatalf("expected runtime-wedged pods to be recycled, got %#v", deleted)
 	}
 }
 // TestEffectiveWorkersFiltersIgnoredUnavailableNodes runs one orchestration or CLI step.
 // Signature: TestEffectiveWorkersFiltersIgnoredUnavailableNodes(t *testing.T).
 // Why: ignored unavailable nodes should be excluded before startup tries SSH,
--- a/internal/cluster/orchestrator_workload_convergence.go
+++ b/internal/cluster/orchestrator_workload_convergence.go
@ -188,6 +188,13 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
 	} else {
 		stalePhaseReasons = reasons
 	}
 	containerRuntimeWedgeReasons := map[string]string{}
 	if reasons, scanErr := o.containerRuntimeWedgePodReasons(ctx, list, grace); scanErr != nil {
 		o.log.Printf("warning: container runtime wedge scan failed: %v", scanErr)
 	} else {
 		containerRuntimeWedgeReasons = reasons
 		o.quarantineContainerRuntimeWedgeNodes(ctx, list, reasons, grace, ignoredNamespaces, ignoredNodes, ignoreRules)
 	}
 	recycled := []string{}
 	for _, pod := range list.Items {
 		ns := strings.TrimSpace(pod.Metadata.Namespace)
@ -224,6 +231,9 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
 		if reason == "" {
 			reason = stalePhaseReasons[ns+"/"+name]
 		}
 		if runtimeReason := containerRuntimeWedgeReasons[ns+"/"+name]; runtimeReason != "" {
 			reason = runtimeReason
 		}
 		if reason == "" && staleControllerPodForceDeleteSafe(pod, grace) {
 			reason = "StaleDeletingControllerPod"
 		}
@ -254,6 +264,146 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
 	return nil
 }
 // containerRuntimeWedgePodReasons maps "namespace/name" pod keys to a
 // "ContainerRuntimeWedge:<node>" reason for pods that show the container
 // runtime wedge symptom.
 // Signature: (o *Orchestrator) containerRuntimeWedgePodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error).
 // Why: after a power event, a node-local container runtime can reserve names and
 // fail every new container start while Kubernetes still reports the node Ready.
 // Detecting the runtime symptom lets startup move work elsewhere without
 // restarting the node or touching storage objects.
 //
 // A pod is a candidate when it has a namespace, name and node, is Pending, is
 // controller-owned per podControllerOwned, is at least grace old (a zero
 // creation timestamp is treated as old enough), and stuckContainerReason
 // returns a non-empty value for the CreateContainerError/RunContainerError set.
 // A candidate is reported only when a Warning event with reason "Failed" on
 // that Pod, not observed before the pod was created, carries one of the
 // runtime wedge message signatures.
 //
 // The cluster-wide events query only runs when at least one candidate pod
 // exists, so healthy passes neither pay for the event dump nor fail on it.
 // Returns a non-nil (possibly empty) map on success, or a nil map and an error
 // when the events query or its JSON decoding fails.
 func (o *Orchestrator) containerRuntimeWedgePodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error) {
 	runtimeReasons := map[string]struct{}{
 		"CreateContainerError": {},
 		"RunContainerError":    {},
 	}
 	// Filter pods first: this needs no API traffic and usually leaves nothing.
 	podsByKey := map[string]podResource{}
 	for _, pod := range pods.Items {
 		ns := strings.TrimSpace(pod.Metadata.Namespace)
 		name := strings.TrimSpace(pod.Metadata.Name)
 		node := strings.TrimSpace(pod.Spec.NodeName)
 		if ns == "" || name == "" || node == "" {
 			continue
 		}
 		if !strings.EqualFold(strings.TrimSpace(pod.Status.Phase), "Pending") {
 			continue
 		}
 		if !podControllerOwned(pod) {
 			continue
 		}
 		if !pod.Metadata.CreationTimestamp.IsZero() && time.Since(pod.Metadata.CreationTimestamp) < grace {
 			continue
 		}
 		if stuckContainerReason(pod, runtimeReasons) == "" {
 			continue
 		}
 		podsByKey[ns+"/"+name] = pod
 	}
 	if len(podsByKey) == 0 {
 		return map[string]string{}, nil
 	}
 	eventsOut, err := o.kubectl(ctx, 30*time.Second, "get", "events", "-A", "-o", "json")
 	if err != nil {
 		return nil, fmt.Errorf("query events for container runtime wedge scan: %w", err)
 	}
 	var events eventList
 	if err := json.Unmarshal([]byte(eventsOut), &events); err != nil {
 		return nil, fmt.Errorf("decode events for container runtime wedge scan: %w", err)
 	}
 	reasons := make(map[string]string, len(podsByKey))
 	for _, event := range events.Items {
 		if !strings.EqualFold(strings.TrimSpace(event.Type), "Warning") {
 			continue
 		}
 		if strings.TrimSpace(event.Reason) != "Failed" {
 			continue
 		}
 		if !strings.EqualFold(strings.TrimSpace(event.InvolvedObject.Kind), "Pod") {
 			continue
 		}
 		key := strings.TrimSpace(event.InvolvedObject.Namespace) + "/" + strings.TrimSpace(event.InvolvedObject.Name)
 		pod, ok := podsByKey[key]
 		if !ok {
 			continue
 		}
 		// The reason depends only on the pod, so one matching event is enough.
 		if _, attributed := reasons[key]; attributed {
 			continue
 		}
 		// Ignore events left over from an earlier pod that used the same name.
 		lastSeen := eventLastObservedAt(event)
 		if !lastSeen.IsZero() && !pod.Metadata.CreationTimestamp.IsZero() && lastSeen.Before(pod.Metadata.CreationTimestamp) {
 			continue
 		}
 		message := strings.ToLower(strings.TrimSpace(event.Message))
 		if !strings.Contains(message, "failed to reserve container name") &&
 			!strings.Contains(message, " is reserved for ") &&
 			!strings.Contains(message, "context deadline exceeded") {
 			continue
 		}
 		reasons[key] = "ContainerRuntimeWedge:" + strings.TrimSpace(pod.Spec.NodeName)
 	}
 	return reasons, nil
 }
 // quarantineContainerRuntimeWedgeNodes cordons every node that hosts at least
 // two qualifying runtime-wedged pods and records the action as a startup
 // auto-heal note.
 // Signature: (o *Orchestrator) quarantineContainerRuntimeWedgeNodes(ctx context.Context, pods podList, reasons map[string]string, grace time.Duration, ignoredNamespaces map[string]struct{}, ignoredNodes map[string]struct{}, ignoreRules []workloadIgnoreRule).
 // Why: cordoning a proven-bad start node is scheduler-only; it prevents fresh
 // non-storage pods from being trapped while leaving running workloads and
 // Longhorn data-plane state alone.
 //
 // A pod counts toward its node when it has an entry in reasons, is not in an
 // ignored namespace, is not matched by workloadIgnored or podTargetsIgnoredNode,
 // is controller-owned, is at least grace old, and does not use a
 // PersistentVolumeClaim per podUsesPersistentVolumeClaim. The only command
 // issued is `kubectl cordon <node>`; a failed cordon is logged and skipped.
 // Nodes are cordoned in sorted name order so the command sequence and log
 // output are reproducible instead of following map iteration order.
 func (o *Orchestrator) quarantineContainerRuntimeWedgeNodes(ctx context.Context, pods podList, reasons map[string]string, grace time.Duration, ignoredNamespaces map[string]struct{}, ignoredNodes map[string]struct{}, ignoreRules []workloadIgnoreRule) {
 	if len(reasons) == 0 {
 		return
 	}
 	// One wedged pod may be a workload problem; require several per node.
 	const minRuntimeWedgePodsPerNode = 2
 	byNode := map[string][]string{}
 	for _, pod := range pods.Items {
 		ns := strings.TrimSpace(pod.Metadata.Namespace)
 		name := strings.TrimSpace(pod.Metadata.Name)
 		node := strings.TrimSpace(pod.Spec.NodeName)
 		if ns == "" || name == "" || node == "" {
 			continue
 		}
 		key := ns + "/" + name
 		if reasons[key] == "" {
 			continue
 		}
 		if _, ok := ignoredNamespaces[ns]; ok {
 			continue
 		}
 		if workloadIgnored(ignoreRules, ns, "", name) {
 			continue
 		}
 		if podTargetsIgnoredNode(pod, ignoredNodes) {
 			continue
 		}
 		if !podControllerOwned(pod) {
 			continue
 		}
 		if !pod.Metadata.CreationTimestamp.IsZero() && time.Since(pod.Metadata.CreationTimestamp) < grace {
 			continue
 		}
 		if podUsesPersistentVolumeClaim(pod) {
 			continue
 		}
 		byNode[node] = append(byNode[node], key)
 	}
 	// Go randomizes map iteration, so fix the cordon order explicitly.
 	nodes := make([]string, 0, len(byNode))
 	for node, keys := range byNode {
 		if len(keys) >= minRuntimeWedgePodsPerNode {
 			nodes = append(nodes, node)
 		}
 	}
 	sort.Strings(nodes)
 	quarantined := make([]string, 0, len(nodes))
 	for _, node := range nodes {
 		keys := byNode[node]
 		sort.Strings(keys)
 		if _, err := o.kubectl(ctx, 30*time.Second, "cordon", node); err != nil {
 			o.log.Printf("warning: cordon container-runtime-wedged node %s failed: %v", node, err)
 			continue
 		}
 		o.log.Printf("warning: cordoned node %s after repeated container runtime start failures: %s", node, joinLimited(keys, 8))
 		quarantined = append(quarantined, fmt.Sprintf("%s pods=%d", node, len(keys)))
 	}
 	if len(quarantined) == 0 {
 		return
 	}
 	sort.Strings(quarantined)
 	o.noteStartupAutoHeal(fmt.Sprintf("cordoned container-runtime-wedged node(s): %s", joinLimited(quarantined, 8)))
 }
 // staleControllerPodReasons runs one orchestration or CLI step.
 // Signature: (o *Orchestrator) staleControllerPodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error).
 // Why: after node or kubelet recovery, controller-owned pods can stay in
@ -573,7 +723,7 @@ func (o *Orchestrator) longhornUnreadyNodes(ctx context.Context) (map[string]str
 func podControllerOwned(p podResource) bool {
 	for _, owner := range p.Metadata.OwnerReferences {
 		switch strings.TrimSpace(owner.Kind) {
-		case "ReplicaSet", "StatefulSet", "DaemonSet":
+		case "ReplicaSet", "StatefulSet", "DaemonSet", "Job":
 			return true
 		}
 	}