diff --git a/internal/cluster/orchestrator_unit_additional_test.go b/internal/cluster/orchestrator_unit_additional_test.go
index 1aafade..f03a562 100644
--- a/internal/cluster/orchestrator_unit_additional_test.go
+++ b/internal/cluster/orchestrator_unit_additional_test.go
@@ -357,6 +357,71 @@ func TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T) {
 	}
 }
 
+// TestRecycleStuckControllerPodsCordonsContainerRuntimeWedgeNode verifies that only a node with repeated runtime-wedged pods is cordoned while every wedged pod is recycled without --force.
+// Signature: TestRecycleStuckControllerPodsCordonsContainerRuntimeWedgeNode(t *testing.T).
+// Why: a Ready node with a wedged container runtime can trap replacement pods
+// indefinitely; startup should cordon that scheduler target without draining it
+// or touching Longhorn data-plane objects.
+func TestRecycleStuckControllerPodsCordonsContainerRuntimeWedgeNode(t *testing.T) {
+	old := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339) // well past the 180s StuckPodGraceSeconds configured below
+	lastSeen := time.Now().UTC().Format(time.RFC3339)
+	pods := `{"items":[` + // three wedged pods on titan-18, one on titan-19
+		`{"metadata":{"namespace":"logging","name":"oauth2-proxy-bad","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"oauth2-proxy"}]},"spec":{"nodeName":"titan-18","volumes":[{"name":"scratch"}]},"status":{"phase":"Pending","containerStatuses":[{"name":"oauth2-proxy","state":{"waiting":{"reason":"CreateContainerError"}}}]}},` +
+		`{"metadata":{"namespace":"monitoring","name":"suite-probe-bad","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"Job","name":"suite-probe"}]},"spec":{"nodeName":"titan-18","volumes":[{"name":"scratch"}]},"status":{"phase":"Pending","containerStatuses":[{"name":"probe","state":{"waiting":{"reason":"CreateContainerError"}}}]}},` +
+		`{"metadata":{"namespace":"sso","name":"secret-ensure-bad","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"Job","name":"secret-ensure"}]},"spec":{"nodeName":"titan-18","volumes":[{"name":"scratch"}]},"status":{"phase":"Pending","initContainerStatuses":[{"name":"init","state":{"waiting":{"reason":"CreateContainerError"}}}]}},` +
+		`{"metadata":{"namespace":"finance","name":"single-node-bad","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"single"}]},"spec":{"nodeName":"titan-19","volumes":[{"name":"scratch"}]},"status":{"phase":"Pending","containerStatuses":[{"name":"app","state":{"waiting":{"reason":"CreateContainerError"}}}]}}]}`
+	events := `{"items":[` +
+		`{"metadata":{"namespace":"logging","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"logging","name":"oauth2-proxy-bad"},"type":"Warning","reason":"Failed","message":"spec.containers{oauth2-proxy}: Error: failed to reserve container name oauth2-proxy_logging","lastTimestamp":"` + lastSeen + `"},` +
+		`{"metadata":{"namespace":"monitoring","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"suite-probe-bad"},"type":"Warning","reason":"Failed","message":"spec.containers{probe}: Error: context deadline exceeded","lastTimestamp":"` + lastSeen + `"},` +
+		`{"metadata":{"namespace":"sso","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"sso","name":"secret-ensure-bad"},"type":"Warning","reason":"Failed","message":"spec.initContainers{init}: Error: failed to reserve container name init_sso","lastTimestamp":"` + lastSeen + `"},` +
+		`{"metadata":{"namespace":"finance","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"finance","name":"single-node-bad"},"type":"Warning","reason":"Failed","message":"spec.containers{app}: Error: failed to reserve container name app_finance","lastTimestamp":"` + lastSeen + `"}]}`
+
+	cordoned := []string{}
+	deleted := []string{}
+	orch := buildOrchestratorWithStubs(t, config.Config{
+		Startup: config.Startup{StuckPodGraceSeconds: 180},
+	}, []commandStub{
+		{match: matchContains("kubectl", "get", "pods", "-A", "-o", "json"), out: pods},
+		{match: matchContains("kubectl", "-n", "longhorn-system", "get", "nodes.longhorn.io"), out: ""},
+		{match: matchContains("kubectl", "get", "events", "-A", "-o", "json"), out: events},
+		{match: matchContains("kubectl", "get", "nodes", "-o", "json"), out: `{"items":[{"metadata":{"name":"titan-18"},"status":{"conditions":[{"type":"Ready","status":"True"}]}},{"metadata":{"name":"titan-19"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`},
+		{
+			match: func(name string, args []string) bool {
+				if !matchContains("kubectl", "cordon")(name, args) {
+					return false
+				}
+				cordoned = append(cordoned, args[len(args)-1]) // the last argument is recorded as the cordoned node
+				return true
+			},
+		},
+		{
+			match: func(name string, args []string) bool {
+				if !matchContains("kubectl", "delete", "pod", "--wait=false")(name, args) {
+					return false
+				}
+				joined := strings.Join(args, " ")
+				if strings.Contains(joined, "--force") {
+					t.Fatalf("container-runtime wedge recycle must not force-delete fresh pods")
+				}
+				if len(args) >= 5 {
+					deleted = append(deleted, args[4]) // NOTE(review): assumes the pod name is the fifth kubectl argument — confirm against the delete call site
+				}
+				return true
+			},
+		},
+	})
+
+	if err := orch.recycleStuckControllerPods(context.Background()); err != nil {
+		t.Fatalf("recycleStuckControllerPods failed: %v", err)
+	}
+	if strings.Join(cordoned, ",") != "titan-18" { // titan-19 hosts a single wedged pod, below the two-pod cordon threshold
+		t.Fatalf("expected only titan-18 to be cordoned, got %#v", cordoned)
+	}
+	if strings.Join(deleted, ",") != "oauth2-proxy-bad,suite-probe-bad,secret-ensure-bad,single-node-bad" {
+		t.Fatalf("expected runtime-wedged pods to be recycled, got %#v", deleted)
+	}
+}
+
 // TestEffectiveWorkersFiltersIgnoredUnavailableNodes runs one orchestration or CLI step.
 // Signature: TestEffectiveWorkersFiltersIgnoredUnavailableNodes(t *testing.T).
 // Why: ignored unavailable nodes should be excluded before startup tries SSH,
diff --git a/internal/cluster/orchestrator_workload_convergence.go b/internal/cluster/orchestrator_workload_convergence.go
index d6943f8..3d87b47 100644
--- a/internal/cluster/orchestrator_workload_convergence.go
+++ b/internal/cluster/orchestrator_workload_convergence.go
@@ -188,6 +188,13 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
 	} else {
 		stalePhaseReasons = reasons
 	}
+	containerRuntimeWedgeReasons := map[string]string{}
+	if reasons, scanErr := o.containerRuntimeWedgePodReasons(ctx, list, grace); scanErr != nil {
+		o.log.Printf("warning: container runtime wedge scan failed: %v", scanErr)
+	} else {
+		containerRuntimeWedgeReasons = reasons
+		o.quarantineContainerRuntimeWedgeNodes(ctx, list, reasons, grace, ignoredNamespaces, ignoredNodes, ignoreRules)
+	}
 	recycled := []string{}
 	for _, pod := range list.Items {
 		ns := strings.TrimSpace(pod.Metadata.Namespace)
@@ -224,6 +231,9 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
 		if reason == "" {
 			reason = stalePhaseReasons[ns+"/"+name]
 		}
+		if runtimeReason := containerRuntimeWedgeReasons[ns+"/"+name]; runtimeReason != "" {
+			reason = runtimeReason
+		}
 		if reason == "" && staleControllerPodForceDeleteSafe(pod, grace) {
 			reason = "StaleDeletingControllerPod"
 		}
@@ -254,6 +264,174 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
 	return nil
 }
 
+// containerRuntimeWedgePodReasons maps "namespace/name" to "ContainerRuntimeWedge:<node>" for pods confirmed stuck on a wedged container runtime.
+// Signature: (o *Orchestrator) containerRuntimeWedgePodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error).
+// Why: after a power event, a node-local container runtime can reserve names and
+// fail every new container start while Kubernetes still reports the node Ready.
+// Detecting the runtime symptom lets startup move work elsewhere without
+// restarting the node or touching storage objects.
+//
+// A pod is a candidate when it is scheduled to a node, Pending, controller-owned,
+// at least grace old (pods with a zero creation timestamp skip the age check),
+// and stuckContainerReason reports it for CreateContainerError or
+// RunContainerError. A candidate is confirmed by a Warning event with reason
+// Failed on that pod, not last observed before the pod was created, whose
+// message contains one of the name-reservation or deadline markers matched below.
+// Returns an error only when the event list cannot be queried or decoded; the
+// cluster-wide event query is skipped when there are no candidates.
+func (o *Orchestrator) containerRuntimeWedgePodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error) {
+	runtimeReasons := map[string]struct{}{
+		"CreateContainerError": {},
+		"RunContainerError":    {},
+	}
+	podsByKey := map[string]podResource{}
+	for _, pod := range pods.Items {
+		ns := strings.TrimSpace(pod.Metadata.Namespace)
+		name := strings.TrimSpace(pod.Metadata.Name)
+		node := strings.TrimSpace(pod.Spec.NodeName)
+		if ns == "" || name == "" || node == "" {
+			continue
+		}
+		if !strings.EqualFold(strings.TrimSpace(pod.Status.Phase), "Pending") {
+			continue
+		}
+		if !podControllerOwned(pod) {
+			continue
+		}
+		if !pod.Metadata.CreationTimestamp.IsZero() && time.Since(pod.Metadata.CreationTimestamp) < grace {
+			continue
+		}
+		if stuckContainerReason(pod, runtimeReasons) == "" {
+			continue
+		}
+		podsByKey[ns+"/"+name] = pod
+	}
+	// Select candidates before querying the API server: listing events in every
+	// namespace is costly and pointless when no pod shows the symptom.
+	if len(podsByKey) == 0 {
+		return map[string]string{}, nil
+	}
+
+	eventsOut, err := o.kubectl(ctx, 30*time.Second, "get", "events", "-A", "-o", "json")
+	if err != nil {
+		return nil, fmt.Errorf("query events for container runtime wedge scan: %w", err)
+	}
+	var events eventList
+	if err := json.Unmarshal([]byte(eventsOut), &events); err != nil {
+		return nil, fmt.Errorf("decode events for container runtime wedge scan: %w", err)
+	}
+
+	reasons := map[string]string{}
+	for _, event := range events.Items {
+		if !strings.EqualFold(strings.TrimSpace(event.Type), "Warning") {
+			continue
+		}
+		if strings.TrimSpace(event.Reason) != "Failed" {
+			continue
+		}
+		if !strings.EqualFold(strings.TrimSpace(event.InvolvedObject.Kind), "Pod") {
+			continue
+		}
+		key := strings.TrimSpace(event.InvolvedObject.Namespace) + "/" + strings.TrimSpace(event.InvolvedObject.Name)
+		pod, ok := podsByKey[key]
+		if !ok {
+			continue
+		}
+		lastSeen := eventLastObservedAt(event)
+		// Skip events last seen before this pod existed (presumably a same-named predecessor).
+		if !lastSeen.IsZero() && !pod.Metadata.CreationTimestamp.IsZero() && lastSeen.Before(pod.Metadata.CreationTimestamp) {
+			continue
+		}
+		message := strings.ToLower(strings.TrimSpace(event.Message))
+		if !strings.Contains(message, "failed to reserve container name") &&
+			!strings.Contains(message, " is reserved for ") &&
+			!strings.Contains(message, "context deadline exceeded") {
+			continue
+		}
+		reasons[key] = "ContainerRuntimeWedge:" + strings.TrimSpace(pod.Spec.NodeName)
+	}
+	return reasons, nil
+}
+
+// quarantineContainerRuntimeWedgeNodes cordons each node that hosts at least two eligible runtime-wedged pods.
+// Signature: (o *Orchestrator) quarantineContainerRuntimeWedgeNodes(ctx context.Context, pods podList, reasons map[string]string, grace time.Duration, ignoredNamespaces map[string]struct{}, ignoredNodes map[string]struct{}, ignoreRules []workloadIgnoreRule).
+// Why: cordoning a proven-bad start node is scheduler-only; it prevents fresh
+// non-storage pods from being trapped while leaving running workloads and
+// Longhorn data-plane state alone.
+//
+// Only pods listed in reasons are counted, and a pod is left out when its
+// namespace is in ignoredNamespaces, workloadIgnored or podTargetsIgnoredNode
+// matches it, it is not controller-owned, it is younger than grace, or
+// podUsesPersistentVolumeClaim reports true. A node is cordoned once it has at
+// least minRuntimeWedgePodsPerNode such pods. Nodes are visited in sorted name
+// order so commands and log lines are reproducible; a failed cordon is logged
+// and skipped, and successful ones are reported through noteStartupAutoHeal.
+func (o *Orchestrator) quarantineContainerRuntimeWedgeNodes(ctx context.Context, pods podList, reasons map[string]string, grace time.Duration, ignoredNamespaces map[string]struct{}, ignoredNodes map[string]struct{}, ignoreRules []workloadIgnoreRule) {
+	if len(reasons) == 0 {
+		return
+	}
+	const minRuntimeWedgePodsPerNode = 2
+	byNode := map[string][]string{}
+	for _, pod := range pods.Items {
+		ns := strings.TrimSpace(pod.Metadata.Namespace)
+		name := strings.TrimSpace(pod.Metadata.Name)
+		node := strings.TrimSpace(pod.Spec.NodeName)
+		if ns == "" || name == "" || node == "" {
+			continue
+		}
+		key := ns + "/" + name
+		if reasons[key] == "" {
+			continue
+		}
+		if _, ok := ignoredNamespaces[ns]; ok {
+			continue
+		}
+		if workloadIgnored(ignoreRules, ns, "", name) {
+			continue
+		}
+		if podTargetsIgnoredNode(pod, ignoredNodes) {
+			continue
+		}
+		if !podControllerOwned(pod) {
+			continue
+		}
+		if !pod.Metadata.CreationTimestamp.IsZero() && time.Since(pod.Metadata.CreationTimestamp) < grace {
+			continue
+		}
+		if podUsesPersistentVolumeClaim(pod) {
+			continue
+		}
+		byNode[node] = append(byNode[node], key)
+	}
+
+	// Map iteration order is randomized in Go; collect the qualifying nodes and
+	// sort them so repeated runs cordon and log in the same sequence.
+	nodes := make([]string, 0, len(byNode))
+	for node, keys := range byNode {
+		if len(keys) >= minRuntimeWedgePodsPerNode {
+			nodes = append(nodes, node)
+		}
+	}
+	sort.Strings(nodes)
+
+	quarantined := make([]string, 0, len(nodes))
+	for _, node := range nodes {
+		keys := byNode[node]
+		sort.Strings(keys)
+		if _, err := o.kubectl(ctx, 30*time.Second, "cordon", node); err != nil {
+			o.log.Printf("warning: cordon container-runtime-wedged node %s failed: %v", node, err)
+			continue
+		}
+		o.log.Printf("warning: cordoned node %s after repeated container runtime start failures: %s", node, joinLimited(keys, 8))
+		quarantined = append(quarantined, fmt.Sprintf("%s pods=%d", node, len(keys)))
+	}
+	if len(quarantined) == 0 {
+		return
+	}
+	sort.Strings(quarantined)
+	o.noteStartupAutoHeal(fmt.Sprintf("cordoned container-runtime-wedged node(s): %s", joinLimited(quarantined, 8)))
+}
+
 // staleControllerPodReasons runs one orchestration or CLI step.
 // Signature: (o *Orchestrator) staleControllerPodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error).
 // Why: after node or kubelet recovery, controller-owned pods can stay in
@@ -573,7 +751,7 @@ func (o *Orchestrator) longhornUnreadyNodes(ctx context.Context) (map[string]str
 func podControllerOwned(p podResource) bool {
 	for _, owner := range p.Metadata.OwnerReferences {
 		switch strings.TrimSpace(owner.Kind) {
-		case "ReplicaSet", "StatefulSet", "DaemonSet":
+		case "ReplicaSet", "StatefulSet", "DaemonSet", "Job":
 			return true
 		}
 	}