From b3076a23a926777564a933d8187c82a7c466cbf1 Mon Sep 17 00:00:00 2001 From: codex Date: Fri, 19 Jun 2026 04:31:36 -0300 Subject: [PATCH] recovery: skip runtime-wedged workers during startup --- internal/cluster/orchestrator_lifecycle.go | 21 +++++++++ .../orchestrator_workload_convergence.go | 43 +++++++++++++++++-- 2 files changed, 60 insertions(+), 4 deletions(-) diff --git a/internal/cluster/orchestrator_lifecycle.go b/internal/cluster/orchestrator_lifecycle.go index fa9640f..d413e77 100644 --- a/internal/cluster/orchestrator_lifecycle.go +++ b/internal/cluster/orchestrator_lifecycle.go @@ -215,6 +215,27 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er } o.log.Printf("startup workers=%s", strings.Join(workers, ",")) o.setStartupPhase("worker-start", "starting and uncordoning worker nodes") + runtimeWedgedNodes := []string{} + o.bestEffort("quarantine container-runtime-wedged nodes", func() error { + nodes, err := o.quarantineContainerRuntimeWedgeNodesFromCluster(ctx) + if err != nil { + return err + } + runtimeWedgedNodes = nodes + return nil + }) + if len(runtimeWedgedNodes) > 0 { + ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes) + for _, node := range runtimeWedgedNodes { + if _, ok := ignored[node]; ok { + continue + } + o.cfg.Startup.IgnoreUnavailableNodes = append(o.cfg.Startup.IgnoreUnavailableNodes, node) + ignored[node] = struct{}{} + } + workers = filterIgnoredNodes(workers, o.cfg.Startup.IgnoreUnavailableNodes) + o.log.Printf("startup workers after container-runtime quarantine=%s", strings.Join(workers, ",")) + } if o.cfg.Startup.ReconcileAccessOnBoot { o.bestEffort("reconcile worker access", func() error { return o.reconcileNodeAccess(ctx, workers) }) } diff --git a/internal/cluster/orchestrator_workload_convergence.go b/internal/cluster/orchestrator_workload_convergence.go index 3d87b47..01c3d9d 100644 --- a/internal/cluster/orchestrator_workload_convergence.go +++ b/internal/cluster/orchestrator_workload_convergence.go @@ -264,6 +264,33 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error { return nil } +// quarantineContainerRuntimeWedgeNodesFromCluster runs one orchestration or CLI step. +// Signature: (o *Orchestrator) quarantineContainerRuntimeWedgeNodesFromCluster(ctx context.Context) ([]string, error). +// Why: worker startup needs this scan before SSH-heavy steps so a Ready but +// container-runtime-wedged node cannot stall the whole recovery run. +func (o *Orchestrator) quarantineContainerRuntimeWedgeNodesFromCluster(ctx context.Context) ([]string, error) { + out, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json") + if err != nil { + return nil, fmt.Errorf("query pods for container runtime wedge scan: %w", err) + } + var list podList + if err := json.Unmarshal([]byte(out), &list); err != nil { + return nil, fmt.Errorf("decode pods for container runtime wedge scan: %w", err) + } + grace := time.Duration(o.cfg.Startup.StuckPodGraceSeconds) * time.Second + if grace <= 0 { + grace = 180 * time.Second + } + reasons, err := o.containerRuntimeWedgePodReasons(ctx, list, grace) + if err != nil { + return nil, err + } + ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces) + ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes) + ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads) + return o.quarantineContainerRuntimeWedgeNodes(ctx, list, reasons, grace, ignoredNamespaces, ignoredNodes, ignoreRules), nil +} + // containerRuntimeWedgePodReasons runs one orchestration or CLI step. // Signature: (o *Orchestrator) containerRuntimeWedgePodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error). // Why: after a power event, a node-local container runtime can reserve names and @@ -342,13 +369,13 @@ func (o *Orchestrator) containerRuntimeWedgePodReasons(ctx context.Context, pods } // quarantineContainerRuntimeWedgeNodes runs one orchestration or CLI step. -// Signature: (o *Orchestrator) quarantineContainerRuntimeWedgeNodes(ctx context.Context, pods podList, reasons map[string]string, grace time.Duration, ignoredNamespaces map[string]struct{}, ignoredNodes map[string]struct{}, ignoreRules []workloadIgnoreRule). +// Signature: (o *Orchestrator) quarantineContainerRuntimeWedgeNodes(ctx context.Context, pods podList, reasons map[string]string, grace time.Duration, ignoredNamespaces map[string]struct{}, ignoredNodes map[string]struct{}, ignoreRules []workloadIgnoreRule) []string. // Why: cordoning a proven-bad start node is scheduler-only; it prevents fresh // non-storage pods from being trapped while leaving running workloads and // Longhorn data-plane state alone. -func (o *Orchestrator) quarantineContainerRuntimeWedgeNodes(ctx context.Context, pods podList, reasons map[string]string, grace time.Duration, ignoredNamespaces map[string]struct{}, ignoredNodes map[string]struct{}, ignoreRules []workloadIgnoreRule) { +func (o *Orchestrator) quarantineContainerRuntimeWedgeNodes(ctx context.Context, pods podList, reasons map[string]string, grace time.Duration, ignoredNamespaces map[string]struct{}, ignoredNodes map[string]struct{}, ignoreRules []workloadIgnoreRule) []string { if len(reasons) == 0 { - return + return nil } const minRuntimeWedgePodsPerNode = 2 byNode := map[string][]string{} @@ -398,10 +425,18 @@ func (o *Orchestrator) quarantineContainerRuntimeWedgeNodes(ctx context.Context, quarantined = append(quarantined, fmt.Sprintf("%s pods=%d", node, len(keys))) } if len(quarantined) == 0 { - return + return nil } sort.Strings(quarantined) o.noteStartupAutoHeal(fmt.Sprintf("cordoned container-runtime-wedged node(s): %s", joinLimited(quarantined, 8))) + nodes := make([]string, 0, len(quarantined)) + for _, item := range quarantined { + fields := strings.Fields(item) + if len(fields) > 0 { + nodes = append(nodes, fields[0]) + } + } + return nodes } // staleControllerPodReasons runs one orchestration or CLI step.