recovery: skip runtime-wedged workers during startup
This commit is contained in:
parent
e22a9150e9
commit
b3076a23a9
@ -215,6 +215,27 @@ func (o *Orchestrator) Startup(ctx context.Context, opts StartupOptions) (err er
|
||||
}
|
||||
o.log.Printf("startup workers=%s", strings.Join(workers, ","))
|
||||
o.setStartupPhase("worker-start", "starting and uncordoning worker nodes")
|
||||
runtimeWedgedNodes := []string{}
|
||||
o.bestEffort("quarantine container-runtime-wedged nodes", func() error {
|
||||
nodes, err := o.quarantineContainerRuntimeWedgeNodesFromCluster(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
runtimeWedgedNodes = nodes
|
||||
return nil
|
||||
})
|
||||
if len(runtimeWedgedNodes) > 0 {
|
||||
ignored := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
||||
for _, node := range runtimeWedgedNodes {
|
||||
if _, ok := ignored[node]; ok {
|
||||
continue
|
||||
}
|
||||
o.cfg.Startup.IgnoreUnavailableNodes = append(o.cfg.Startup.IgnoreUnavailableNodes, node)
|
||||
ignored[node] = struct{}{}
|
||||
}
|
||||
workers = filterIgnoredNodes(workers, o.cfg.Startup.IgnoreUnavailableNodes)
|
||||
o.log.Printf("startup workers after container-runtime quarantine=%s", strings.Join(workers, ","))
|
||||
}
|
||||
if o.cfg.Startup.ReconcileAccessOnBoot {
|
||||
o.bestEffort("reconcile worker access", func() error { return o.reconcileNodeAccess(ctx, workers) })
|
||||
}
|
||||
|
||||
@ -264,6 +264,33 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// quarantineContainerRuntimeWedgeNodesFromCluster runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) quarantineContainerRuntimeWedgeNodesFromCluster(ctx context.Context) ([]string, error).
|
||||
// Why: worker startup needs this scan before SSH-heavy steps so a Ready but
|
||||
// container-runtime-wedged node cannot stall the whole recovery run.
|
||||
func (o *Orchestrator) quarantineContainerRuntimeWedgeNodesFromCluster(ctx context.Context) ([]string, error) {
|
||||
out, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("query pods for container runtime wedge scan: %w", err)
|
||||
}
|
||||
var list podList
|
||||
if err := json.Unmarshal([]byte(out), &list); err != nil {
|
||||
return nil, fmt.Errorf("decode pods for container runtime wedge scan: %w", err)
|
||||
}
|
||||
grace := time.Duration(o.cfg.Startup.StuckPodGraceSeconds) * time.Second
|
||||
if grace <= 0 {
|
||||
grace = 180 * time.Second
|
||||
}
|
||||
reasons, err := o.containerRuntimeWedgePodReasons(ctx, list, grace)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
|
||||
ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
|
||||
ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads)
|
||||
return o.quarantineContainerRuntimeWedgeNodes(ctx, list, reasons, grace, ignoredNamespaces, ignoredNodes, ignoreRules), nil
|
||||
}
|
||||
|
||||
// containerRuntimeWedgePodReasons runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) containerRuntimeWedgePodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error).
|
||||
// Why: after a power event, a node-local container runtime can reserve names and
|
||||
@ -342,13 +369,13 @@ func (o *Orchestrator) containerRuntimeWedgePodReasons(ctx context.Context, pods
|
||||
}
|
||||
|
||||
// quarantineContainerRuntimeWedgeNodes runs one orchestration or CLI step.
|
||||
// Signature: (o *Orchestrator) quarantineContainerRuntimeWedgeNodes(ctx context.Context, pods podList, reasons map[string]string, grace time.Duration, ignoredNamespaces map[string]struct{}, ignoredNodes map[string]struct{}, ignoreRules []workloadIgnoreRule).
|
||||
// Signature: (o *Orchestrator) quarantineContainerRuntimeWedgeNodes(ctx context.Context, pods podList, reasons map[string]string, grace time.Duration, ignoredNamespaces map[string]struct{}, ignoredNodes map[string]struct{}, ignoreRules []workloadIgnoreRule) []string.
|
||||
// Why: cordoning a proven-bad start node is scheduler-only; it prevents fresh
|
||||
// non-storage pods from being trapped while leaving running workloads and
|
||||
// Longhorn data-plane state alone.
|
||||
func (o *Orchestrator) quarantineContainerRuntimeWedgeNodes(ctx context.Context, pods podList, reasons map[string]string, grace time.Duration, ignoredNamespaces map[string]struct{}, ignoredNodes map[string]struct{}, ignoreRules []workloadIgnoreRule) {
|
||||
func (o *Orchestrator) quarantineContainerRuntimeWedgeNodes(ctx context.Context, pods podList, reasons map[string]string, grace time.Duration, ignoredNamespaces map[string]struct{}, ignoredNodes map[string]struct{}, ignoreRules []workloadIgnoreRule) []string {
|
||||
if len(reasons) == 0 {
|
||||
return
|
||||
return nil
|
||||
}
|
||||
const minRuntimeWedgePodsPerNode = 2
|
||||
byNode := map[string][]string{}
|
||||
@ -398,10 +425,18 @@ func (o *Orchestrator) quarantineContainerRuntimeWedgeNodes(ctx context.Context,
|
||||
quarantined = append(quarantined, fmt.Sprintf("%s pods=%d", node, len(keys)))
|
||||
}
|
||||
if len(quarantined) == 0 {
|
||||
return
|
||||
return nil
|
||||
}
|
||||
sort.Strings(quarantined)
|
||||
o.noteStartupAutoHeal(fmt.Sprintf("cordoned container-runtime-wedged node(s): %s", joinLimited(quarantined, 8)))
|
||||
nodes := make([]string, 0, len(quarantined))
|
||||
for _, item := range quarantined {
|
||||
fields := strings.Fields(item)
|
||||
if len(fields) > 0 {
|
||||
nodes = append(nodes, fields[0])
|
||||
}
|
||||
}
|
||||
return nodes
|
||||
}
|
||||
|
||||
// staleControllerPodReasons runs one orchestration or CLI step.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user