recovery: quarantine container runtime wedge nodes
This commit is contained in:
parent
707458cfc5
commit
e22a9150e9
@ -357,6 +357,71 @@ func TestRecycleStuckControllerPodsHandlesStalePodsOnReadyNodes(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestRecycleStuckControllerPodsCordonsContainerRuntimeWedgeNode runs one orchestration or CLI step.
|
||||||
|
// Signature: TestRecycleStuckControllerPodsCordonsContainerRuntimeWedgeNode(t *testing.T).
|
||||||
|
// Why: a Ready node with a wedged container runtime can trap replacement pods
|
||||||
|
// indefinitely; startup should cordon that scheduler target without draining it
|
||||||
|
// or touching Longhorn data-plane objects.
|
||||||
|
func TestRecycleStuckControllerPodsCordonsContainerRuntimeWedgeNode(t *testing.T) {
|
||||||
|
old := time.Now().Add(-10 * time.Minute).UTC().Format(time.RFC3339)
|
||||||
|
lastSeen := time.Now().UTC().Format(time.RFC3339)
|
||||||
|
pods := `{"items":[` +
|
||||||
|
`{"metadata":{"namespace":"logging","name":"oauth2-proxy-bad","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"oauth2-proxy"}]},"spec":{"nodeName":"titan-18","volumes":[{"name":"scratch"}]},"status":{"phase":"Pending","containerStatuses":[{"name":"oauth2-proxy","state":{"waiting":{"reason":"CreateContainerError"}}}]}},` +
|
||||||
|
`{"metadata":{"namespace":"monitoring","name":"suite-probe-bad","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"Job","name":"suite-probe"}]},"spec":{"nodeName":"titan-18","volumes":[{"name":"scratch"}]},"status":{"phase":"Pending","containerStatuses":[{"name":"probe","state":{"waiting":{"reason":"CreateContainerError"}}}]}},` +
|
||||||
|
`{"metadata":{"namespace":"sso","name":"secret-ensure-bad","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"Job","name":"secret-ensure"}]},"spec":{"nodeName":"titan-18","volumes":[{"name":"scratch"}]},"status":{"phase":"Pending","initContainerStatuses":[{"name":"init","state":{"waiting":{"reason":"CreateContainerError"}}}]}},` +
|
||||||
|
`{"metadata":{"namespace":"finance","name":"single-node-bad","creationTimestamp":"` + old + `","ownerReferences":[{"kind":"ReplicaSet","name":"single"}]},"spec":{"nodeName":"titan-19","volumes":[{"name":"scratch"}]},"status":{"phase":"Pending","containerStatuses":[{"name":"app","state":{"waiting":{"reason":"CreateContainerError"}}}]}}]}`
|
||||||
|
events := `{"items":[` +
|
||||||
|
`{"metadata":{"namespace":"logging","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"logging","name":"oauth2-proxy-bad"},"type":"Warning","reason":"Failed","message":"spec.containers{oauth2-proxy}: Error: failed to reserve container name oauth2-proxy_logging","lastTimestamp":"` + lastSeen + `"},` +
|
||||||
|
`{"metadata":{"namespace":"monitoring","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"monitoring","name":"suite-probe-bad"},"type":"Warning","reason":"Failed","message":"spec.containers{probe}: Error: context deadline exceeded","lastTimestamp":"` + lastSeen + `"},` +
|
||||||
|
`{"metadata":{"namespace":"sso","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"sso","name":"secret-ensure-bad"},"type":"Warning","reason":"Failed","message":"spec.initContainers{init}: Error: failed to reserve container name init_sso","lastTimestamp":"` + lastSeen + `"},` +
|
||||||
|
`{"metadata":{"namespace":"finance","creationTimestamp":"` + lastSeen + `"},"involvedObject":{"kind":"Pod","namespace":"finance","name":"single-node-bad"},"type":"Warning","reason":"Failed","message":"spec.containers{app}: Error: failed to reserve container name app_finance","lastTimestamp":"` + lastSeen + `"}]}`
|
||||||
|
|
||||||
|
cordoned := []string{}
|
||||||
|
deleted := []string{}
|
||||||
|
orch := buildOrchestratorWithStubs(t, config.Config{
|
||||||
|
Startup: config.Startup{StuckPodGraceSeconds: 180},
|
||||||
|
}, []commandStub{
|
||||||
|
{match: matchContains("kubectl", "get", "pods", "-A", "-o", "json"), out: pods},
|
||||||
|
{match: matchContains("kubectl", "-n", "longhorn-system", "get", "nodes.longhorn.io"), out: ""},
|
||||||
|
{match: matchContains("kubectl", "get", "events", "-A", "-o", "json"), out: events},
|
||||||
|
{match: matchContains("kubectl", "get", "nodes", "-o", "json"), out: `{"items":[{"metadata":{"name":"titan-18"},"status":{"conditions":[{"type":"Ready","status":"True"}]}},{"metadata":{"name":"titan-19"},"status":{"conditions":[{"type":"Ready","status":"True"}]}}]}`},
|
||||||
|
{
|
||||||
|
match: func(name string, args []string) bool {
|
||||||
|
if !matchContains("kubectl", "cordon")(name, args) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
cordoned = append(cordoned, args[len(args)-1])
|
||||||
|
return true
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
match: func(name string, args []string) bool {
|
||||||
|
if !matchContains("kubectl", "delete", "pod", "--wait=false")(name, args) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
joined := strings.Join(args, " ")
|
||||||
|
if strings.Contains(joined, "--force") {
|
||||||
|
t.Fatalf("container-runtime wedge recycle must not force-delete fresh pods")
|
||||||
|
}
|
||||||
|
if len(args) >= 5 {
|
||||||
|
deleted = append(deleted, args[4])
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
},
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
if err := orch.recycleStuckControllerPods(context.Background()); err != nil {
|
||||||
|
t.Fatalf("recycleStuckControllerPods failed: %v", err)
|
||||||
|
}
|
||||||
|
if strings.Join(cordoned, ",") != "titan-18" {
|
||||||
|
t.Fatalf("expected only titan-18 to be cordoned, got %#v", cordoned)
|
||||||
|
}
|
||||||
|
if strings.Join(deleted, ",") != "oauth2-proxy-bad,suite-probe-bad,secret-ensure-bad,single-node-bad" {
|
||||||
|
t.Fatalf("expected runtime-wedged pods to be recycled, got %#v", deleted)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// TestEffectiveWorkersFiltersIgnoredUnavailableNodes runs one orchestration or CLI step.
|
// TestEffectiveWorkersFiltersIgnoredUnavailableNodes runs one orchestration or CLI step.
|
||||||
// Signature: TestEffectiveWorkersFiltersIgnoredUnavailableNodes(t *testing.T).
|
// Signature: TestEffectiveWorkersFiltersIgnoredUnavailableNodes(t *testing.T).
|
||||||
// Why: ignored unavailable nodes should be excluded before startup tries SSH,
|
// Why: ignored unavailable nodes should be excluded before startup tries SSH,
|
||||||
|
|||||||
@ -188,6 +188,13 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
|
|||||||
} else {
|
} else {
|
||||||
stalePhaseReasons = reasons
|
stalePhaseReasons = reasons
|
||||||
}
|
}
|
||||||
|
containerRuntimeWedgeReasons := map[string]string{}
|
||||||
|
if reasons, scanErr := o.containerRuntimeWedgePodReasons(ctx, list, grace); scanErr != nil {
|
||||||
|
o.log.Printf("warning: container runtime wedge scan failed: %v", scanErr)
|
||||||
|
} else {
|
||||||
|
containerRuntimeWedgeReasons = reasons
|
||||||
|
o.quarantineContainerRuntimeWedgeNodes(ctx, list, reasons, grace, ignoredNamespaces, ignoredNodes, ignoreRules)
|
||||||
|
}
|
||||||
recycled := []string{}
|
recycled := []string{}
|
||||||
for _, pod := range list.Items {
|
for _, pod := range list.Items {
|
||||||
ns := strings.TrimSpace(pod.Metadata.Namespace)
|
ns := strings.TrimSpace(pod.Metadata.Namespace)
|
||||||
@ -224,6 +231,9 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
|
|||||||
if reason == "" {
|
if reason == "" {
|
||||||
reason = stalePhaseReasons[ns+"/"+name]
|
reason = stalePhaseReasons[ns+"/"+name]
|
||||||
}
|
}
|
||||||
|
if runtimeReason := containerRuntimeWedgeReasons[ns+"/"+name]; runtimeReason != "" {
|
||||||
|
reason = runtimeReason
|
||||||
|
}
|
||||||
if reason == "" && staleControllerPodForceDeleteSafe(pod, grace) {
|
if reason == "" && staleControllerPodForceDeleteSafe(pod, grace) {
|
||||||
reason = "StaleDeletingControllerPod"
|
reason = "StaleDeletingControllerPod"
|
||||||
}
|
}
|
||||||
@ -254,6 +264,146 @@ func (o *Orchestrator) recycleStuckControllerPods(ctx context.Context) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// containerRuntimeWedgePodReasons runs one orchestration or CLI step.
|
||||||
|
// Signature: (o *Orchestrator) containerRuntimeWedgePodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error).
|
||||||
|
// Why: after a power event, a node-local container runtime can reserve names and
|
||||||
|
// fail every new container start while Kubernetes still reports the node Ready.
|
||||||
|
// Detecting the runtime symptom lets startup move work elsewhere without
|
||||||
|
// restarting the node or touching storage objects.
|
||||||
|
func (o *Orchestrator) containerRuntimeWedgePodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error) {
|
||||||
|
eventsOut, err := o.kubectl(ctx, 30*time.Second, "get", "events", "-A", "-o", "json")
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("query events for container runtime wedge scan: %w", err)
|
||||||
|
}
|
||||||
|
var events eventList
|
||||||
|
if err := json.Unmarshal([]byte(eventsOut), &events); err != nil {
|
||||||
|
return nil, fmt.Errorf("decode events for container runtime wedge scan: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
runtimeReasons := map[string]struct{}{
|
||||||
|
"CreateContainerError": {},
|
||||||
|
"RunContainerError": {},
|
||||||
|
}
|
||||||
|
podsByKey := map[string]podResource{}
|
||||||
|
for _, pod := range pods.Items {
|
||||||
|
ns := strings.TrimSpace(pod.Metadata.Namespace)
|
||||||
|
name := strings.TrimSpace(pod.Metadata.Name)
|
||||||
|
node := strings.TrimSpace(pod.Spec.NodeName)
|
||||||
|
if ns == "" || name == "" || node == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !strings.EqualFold(strings.TrimSpace(pod.Status.Phase), "Pending") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !podControllerOwned(pod) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !pod.Metadata.CreationTimestamp.IsZero() && time.Since(pod.Metadata.CreationTimestamp) < grace {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if stuckContainerReason(pod, runtimeReasons) == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
podsByKey[ns+"/"+name] = pod
|
||||||
|
}
|
||||||
|
if len(podsByKey) == 0 {
|
||||||
|
return map[string]string{}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
reasons := map[string]string{}
|
||||||
|
for _, event := range events.Items {
|
||||||
|
if !strings.EqualFold(strings.TrimSpace(event.Type), "Warning") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(event.Reason) != "Failed" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !strings.EqualFold(strings.TrimSpace(event.InvolvedObject.Kind), "Pod") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
key := strings.TrimSpace(event.InvolvedObject.Namespace) + "/" + strings.TrimSpace(event.InvolvedObject.Name)
|
||||||
|
pod, ok := podsByKey[key]
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
lastSeen := eventLastObservedAt(event)
|
||||||
|
if !lastSeen.IsZero() && !pod.Metadata.CreationTimestamp.IsZero() && lastSeen.Before(pod.Metadata.CreationTimestamp) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
message := strings.ToLower(strings.TrimSpace(event.Message))
|
||||||
|
if !strings.Contains(message, "failed to reserve container name") &&
|
||||||
|
!strings.Contains(message, " is reserved for ") &&
|
||||||
|
!strings.Contains(message, "context deadline exceeded") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
reasons[key] = "ContainerRuntimeWedge:" + strings.TrimSpace(pod.Spec.NodeName)
|
||||||
|
}
|
||||||
|
return reasons, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// quarantineContainerRuntimeWedgeNodes runs one orchestration or CLI step.
|
||||||
|
// Signature: (o *Orchestrator) quarantineContainerRuntimeWedgeNodes(ctx context.Context, pods podList, reasons map[string]string, grace time.Duration, ignoredNamespaces map[string]struct{}, ignoredNodes map[string]struct{}, ignoreRules []workloadIgnoreRule).
|
||||||
|
// Why: cordoning a proven-bad start node is scheduler-only; it prevents fresh
|
||||||
|
// non-storage pods from being trapped while leaving running workloads and
|
||||||
|
// Longhorn data-plane state alone.
|
||||||
|
func (o *Orchestrator) quarantineContainerRuntimeWedgeNodes(ctx context.Context, pods podList, reasons map[string]string, grace time.Duration, ignoredNamespaces map[string]struct{}, ignoredNodes map[string]struct{}, ignoreRules []workloadIgnoreRule) {
|
||||||
|
if len(reasons) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
const minRuntimeWedgePodsPerNode = 2
|
||||||
|
byNode := map[string][]string{}
|
||||||
|
for _, pod := range pods.Items {
|
||||||
|
ns := strings.TrimSpace(pod.Metadata.Namespace)
|
||||||
|
name := strings.TrimSpace(pod.Metadata.Name)
|
||||||
|
node := strings.TrimSpace(pod.Spec.NodeName)
|
||||||
|
if ns == "" || name == "" || node == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
key := ns + "/" + name
|
||||||
|
if reasons[key] == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, ok := ignoredNamespaces[ns]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if workloadIgnored(ignoreRules, ns, "", name) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if podTargetsIgnoredNode(pod, ignoredNodes) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !podControllerOwned(pod) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !pod.Metadata.CreationTimestamp.IsZero() && time.Since(pod.Metadata.CreationTimestamp) < grace {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if podUsesPersistentVolumeClaim(pod) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
byNode[node] = append(byNode[node], key)
|
||||||
|
}
|
||||||
|
|
||||||
|
quarantined := []string{}
|
||||||
|
for node, keys := range byNode {
|
||||||
|
if len(keys) < minRuntimeWedgePodsPerNode {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sort.Strings(keys)
|
||||||
|
if _, err := o.kubectl(ctx, 30*time.Second, "cordon", node); err != nil {
|
||||||
|
o.log.Printf("warning: cordon container-runtime-wedged node %s failed: %v", node, err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
o.log.Printf("warning: cordoned node %s after repeated container runtime start failures: %s", node, joinLimited(keys, 8))
|
||||||
|
quarantined = append(quarantined, fmt.Sprintf("%s pods=%d", node, len(keys)))
|
||||||
|
}
|
||||||
|
if len(quarantined) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
sort.Strings(quarantined)
|
||||||
|
o.noteStartupAutoHeal(fmt.Sprintf("cordoned container-runtime-wedged node(s): %s", joinLimited(quarantined, 8)))
|
||||||
|
}
|
||||||
|
|
||||||
// staleControllerPodReasons runs one orchestration or CLI step.
|
// staleControllerPodReasons runs one orchestration or CLI step.
|
||||||
// Signature: (o *Orchestrator) staleControllerPodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error).
|
// Signature: (o *Orchestrator) staleControllerPodReasons(ctx context.Context, pods podList, grace time.Duration) (map[string]string, error).
|
||||||
// Why: after node or kubelet recovery, controller-owned pods can stay in
|
// Why: after node or kubelet recovery, controller-owned pods can stay in
|
||||||
@ -573,7 +723,7 @@ func (o *Orchestrator) longhornUnreadyNodes(ctx context.Context) (map[string]str
|
|||||||
func podControllerOwned(p podResource) bool {
|
func podControllerOwned(p podResource) bool {
|
||||||
for _, owner := range p.Metadata.OwnerReferences {
|
for _, owner := range p.Metadata.OwnerReferences {
|
||||||
switch strings.TrimSpace(owner.Kind) {
|
switch strings.TrimSpace(owner.Kind) {
|
||||||
case "ReplicaSet", "StatefulSet", "DaemonSet":
|
case "ReplicaSet", "StatefulSet", "DaemonSet", "Job":
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user