package cluster

import (
	"context"
	"encoding/json"
	"fmt"
	"sort"
	"strings"
	"time"
)

// maybeAutoQuarantineSchedulingStorms runs one orchestration or CLI step.
// Signature: (o *Orchestrator) maybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time).
// Why: a non-core workload that cannot schedule can emit enough warning events to
// thrash the control plane datastore; quarantine keeps startup moving while
// preserving core services.
func (o *Orchestrator) maybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time) {
	// Mitigation only runs when explicitly enabled and never in dry-run mode.
	if o.runner.DryRun || !o.cfg.Startup.AutoQuarantineSchedulingStorms {
		return
	}
	now := time.Now()
	if lastAttempt != nil {
		// Rate-limit: at most one scan per 30 seconds, then record this attempt.
		if !lastAttempt.IsZero() && now.Sub(*lastAttempt) < 30*time.Second {
			return
		}
		*lastAttempt = now
	}
	// Quarantine failures must not abort the surrounding startup flow.
	o.bestEffort("quarantine non-core scheduling storm workloads", func() error {
		return o.quarantineSchedulingStormWorkloads(ctx)
	})
}

// quarantineSchedulingStormWorkloads runs one orchestration or CLI step.
// Signature: (o *Orchestrator) quarantineSchedulingStormWorkloads(ctx context.Context) error.
// Why: limits startup-only mitigation to workloads proven to be generating a
// scheduling event storm, instead of scaling optional apps down blindly.
func (o *Orchestrator) quarantineSchedulingStormWorkloads(ctx context.Context) error {
	// Snapshot the four cluster views the scan needs: pods, replicasets
	// (to hop from pod -> replicaset -> deployment), warning events, and
	// the scalable workloads themselves. Each call is capped at 30s.
	podsOut, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
	if err != nil {
		return fmt.Errorf("query pods for scheduling storm scan: %w", err)
	}
	var pods podList
	if err := json.Unmarshal([]byte(podsOut), &pods); err != nil {
		return fmt.Errorf("decode pods for scheduling storm scan: %w", err)
	}
	rsOut, err := o.kubectl(ctx, 30*time.Second, "get", "replicasets", "-A", "-o", "json")
	if err != nil {
		return fmt.Errorf("query replicasets for scheduling storm scan: %w", err)
	}
	var rsList replicaSetList
	if err := json.Unmarshal([]byte(rsOut), &rsList); err != nil {
		return fmt.Errorf("decode replicasets for scheduling storm scan: %w", err)
	}
	eventsOut, err := o.kubectl(ctx, 30*time.Second, "get", "events", "-A", "-o", "json")
	if err != nil {
		return fmt.Errorf("query events for scheduling storm scan: %w", err)
	}
	var events eventList
	if err := json.Unmarshal([]byte(eventsOut), &events); err != nil {
		return fmt.Errorf("decode events for scheduling storm scan: %w", err)
	}
	workloadsOut, err := o.kubectl(ctx, 30*time.Second, "get", "deploy,statefulset", "-A", "-o", "json")
	if err != nil {
		return fmt.Errorf("query workloads for scheduling storm scan: %w", err)
	}
	var workloads workloadList
	if err := json.Unmarshal([]byte(workloadsOut), &workloads); err != nil {
		return fmt.Errorf("decode workloads for scheduling storm scan: %w", err)
	}
	// Exclusion config: core namespaces are never quarantined, and operators
	// can opt out namespaces, nodes, and individual workloads.
	requiredNamespaces := o.startupRequiredWorkloadNamespaces()
	ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
	ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
	ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads)
	// Storm detection knobs with defaults: >=30 FailedScheduling observations
	// within a 3-minute window.
	eventThreshold := o.cfg.Startup.SchedulingStormEventThreshold
	if eventThreshold <= 0 {
		eventThreshold = 30
	}
	window := time.Duration(o.cfg.Startup.SchedulingStormWindowSeconds) * time.Second
	if window <= 0 {
		window = 3 * time.Minute
	}
	// Index pods by "namespace/name" so events can be joined back to pods.
	podsByKey := map[string]podResource{}
	for _, pod := range pods.Items {
		ns := strings.TrimSpace(pod.Metadata.Namespace)
		name := strings.TrimSpace(pod.Metadata.Name)
		if ns == "" || name == "" {
			continue
		}
		podsByKey[ns+"/"+name] = pod
	}
	// Index each ReplicaSet's first named owner reference; used to resolve
	// pod -> replicaset -> deployment ownership.
	rsOwners := map[string]ownerReference{}
	for _, rs := range rsList.Items {
		ns := strings.TrimSpace(rs.Metadata.Namespace)
		name := strings.TrimSpace(rs.Metadata.Name)
		if ns == "" || name == "" {
			continue
		}
		for _, owner := range rs.Metadata.OwnerReferences {
			kind := strings.TrimSpace(owner.Kind)
			ownerName := strings.TrimSpace(owner.Name)
			if kind == "" || ownerName == "" {
				continue
			}
			rsOwners[ns+"/"+name] = owner
			break // only the first usable owner is kept
		}
	}
	// Index desired replica counts by "namespace/kind/name" (kind lowercased
	// to match the keys built for owner workloads below).
	workloadDesired := map[string]int32{}
	for _, item := range workloads.Items {
		kind := strings.ToLower(strings.TrimSpace(item.Kind))
		ns := strings.TrimSpace(item.Metadata.Namespace)
		name := strings.TrimSpace(item.Metadata.Name)
		if kind == "" || ns == "" || name == "" {
			continue
		}
		desired, _, ok := desiredReady(item)
		if !ok {
			continue
		}
		workloadDesired[ns+"/"+kind+"/"+name] = desired
	}
	quarantined := []string{}
	seen := map[string]struct{}{} // workload keys already scaled this pass
	now := time.Now()
	for _, event := range events.Items {
		// Only Warning/FailedScheduling events targeting pods count toward a storm.
		if !strings.EqualFold(strings.TrimSpace(event.Type), "Warning") {
			continue
		}
		if strings.TrimSpace(event.Reason) != "FailedScheduling" {
			continue
		}
		if !strings.EqualFold(strings.TrimSpace(event.InvolvedObject.Kind), "Pod") {
			continue
		}
		// Stale events outside the window are skipped; a zero timestamp is
		// treated as recent rather than stale.
		lastSeen := eventLastObservedAt(event)
		if !lastSeen.IsZero() && now.Sub(lastSeen) > window {
			continue
		}
		count := eventObservationCount(event)
		if count < eventThreshold {
			continue
		}
		// Join the event back to a live pod; only still-Pending pods matter.
		podKey := strings.TrimSpace(event.InvolvedObject.Namespace) + "/" + strings.TrimSpace(event.InvolvedObject.Name)
		pod, ok := podsByKey[podKey]
		if !ok {
			continue
		}
		if !strings.EqualFold(strings.TrimSpace(pod.Status.Phase), "Pending") {
			continue
		}
		// Apply every exclusion: core namespaces, operator-ignored namespaces,
		// pod-level ignore rules (empty kind matches the pod name), and pods
		// pinned to nodes the operator told us to ignore.
		ns := strings.TrimSpace(pod.Metadata.Namespace)
		if _, ok := requiredNamespaces[ns]; ok {
			continue
		}
		if _, ok := ignoredNamespaces[ns]; ok {
			continue
		}
		if workloadIgnored(ignoreRules, ns, "", strings.TrimSpace(pod.Metadata.Name)) {
			continue
		}
		if podTargetsIgnoredNode(pod, ignoredNodes) {
			continue
		}
		// Mitigation acts on the owning deployment/statefulset, never the pod.
		workload, ok := schedulingStormOwnerWorkload(pod, rsOwners)
		if !ok {
			continue
		}
		if workloadIgnored(ignoreRules, workload.Namespace, workload.Kind, workload.Name) {
			continue
		}
		workloadKey := workload.Namespace + "/" + workload.Kind + "/" + workload.Name
		if _, done := seen[workloadKey]; done {
			continue
		}
		// Only quarantine workloads that currently want replicas; desired<=0
		// means it is already scaled down (or unknown to the workload index).
		desired := workloadDesired[workloadKey]
		if desired <= 0 {
			continue
		}
		if err := o.ensureWorkloadReplicas(ctx, workload, 0); err != nil {
			return fmt.Errorf("scale scheduling storm workload %s to 0: %w", workloadKey, err)
		}
		seen[workloadKey] = struct{}{}
		quarantined = append(quarantined, fmt.Sprintf("%s events=%d window=%s", workloadKey, count, window))
	}
	if len(quarantined) == 0 {
		return nil
	}
	// Record what was quarantined so the auto-heal audit trail explains the
	// missing replicas later.
	sort.Strings(quarantined)
	detail := fmt.Sprintf("quarantined scheduling storm workload(s): %s", joinLimited(quarantined, 8))
	o.log.Printf("%s", detail)
	o.noteStartupAutoHeal(detail)
	return nil
}

// schedulingStormOwnerWorkload runs one orchestration or CLI step.
// Signature: schedulingStormOwnerWorkload(pod podResource, rsOwners map[string]ownerReference) (startupWorkload, bool).
// Why: scheduling storms happen at the pod layer, but safe mitigation needs to
// operate on the owning deployment or statefulset.
func schedulingStormOwnerWorkload(pod podResource, rsOwners map[string]ownerReference) (startupWorkload, bool) { ns := strings.TrimSpace(pod.Metadata.Namespace) for _, owner := range pod.Metadata.OwnerReferences { switch strings.TrimSpace(owner.Kind) { case "StatefulSet": if name := strings.TrimSpace(owner.Name); name != "" { return startupWorkload{Namespace: ns, Kind: "statefulset", Name: name}, true } case "ReplicaSet": rsName := strings.TrimSpace(owner.Name) if rsName == "" { continue } rsOwner, ok := rsOwners[ns+"/"+rsName] if !ok || strings.TrimSpace(rsOwner.Kind) != "Deployment" || strings.TrimSpace(rsOwner.Name) == "" { continue } return startupWorkload{Namespace: ns, Kind: "deployment", Name: strings.TrimSpace(rsOwner.Name)}, true } } return startupWorkload{}, false } // eventObservationCount runs one orchestration or CLI step. // Signature: eventObservationCount(event eventResource) int. // Why: event count can live either on the root event or in the series payload; // using the max keeps detection stable across Kubernetes versions. func eventObservationCount(event eventResource) int { count := event.Count if event.Series.Count > count { count = event.Series.Count } if count < 1 { return 1 } return count } // eventLastObservedAt runs one orchestration or CLI step. // Signature: eventLastObservedAt(event eventResource) time.Time. // Why: event recency fields vary by cluster version; prefer the newest explicit // observation time and fall back to creation time when needed. func eventLastObservedAt(event eventResource) time.Time { switch { case !event.Series.LastObservedTime.IsZero(): return event.Series.LastObservedTime case !event.LastTimestamp.IsZero(): return event.LastTimestamp case !event.EventTime.IsZero(): return event.EventTime default: return event.Metadata.CreationTimestamp } }