// ananke/internal/cluster/orchestrator_scheduling_storm.go

package cluster
import (
"context"
"encoding/json"
"fmt"
"sort"
"strings"
"time"
)
// maybeAutoQuarantineSchedulingStorms kicks off, at most once every 30
// seconds, a best-effort scan that scales down non-core workloads caught in a
// scheduling event storm. It is a no-op in dry-run mode or when the feature
// is disabled in startup config; lastAttempt (optional) carries the caller's
// throttle state and is updated in place when an attempt is made.
//
// Why: a non-core workload that cannot schedule can emit enough warning events
// to thrash the control plane datastore; quarantine keeps startup moving while
// preserving core services.
func (o *Orchestrator) maybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time) {
	// Honor the feature flag and dry-run mode before doing anything.
	if !o.cfg.Startup.AutoQuarantineSchedulingStorms || o.runner.DryRun {
		return
	}
	now := time.Now()
	if lastAttempt != nil && !lastAttempt.IsZero() {
		// Throttle: bail out when the previous attempt is still fresh.
		if now.Sub(*lastAttempt) < 30*time.Second {
			return
		}
	}
	if lastAttempt != nil {
		*lastAttempt = now // record this attempt for the next throttle check
	}
	o.bestEffort("quarantine non-core scheduling storm workloads", func() error {
		return o.quarantineSchedulingStormWorkloads(ctx)
	})
}
// quarantineSchedulingStormWorkloads performs one scan-and-mitigate pass: it
// finds pods stuck in Pending with a recent, high-count FailedScheduling
// warning event, resolves each pod to its owning deployment or statefulset,
// and scales that workload to zero replicas — unless the workload is in a
// required/ignored namespace, matches an ignore rule, or targets an ignored
// node. Returns an error only when a cluster query, decode, or scale call
// fails; finding nothing to quarantine is a successful no-op.
//
// Why: limits startup-only mitigation to workloads proven to be generating a
// scheduling event storm, instead of scaling optional apps down blindly.
func (o *Orchestrator) quarantineSchedulingStormWorkloads(ctx context.Context) error {
	// Snapshot cluster state as kubectl JSON: pods, replicasets (needed for
	// pod -> deployment ownership), warning events, and scalable workloads.
	podsOut, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
	if err != nil {
		return fmt.Errorf("query pods for scheduling storm scan: %w", err)
	}
	var pods podList
	if err := json.Unmarshal([]byte(podsOut), &pods); err != nil {
		return fmt.Errorf("decode pods for scheduling storm scan: %w", err)
	}
	rsOut, err := o.kubectl(ctx, 30*time.Second, "get", "replicasets", "-A", "-o", "json")
	if err != nil {
		return fmt.Errorf("query replicasets for scheduling storm scan: %w", err)
	}
	var rsList replicaSetList
	if err := json.Unmarshal([]byte(rsOut), &rsList); err != nil {
		return fmt.Errorf("decode replicasets for scheduling storm scan: %w", err)
	}
	eventsOut, err := o.kubectl(ctx, 30*time.Second, "get", "events", "-A", "-o", "json")
	if err != nil {
		return fmt.Errorf("query events for scheduling storm scan: %w", err)
	}
	var events eventList
	if err := json.Unmarshal([]byte(eventsOut), &events); err != nil {
		return fmt.Errorf("decode events for scheduling storm scan: %w", err)
	}
	workloadsOut, err := o.kubectl(ctx, 30*time.Second, "get", "deploy,statefulset", "-A", "-o", "json")
	if err != nil {
		return fmt.Errorf("query workloads for scheduling storm scan: %w", err)
	}
	var workloads workloadList
	if err := json.Unmarshal([]byte(workloadsOut), &workloads); err != nil {
		return fmt.Errorf("decode workloads for scheduling storm scan: %w", err)
	}
	// Exclusion sets: namespaces hosting core services plus operator-configured
	// namespace/workload/node ignore rules.
	requiredNamespaces := o.startupRequiredWorkloadNamespaces()
	ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
	ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
	ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads)
	// Detection thresholds with defaults when unset/non-positive: at least 30
	// FailedScheduling observations inside a 3-minute window.
	eventThreshold := o.cfg.Startup.SchedulingStormEventThreshold
	if eventThreshold <= 0 {
		eventThreshold = 30
	}
	window := time.Duration(o.cfg.Startup.SchedulingStormWindowSeconds) * time.Second
	if window <= 0 {
		window = 3 * time.Minute
	}
	// Index pods by "namespace/name" so event involvedObject refs resolve in O(1).
	podsByKey := map[string]podResource{}
	for _, pod := range pods.Items {
		ns := strings.TrimSpace(pod.Metadata.Namespace)
		name := strings.TrimSpace(pod.Metadata.Name)
		if ns == "" || name == "" {
			continue
		}
		podsByKey[ns+"/"+name] = pod
	}
	// Map each replicaset to its first usable owner reference (normally the
	// owning Deployment).
	rsOwners := map[string]ownerReference{}
	for _, rs := range rsList.Items {
		ns := strings.TrimSpace(rs.Metadata.Namespace)
		name := strings.TrimSpace(rs.Metadata.Name)
		if ns == "" || name == "" {
			continue
		}
		for _, owner := range rs.Metadata.OwnerReferences {
			kind := strings.TrimSpace(owner.Kind)
			ownerName := strings.TrimSpace(owner.Name)
			if kind == "" || ownerName == "" {
				continue
			}
			rsOwners[ns+"/"+name] = owner
			break // keep only the first owner with a kind and name
		}
	}
	// Record desired replica counts keyed "namespace/kind/name" so workloads
	// already at zero replicas are skipped below.
	workloadDesired := map[string]int32{}
	for _, item := range workloads.Items {
		kind := strings.ToLower(strings.TrimSpace(item.Kind))
		ns := strings.TrimSpace(item.Metadata.Namespace)
		name := strings.TrimSpace(item.Metadata.Name)
		if kind == "" || ns == "" || name == "" {
			continue
		}
		desired, _, ok := desiredReady(item)
		if !ok {
			continue
		}
		workloadDesired[ns+"/"+kind+"/"+name] = desired
	}
	quarantined := []string{}
	seen := map[string]struct{}{} // workload keys already scaled this pass
	now := time.Now()
	for _, event := range events.Items {
		// Only Warning/FailedScheduling events on pods qualify as storm signal.
		if !strings.EqualFold(strings.TrimSpace(event.Type), "Warning") {
			continue
		}
		if strings.TrimSpace(event.Reason) != "FailedScheduling" {
			continue
		}
		if !strings.EqualFold(strings.TrimSpace(event.InvolvedObject.Kind), "Pod") {
			continue
		}
		lastSeen := eventLastObservedAt(event)
		// NOTE(review): an event with no observable timestamp (zero lastSeen)
		// passes this recency filter — confirm that is intended.
		if !lastSeen.IsZero() && now.Sub(lastSeen) > window {
			continue
		}
		count := eventObservationCount(event)
		if count < eventThreshold {
			continue
		}
		// The pod must still exist and still be Pending (i.e. unschedulable).
		podKey := strings.TrimSpace(event.InvolvedObject.Namespace) + "/" + strings.TrimSpace(event.InvolvedObject.Name)
		pod, ok := podsByKey[podKey]
		if !ok {
			continue
		}
		if !strings.EqualFold(strings.TrimSpace(pod.Status.Phase), "Pending") {
			continue
		}
		// Never quarantine core/ignored namespaces, ignored pods, or pods that
		// target a node the operator told us to disregard.
		ns := strings.TrimSpace(pod.Metadata.Namespace)
		if _, ok := requiredNamespaces[ns]; ok {
			continue
		}
		if _, ok := ignoredNamespaces[ns]; ok {
			continue
		}
		if workloadIgnored(ignoreRules, ns, "", strings.TrimSpace(pod.Metadata.Name)) {
			continue
		}
		if podTargetsIgnoredNode(pod, ignoredNodes) {
			continue
		}
		// Mitigate at the owning deployment/statefulset, not the pod itself.
		workload, ok := schedulingStormOwnerWorkload(pod, rsOwners)
		if !ok {
			continue
		}
		if workloadIgnored(ignoreRules, workload.Namespace, workload.Kind, workload.Name) {
			continue
		}
		workloadKey := workload.Namespace + "/" + workload.Kind + "/" + workload.Name
		if _, done := seen[workloadKey]; done {
			continue // already handled via another pod's event
		}
		desired := workloadDesired[workloadKey]
		if desired <= 0 {
			continue // unknown workload, or already scaled to zero
		}
		if err := o.ensureWorkloadReplicas(ctx, workload, 0); err != nil {
			return fmt.Errorf("scale scheduling storm workload %s to 0: %w", workloadKey, err)
		}
		seen[workloadKey] = struct{}{}
		quarantined = append(quarantined, fmt.Sprintf("%s events=%d window=%s", workloadKey, count, window))
	}
	if len(quarantined) == 0 {
		return nil
	}
	// Log and record the mitigation so operators can see what was scaled down.
	sort.Strings(quarantined)
	detail := fmt.Sprintf("quarantined scheduling storm workload(s): %s", joinLimited(quarantined, 8))
	o.log.Printf("%s", detail)
	o.noteStartupAutoHeal(detail)
	return nil
}
// schedulingStormOwnerWorkload maps a pod to the workload that owns it: a
// StatefulSet owner is returned directly, while a ReplicaSet owner is chased
// one level up (via rsOwners, keyed "namespace/name") to its Deployment. The
// boolean reports whether a scalable owner was found.
//
// Why: scheduling storms happen at the pod layer, but safe mitigation needs to
// operate on the owning deployment or statefulset.
func schedulingStormOwnerWorkload(pod podResource, rsOwners map[string]ownerReference) (startupWorkload, bool) {
	namespace := strings.TrimSpace(pod.Metadata.Namespace)
	for _, ref := range pod.Metadata.OwnerReferences {
		refKind := strings.TrimSpace(ref.Kind)
		refName := strings.TrimSpace(ref.Name)
		if refKind == "StatefulSet" {
			if refName == "" {
				continue
			}
			return startupWorkload{Namespace: namespace, Kind: "statefulset", Name: refName}, true
		}
		if refKind != "ReplicaSet" || refName == "" {
			continue
		}
		// Resolve the replicaset's own owner; only a named Deployment counts.
		parent, found := rsOwners[namespace+"/"+refName]
		if !found {
			continue
		}
		parentName := strings.TrimSpace(parent.Name)
		if strings.TrimSpace(parent.Kind) != "Deployment" || parentName == "" {
			continue
		}
		return startupWorkload{Namespace: namespace, Kind: "deployment", Name: parentName}, true
	}
	return startupWorkload{}, false
}
// eventObservationCount reports how many times an event has been observed,
// taking the larger of the root-level count and the series count, with a
// floor of 1.
//
// Why: event count can live either on the root event or in the series payload;
// using the max keeps detection stable across Kubernetes versions.
func eventObservationCount(event eventResource) int {
	observed := event.Count
	if series := event.Series.Count; series > observed {
		observed = series
	}
	// Every event represents at least one observation.
	if observed < 1 {
		observed = 1
	}
	return observed
}
// eventLastObservedAt returns the most authoritative "last seen" time for an
// event, preferring series.lastObservedTime, then lastTimestamp, then
// eventTime, and finally falling back to the object's creation timestamp.
//
// Why: event recency fields vary by cluster version; prefer the newest explicit
// observation time and fall back to creation time when needed.
func eventLastObservedAt(event eventResource) time.Time {
	if ts := event.Series.LastObservedTime; !ts.IsZero() {
		return ts
	}
	if ts := event.LastTimestamp; !ts.IsZero() {
		return ts
	}
	if ts := event.EventTime; !ts.IsZero() {
		return ts
	}
	return event.Metadata.CreationTimestamp
}