package cluster

import (
	"context"
	"encoding/json"
	"fmt"
	"sort"
	"strings"
	"time"
)

// maybeAutoQuarantineSchedulingStorms throttles and, at most once every 30 seconds,
// triggers a best-effort quarantine scan for non-core workloads caught in a scheduling storm.
// Signature: (o *Orchestrator) maybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time).
// Why: a non-core workload that cannot schedule can emit enough warning events to
// thrash the control plane datastore; quarantine keeps startup moving while
// preserving core services.
func (o *Orchestrator) maybeAutoQuarantineSchedulingStorms(ctx context.Context, lastAttempt *time.Time) {
	if o.runner.DryRun || !o.cfg.Startup.AutoQuarantineSchedulingStorms {
		return
	}
	now := time.Now()
	if lastAttempt != nil && !lastAttempt.IsZero() && now.Sub(*lastAttempt) < 30*time.Second {
		return
	}
	if lastAttempt != nil {
		*lastAttempt = now
	}
	o.bestEffort("quarantine non-core scheduling storm workloads", func() error {
		return o.quarantineSchedulingStormWorkloads(ctx)
	})
}
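
// The scan above is driven by the Startup config fields referenced in this file
// (AutoQuarantineSchedulingStorms, SchedulingStormEventThreshold,
// SchedulingStormWindowSeconds, and the Ignore* lists). A minimal sketch of the
// stanza, assuming YAML serialization and these key spellings (both are
// assumptions, not confirmed by this file):
//
//	startup:
//	  autoQuarantineSchedulingStorms: true
//	  schedulingStormEventThreshold: 30   # falls back to 30 when unset or <= 0
//	  schedulingStormWindowSeconds: 180   # falls back to 3 minutes when unset or <= 0
//	  ignoreWorkloadNamespaces: []
//	  ignoreWorkloads: []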

// quarantineSchedulingStormWorkloads scans recent FailedScheduling warning events
// and scales the owning non-core deployments or statefulsets down to zero replicas.
// Signature: (o *Orchestrator) quarantineSchedulingStormWorkloads(ctx context.Context) error.
// Why: limits startup-only mitigation to workloads proven to be generating a
// scheduling event storm, instead of scaling optional apps down blindly.
func (o *Orchestrator) quarantineSchedulingStormWorkloads(ctx context.Context) error {
	// Snapshot pods, replicasets, events, and scalable workloads via kubectl.
	podsOut, err := o.kubectl(ctx, 30*time.Second, "get", "pods", "-A", "-o", "json")
	if err != nil {
		return fmt.Errorf("query pods for scheduling storm scan: %w", err)
	}
	var pods podList
	if err := json.Unmarshal([]byte(podsOut), &pods); err != nil {
		return fmt.Errorf("decode pods for scheduling storm scan: %w", err)
	}

	rsOut, err := o.kubectl(ctx, 30*time.Second, "get", "replicasets", "-A", "-o", "json")
	if err != nil {
		return fmt.Errorf("query replicasets for scheduling storm scan: %w", err)
	}
	var rsList replicaSetList
	if err := json.Unmarshal([]byte(rsOut), &rsList); err != nil {
		return fmt.Errorf("decode replicasets for scheduling storm scan: %w", err)
	}

	eventsOut, err := o.kubectl(ctx, 30*time.Second, "get", "events", "-A", "-o", "json")
	if err != nil {
		return fmt.Errorf("query events for scheduling storm scan: %w", err)
	}
	var events eventList
	if err := json.Unmarshal([]byte(eventsOut), &events); err != nil {
		return fmt.Errorf("decode events for scheduling storm scan: %w", err)
	}

	workloadsOut, err := o.kubectl(ctx, 30*time.Second, "get", "deploy,statefulset", "-A", "-o", "json")
	if err != nil {
		return fmt.Errorf("query workloads for scheduling storm scan: %w", err)
	}
	var workloads workloadList
	if err := json.Unmarshal([]byte(workloadsOut), &workloads); err != nil {
		return fmt.Errorf("decode workloads for scheduling storm scan: %w", err)
	}

	requiredNamespaces := o.startupRequiredWorkloadNamespaces()
	ignoredNamespaces := makeStringSet(o.cfg.Startup.IgnoreWorkloadNamespaces)
	ignoredNodes := makeStringSet(o.cfg.Startup.IgnoreUnavailableNodes)
	ignoreRules := parseWorkloadIgnoreRules(o.cfg.Startup.IgnoreWorkloads)
	eventThreshold := o.cfg.Startup.SchedulingStormEventThreshold
	if eventThreshold <= 0 {
		eventThreshold = 30
	}
	window := time.Duration(o.cfg.Startup.SchedulingStormWindowSeconds) * time.Second
	if window <= 0 {
		window = 3 * time.Minute
	}

	// Index pods by namespace/name so events can be correlated back to a live pod.
	podsByKey := map[string]podResource{}
	for _, pod := range pods.Items {
		ns := strings.TrimSpace(pod.Metadata.Namespace)
		name := strings.TrimSpace(pod.Metadata.Name)
		if ns == "" || name == "" {
			continue
		}
		podsByKey[ns+"/"+name] = pod
	}

	// Map each ReplicaSet to its first named owner (normally the Deployment that created it).
	rsOwners := map[string]ownerReference{}
	for _, rs := range rsList.Items {
		ns := strings.TrimSpace(rs.Metadata.Namespace)
		name := strings.TrimSpace(rs.Metadata.Name)
		if ns == "" || name == "" {
			continue
		}
		for _, owner := range rs.Metadata.OwnerReferences {
			kind := strings.TrimSpace(owner.Kind)
			ownerName := strings.TrimSpace(owner.Name)
			if kind == "" || ownerName == "" {
				continue
			}
			rsOwners[ns+"/"+name] = owner
			break
		}
	}

	// Record desired replica counts so workloads already scaled to zero are skipped.
	workloadDesired := map[string]int32{}
	for _, item := range workloads.Items {
		kind := strings.ToLower(strings.TrimSpace(item.Kind))
		ns := strings.TrimSpace(item.Metadata.Namespace)
		name := strings.TrimSpace(item.Metadata.Name)
		if kind == "" || ns == "" || name == "" {
			continue
		}
		desired, _, ok := desiredReady(item)
		if !ok {
			continue
		}
		workloadDesired[ns+"/"+kind+"/"+name] = desired
	}

	// Walk recent FailedScheduling warnings and quarantine the owning non-core workloads.
	quarantined := []string{}
	seen := map[string]struct{}{}
	now := time.Now()
	for _, event := range events.Items {
		if !strings.EqualFold(strings.TrimSpace(event.Type), "Warning") {
			continue
		}
		if strings.TrimSpace(event.Reason) != "FailedScheduling" {
			continue
		}
		if !strings.EqualFold(strings.TrimSpace(event.InvolvedObject.Kind), "Pod") {
			continue
		}
		lastSeen := eventLastObservedAt(event)
		if !lastSeen.IsZero() && now.Sub(lastSeen) > window {
			continue
		}
		count := eventObservationCount(event)
		if count < eventThreshold {
			continue
		}
		podKey := strings.TrimSpace(event.InvolvedObject.Namespace) + "/" + strings.TrimSpace(event.InvolvedObject.Name)
		pod, ok := podsByKey[podKey]
		if !ok {
			continue
		}
		if !strings.EqualFold(strings.TrimSpace(pod.Status.Phase), "Pending") {
			continue
		}
		ns := strings.TrimSpace(pod.Metadata.Namespace)
		if _, ok := requiredNamespaces[ns]; ok {
			continue
		}
		if _, ok := ignoredNamespaces[ns]; ok {
			continue
		}
		if workloadIgnored(ignoreRules, ns, "", strings.TrimSpace(pod.Metadata.Name)) {
			continue
		}
		if podTargetsIgnoredNode(pod, ignoredNodes) {
			continue
		}
		workload, ok := schedulingStormOwnerWorkload(pod, rsOwners)
		if !ok {
			continue
		}
		if workloadIgnored(ignoreRules, workload.Namespace, workload.Kind, workload.Name) {
			continue
		}
		workloadKey := workload.Namespace + "/" + workload.Kind + "/" + workload.Name
		if _, done := seen[workloadKey]; done {
			continue
		}
		desired := workloadDesired[workloadKey]
		if desired <= 0 {
			continue
		}
		if err := o.ensureWorkloadReplicas(ctx, workload, 0); err != nil {
			return fmt.Errorf("scale scheduling storm workload %s to 0: %w", workloadKey, err)
		}
		seen[workloadKey] = struct{}{}
		quarantined = append(quarantined, fmt.Sprintf("%s events=%d window=%s", workloadKey, count, window))
	}

	if len(quarantined) == 0 {
		return nil
	}
	sort.Strings(quarantined)
	detail := fmt.Sprintf("quarantined scheduling storm workload(s): %s", joinLimited(quarantined, 8))
	o.log.Printf("%s", detail)
	o.noteStartupAutoHeal(detail)
	return nil
}
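
// For reference, a Warning event shaped roughly like the following would trip the
// scan above once its count reaches the threshold inside the window, provided the
// pod is still Pending and not in a required or ignored namespace (namespace,
// names, and message are illustrative, not taken from a real cluster):
//
//	{
//	  "type": "Warning",
//	  "reason": "FailedScheduling",
//	  "involvedObject": {"kind": "Pod", "namespace": "media", "name": "transcoder-7d9c6bbf5-x2kqp"},
//	  "count": 42,
//	  "lastTimestamp": "2024-05-01T10:15:00Z",
//	  "message": "0/3 nodes are available: 3 Insufficient cpu."
//	}
//
// The mitigation is a plain scale-to-zero of the owning workload; the previous
// replica count is not recorded here, so restoring a quarantined workload is a
// manual follow-up (for example `kubectl -n media scale deployment/transcoder --replicas=1`).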

// schedulingStormOwnerWorkload resolves a pod to the deployment or statefulset
// that owns it.
// Signature: schedulingStormOwnerWorkload(pod podResource, rsOwners map[string]ownerReference) (startupWorkload, bool).
// Why: scheduling storms happen at the pod layer, but safe mitigation needs to
// operate on the owning deployment or statefulset.
func schedulingStormOwnerWorkload(pod podResource, rsOwners map[string]ownerReference) (startupWorkload, bool) {
	ns := strings.TrimSpace(pod.Metadata.Namespace)
	for _, owner := range pod.Metadata.OwnerReferences {
		switch strings.TrimSpace(owner.Kind) {
		case "StatefulSet":
			if name := strings.TrimSpace(owner.Name); name != "" {
				return startupWorkload{Namespace: ns, Kind: "statefulset", Name: name}, true
			}
		case "ReplicaSet":
			rsName := strings.TrimSpace(owner.Name)
			if rsName == "" {
				continue
			}
			rsOwner, ok := rsOwners[ns+"/"+rsName]
			if !ok || strings.TrimSpace(rsOwner.Kind) != "Deployment" || strings.TrimSpace(rsOwner.Name) == "" {
				continue
			}
			return startupWorkload{Namespace: ns, Kind: "deployment", Name: strings.TrimSpace(rsOwner.Name)}, true
		}
	}
	return startupWorkload{}, false
}
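
// The resolution above follows standard Kubernetes ownership chains. With
// illustrative names, the two supported shapes are:
//
//	pod media/transcoder-7d9c6bbf5-x2kqp
//	  ownerReferences: [{kind: ReplicaSet, name: transcoder-7d9c6bbf5}]
//	replicaset media/transcoder-7d9c6bbf5
//	  ownerReferences: [{kind: Deployment, name: transcoder}]
//	=> startupWorkload{Namespace: "media", Kind: "deployment", Name: "transcoder"}
//
//	pod media/archive-0
//	  ownerReferences: [{kind: StatefulSet, name: archive}]
//	=> startupWorkload{Namespace: "media", Kind: "statefulset", Name: "archive"}
//
// Bare pods, job-owned pods, and other owner kinds fall through and are left alone.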

// eventObservationCount reports how many times an event has been observed, taking
// the larger of the event's own count and its series count.
// Signature: eventObservationCount(event eventResource) int.
// Why: event count can live either on the root event or in the series payload;
// using the max keeps detection stable across Kubernetes versions.
func eventObservationCount(event eventResource) int {
	count := event.Count
	if event.Series.Count > count {
		count = event.Series.Count
	}
	if count < 1 {
		return 1
	}
	return count
}
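
// Worked example: an aggregated event carrying count=12 but series.count=45
// reports 45, while a freshly created event with neither field set reports 1 so
// a single observation still registers against the threshold.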

// eventLastObservedAt returns the most recent time the event was observed.
// Signature: eventLastObservedAt(event eventResource) time.Time.
// Why: event recency fields vary by cluster version; prefer the newest explicit
// observation time and fall back to creation time when needed.
func eventLastObservedAt(event eventResource) time.Time {
	switch {
	case !event.Series.LastObservedTime.IsZero():
		return event.Series.LastObservedTime
	case !event.LastTimestamp.IsZero():
		return event.LastTimestamp
	case !event.EventTime.IsZero():
		return event.EventTime
	default:
		return event.Metadata.CreationTimestamp
	}
}
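
// Note on field provenance (an assumption about typical clusters, not verified
// here): older event writers populate count and lastTimestamp on the event
// itself, while newer ones record eventTime plus a series{count, lastObservedTime}
// payload. That split is why eventLastObservedAt prefers series.lastObservedTime
// before the legacy timestamps and eventObservationCount takes the larger of the
// two counts.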