ariadne/ariadne/services/cluster_state_contract.py

122 lines
2.8 KiB
Python

from __future__ import annotations
from dataclasses import dataclass
from typing import Any
# ---------------------------------------------------------------------------
# Shared tunables for the cluster-state code. Everything here is a plain
# constant; the notes on meaning/units are reviewer assumptions unless the
# value itself makes them obvious — confirm against the consuming modules.
# ---------------------------------------------------------------------------

# Expected length of a "value pair"; presumably a [timestamp, value] metric
# sample — TODO confirm against the parser that uses it.
_VALUE_PAIR_LEN = 2

# Look-back windows, written as duration strings.
_RATE_WINDOW = "5m"
_RESTARTS_WINDOW = "1h"
_BASELINE_WINDOW = "24h"
_TREND_WINDOWS = ("1h", "6h", "24h")

# Per-category caps on trend entries.
_TREND_NODE_LIMIT = 30
_TREND_NAMESPACE_LIMIT = 20
_TREND_PVC_LIMIT = 10
_TREND_JOB_LIMIT = 10
_TREND_POD_LIMIT = 15

# Node alert thresholds (presumably percentages — verify) and spike factors.
_NODE_DISK_ALERT = 80.0
_NODE_CPU_ALERT = 80.0
_NODE_RAM_ALERT = 80.0
_NET_SPIKE_MULTIPLIER = 2.0
_IO_SPIKE_MULTIPLIER = 2.0

# Metric selector that only matches series carrying a non-empty ``nodename``.
_NODE_UNAME_LABEL = 'node_uname_info{nodename!=""}'

# Label keys that can name a workload. NOTE(review): the tuple order looks
# like a lookup priority — confirm with the code that walks it.
_WORKLOAD_LABEL_KEYS = (
    "app.kubernetes.io/name",
    "app",
    "k8s-app",
    "app.kubernetes.io/instance",
    "release",
)

# Namespaces classified as "system".
_SYSTEM_NAMESPACES = {
    "kube-system",
    "kube-public",
    "kube-node-lease",
    "flux-system",
    "monitoring",
    "logging",
    "traefik",
    "cert-manager",
    "maintenance",
    "postgres",
    "vault",
}

# "maintenance" is also listed in _SYSTEM_NAMESPACES above; presumably this
# set is an allow-list exception for workload reporting — verify with callers.
_WORKLOAD_ALLOWED_NAMESPACES = {"maintenance"}

# Baseline-delta severity cut-offs (units not visible here — TODO confirm).
_BASELINE_DELTA_WARN = 50.0
_BASELINE_DELTA_CRIT = 100.0

# Size caps for the various summary sections.
_SIGNAL_LIMIT = 15
_PROFILE_LIMIT = 6
_WORKLOAD_INDEX_LIMIT = 20
_NODE_WORKLOAD_LIMIT = 12
_NODE_WORKLOAD_TOP = 3
_EVENTS_SUMMARY_LIMIT = 5
_PVC_CRITICAL_THRESHOLD = 90.0

# Resource keys and node condition types of interest.
_CAPACITY_KEYS = {"cpu", "memory", "pods", "ephemeral-storage"}
_PRESSURE_TYPES = {
    "MemoryPressure",
    "DiskPressure",
    "PIDPressure",
    "NetworkUnavailable",
}

# Event handling.
_EVENTS_MAX = 20
_EVENT_WARNING = "Warning"

# Numeric rank per pod phase; "Failed" carries the highest value.
_PHASE_SEVERITY = {"Failed": 3, "Pending": 2, "Unknown": 1}

# 0.25 hours == 15 minutes.
_PENDING_15M_HOURS = 0.25

# "Top N" sizes and further list caps.
_LOAD_TOP_COUNT = 5
_NAMESPACE_TOP_COUNT = 5
_PVC_PRESSURE_THRESHOLD = 80.0
_ALERT_TOP_LIMIT = 10
_POD_REASON_LIMIT = 10
_POD_REASON_TREND_LIMIT = 10
_NAMESPACE_ISSUE_LIMIT = 8
_CROSS_NODE_TOP = 3
_CROSS_NAMESPACE_TOP = 3
_CROSS_PVC_TOP = 3

# snake_case key -> container state reason string.
_POD_TERMINATED_REASONS = {"oom_killed": "OOMKilled", "error": "Error"}
_POD_WAITING_REASONS = {
    "crash_loop": "CrashLoopBackOff",
    "image_pull_backoff": "ImagePullBackOff",
    "err_image_pull": "ErrImagePull",
    "create_config_error": "CreateContainerConfigError",
}

_DELTA_TOP_LIMIT = 6
_REASON_TOP_LIMIT = 5
@dataclass(frozen=True)
class ClusterStateSummary:
    """Immutable record of headline cluster counters.

    Instances are frozen: assigning to a field after construction raises.
    The four count fields accept ``None``; ``errors`` is always an ``int``.
    NOTE(review): ``None`` presumably means "value could not be collected" —
    confirm with the code that builds this summary.
    """

    # Node counts.
    nodes_total: int | None
    nodes_ready: int | None

    # Pod count.
    pods_running: int | None

    # Kustomization count.
    kustomizations_not_ready: int | None

    # Error count (never None).
    errors: int
@dataclass(frozen=True)
class SignalContext:
    """Immutable bundle of the inputs grouped under one signal context.

    The dataclass is frozen, so fields cannot be rebound after construction;
    the dicts and lists they reference are ordinary mutable containers.
    NOTE(review): the schema of each mapping is not visible in this module —
    check the producers before relying on specific keys.
    """

    # Metrics mapping.
    metrics: dict[str, Any]

    # Per-node and per-namespace context entries, one dict each.
    node_context: list[dict[str, Any]]
    namespace_context: list[dict[str, Any]]

    # Workload, pod and kustomization mappings.
    workloads_health: dict[str, Any]
    pod_issues: dict[str, Any]
    kustomizations: dict[str, Any]
def _items(payload: dict[str, Any]) -> list[dict[str, Any]]:
    """Return the dict entries of ``payload["items"]``, tolerating bad input.

    Args:
        payload: Mapping expected to carry an ``"items"`` list. Anything
            without a ``.get`` method (for example ``None``) is treated as
            having no items instead of raising.

    Returns:
        A new list holding only the elements of ``payload["items"]`` that are
        dicts, in their original order. The result is empty when ``payload``
        is unusable, when ``"items"`` is missing, or when it is not a list.
    """
    try:
        # Single lookup; EAFP keeps duck-typed mappings working while a
        # None/list/str payload falls through to the empty result.
        candidates = payload.get("items")
    except AttributeError:
        return []
    if not isinstance(candidates, list):
        return []
    return [entry for entry in candidates if isinstance(entry, dict)]
# ``from module import *`` skips underscore-prefixed names unless they are
# listed in ``__all__``, so list them explicitly: every single-underscore
# global defined above this statement, plus the two public dataclasses.
# Dunder names (``__name__``, ``__doc__``, ...) are left out.
__all__ = [
    name
    for name in globals()
    if name in {"ClusterStateSummary", "SignalContext"}
    or (name[:1] == "_" and name[:2] != "__")
]