snapshot: add load and namespace summaries

2026-01-29 13:32:38 -03:00 · 2026-01-29 13:32:38 -03:00 · ccf89bc2e7
commit ccf89bc2e7
parent fc2a482df1
1 changed files with 105 additions and 0 deletions
--- a/ariadne/services/cluster_state.py
+++ b/ariadne/services/cluster_state.py
@ -62,6 +62,8 @@ _PHASE_SEVERITY = {
    "Unknown": 1,
 }
 _PENDING_15M_HOURS = 0.25
+# Maximum number of entries kept in each node-load list (top, bottom, outliers).
+_LOAD_TOP_COUNT = 5
+# Maximum number of entries kept in each namespace ratio/headroom list.
+_NAMESPACE_TOP_COUNT = 5


@dataclass(frozen=True)
@ -1490,6 +1492,105 @@ def _node_usage_profile(
    return output


+def _percentile(values: list[float], percentile: float) -> float | None:
+    """Return the nearest-rank percentile of ``values``.
+
+    ``percentile`` is a fraction (``0.9`` selects the 90th percentile). It is
+    scaled by the last index of the sorted values, rounded with the built-in
+    ``round`` and clamped into the valid index range, so fractions outside
+    ``[0, 1]`` resolve to the smallest or largest value.
+
+    Returns ``None`` when ``values`` is empty.
+    """
+    if not values:
+        return None
+    ranked = sorted(values)
+    last = len(ranked) - 1
+    position = int(round(last * percentile))
+    # Clamp so an out-of-range fraction still selects a real element.
+    if position < 0:
+        position = 0
+    elif position > last:
+        position = last
+    return ranked[position]
+
+
+def _node_load_summary(node_load: list[dict[str, Any]]) -> dict[str, Any]:
+    """Summarize the ``load_index`` values of the given node entries.
+
+    Entries that are not dicts, or whose ``load_index`` is not an int or
+    float, are ignored.
+
+    Returns an empty dict when ``node_load`` is empty or ``None``, or when no
+    entry is usable. Otherwise the result contains:
+
+    - ``avg``, ``p90``, ``min``, ``max``: statistics rounded to 3 decimals.
+    - ``top`` / ``bottom``: up to ``_LOAD_TOP_COUNT`` entries with the highest
+      / lowest ``load_index``.
+    - ``outliers``: up to ``_LOAD_TOP_COUNT`` entries whose ``load_index`` is
+      at least one population standard deviation above the mean, highest
+      first. Empty when every usable entry reports the same load.
+    """
+    if not node_load:
+        return {}
+    items = [
+        entry
+        for entry in node_load
+        if isinstance(entry, dict) and isinstance(entry.get("load_index"), (int, float))
+    ]
+    if not items:
+        return {}
+    values = [float(entry["load_index"]) for entry in items]
+    lowest = min(values)
+    highest = max(values)
+    avg = sum(values) / len(values)
+    variance = sum((value - avg) ** 2 for value in values) / len(values)
+    stddev = variance**0.5
+    ranked = sorted(items, key=lambda item: item["load_index"], reverse=True)
+    bottom = sorted(items, key=lambda item: item["load_index"])[:_LOAD_TOP_COUNT]
+    # With identical loads (including a single node) the deviation is zero and
+    # the threshold collapses onto the mean, which would flag every node as an
+    # outlier. Only look for outliers when the loads actually differ.
+    if highest > lowest:
+        threshold = avg + stddev
+        outliers = [item for item in ranked if item["load_index"] >= threshold]
+    else:
+        outliers = []
+    return {
+        "avg": round(avg, 3),
+        "p90": round(_percentile(values, 0.9) or 0.0, 3),
+        "min": round(lowest, 3),
+        "max": round(highest, 3),
+        "top": ranked[:_LOAD_TOP_COUNT],
+        "bottom": bottom,
+        "outliers": outliers[:_LOAD_TOP_COUNT],
+    }
+
+
+def _namespace_capacity_summary(capacity: list[dict[str, Any]]) -> dict[str, Any]:
+    """Summarize namespace usage ratios and request headroom.
+
+    Non-dict entries are ignored. Returns an empty dict when ``capacity`` is
+    empty. Otherwise the result contains, each list capped at
+    ``_NAMESPACE_TOP_COUNT`` entries:
+
+    - ``cpu_ratio_top`` / ``mem_ratio_top``: entries with a numeric
+      ``cpu_usage_ratio`` / ``mem_usage_ratio``, highest ratio first.
+    - ``cpu_headroom_low`` / ``mem_headroom_low``: rows with ``namespace``,
+      ``headroom`` (requests minus usage), ``usage``, ``requests`` and
+      ``ratio``, smallest headroom first. Only entries where both usage and
+      requests are numeric are included.
+    - ``cpu_overcommitted`` / ``mem_overcommitted``: how many entries have a
+      usage ratio greater than 1.
+    """
+    if not capacity:
+        return {}
+    numeric = (int, float)
+    rows = [entry for entry in capacity if isinstance(entry, dict)]
+
+    def ranked_by(ratio_key: str) -> list[dict[str, Any]]:
+        # Entries carrying a numeric ratio, highest ratio first.
+        return sorted(
+            (row for row in rows if isinstance(row.get(ratio_key), numeric)),
+            key=lambda row: row.get(ratio_key) or 0,
+            reverse=True,
+        )
+
+    def headroom_by(usage_key: str, requests_key: str, ratio_key: str) -> list[dict[str, Any]]:
+        # Requests minus usage per entry, smallest headroom first.
+        found: list[dict[str, Any]] = []
+        for row in rows:
+            used = row.get(usage_key)
+            requested = row.get(requests_key)
+            if not (isinstance(used, numeric) and isinstance(requested, numeric)):
+                continue
+            found.append(
+                {
+                    "namespace": row.get("namespace"),
+                    "headroom": requested - used,
+                    "usage": used,
+                    "requests": requested,
+                    "ratio": row.get(ratio_key),
+                }
+            )
+        found.sort(key=lambda item: item.get("headroom") or 0)
+        return found
+
+    cpu_ranked = ranked_by("cpu_usage_ratio")
+    mem_ranked = ranked_by("mem_usage_ratio")
+    cpu_low = headroom_by("cpu_usage", "cpu_requests", "cpu_usage_ratio")
+    mem_low = headroom_by("mem_usage", "mem_requests", "mem_usage_ratio")
+    return {
+        "cpu_ratio_top": cpu_ranked[:_NAMESPACE_TOP_COUNT],
+        "mem_ratio_top": mem_ranked[:_NAMESPACE_TOP_COUNT],
+        "cpu_headroom_low": cpu_low[:_NAMESPACE_TOP_COUNT],
+        "mem_headroom_low": mem_low[:_NAMESPACE_TOP_COUNT],
+        "cpu_overcommitted": sum((row.get("cpu_usage_ratio") or 0) > 1 for row in cpu_ranked),
+        "mem_overcommitted": sum((row.get("mem_usage_ratio") or 0) > 1 for row in mem_ranked),
+    }
+
+
 def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
    metrics: dict[str, Any] = {}
    try:
@ -1601,6 +1702,9 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
        )
    except Exception as exc:
        errors.append(f"namespace_usage: {exc}")
+    metrics["namespace_capacity_summary"] = _namespace_capacity_summary(
+        metrics.get("namespace_capacity", []),
+    )
    metrics["pvc_usage_top"] = _pvc_usage(errors)
    metrics["units"] = {
        "cpu": "percent",
@ -1654,6 +1758,7 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
        node_details,
        node_pods,
    )
+    metrics["node_load_summary"] = _node_load_summary(metrics.get("node_load", []))

    snapshot = {
        "collected_at": collected_at.isoformat(),