cluster: add lexicon and cross stats

2026-01-31 21:37:59 -03:00 · 2026-01-31 21:37:59 -03:00 · bdb94ffbe1
commit bdb94ffbe1
parent a5b35848d0
1 changed files with 137 additions and 0 deletions
--- a/ariadne/services/cluster_state.py
+++ b/ariadne/services/cluster_state.py
@ -90,6 +90,9 @@ _ALERT_TOP_LIMIT = 10
 _POD_REASON_LIMIT = 10
 _POD_REASON_TREND_LIMIT = 10
 _NAMESPACE_ISSUE_LIMIT = 8
+_CROSS_NODE_TOP = 3
+_CROSS_NAMESPACE_TOP = 3
+_CROSS_PVC_TOP = 3
 _POD_TERMINATED_REASONS = {
    "oom_killed": "OOMKilled",
    "error": "Error",
@ -2654,6 +2657,138 @@ def _events_summary(events: dict[str, Any]) -> dict[str, Any]:
    }


+def _build_lexicon() -> dict[str, Any]:
+    """Return the static glossary of terms and aliases.
+
+    The result has two keys: ``terms``, a list of ``{"term", "meaning"}``
+    dicts, and ``aliases``, a mapping from a shorthand phrase to its expansion.
+    Nothing here depends on cluster data; the content is fixed.
+    """
+    # (term, meaning) pairs, kept in the order they are emitted.
+    definitions = (
+        ("hottest", "highest utilization for a metric (cpu, ram, net, io, load_index)."),
+        (
+            "pressure",
+            "node condition flags (MemoryPressure, DiskPressure, PIDPressure, NetworkUnavailable).",
+        ),
+        ("load_index", "composite load score derived from cpu, ram, net, io."),
+        ("top", "highest values within a category."),
+        ("pods", "running workload instances on a node or namespace."),
+        ("workload", "deployment/statefulset/daemonset grouping."),
+    )
+    return {
+        "terms": [{"term": word, "meaning": meaning} for word, meaning in definitions],
+        "aliases": {
+            "hot node": "node with highest load_index",
+            "hottest by cpu": "node with highest cpu utilization",
+            "hottest by ram": "node with highest ram utilization",
+            "pressure node": "node with pressure condition flags",
+        },
+    }
+
+
+def _top_named_entries(
+    entries: list[dict[str, Any]],
+    name_key: str,
+    limit: int,
+) -> list[dict[str, Any]]:
+    """Return the ``limit`` highest-valued entries as ``{"name", "value"}`` dicts.
+
+    Args:
+        entries: Candidate rows; ``None`` or an empty list yields ``[]``.
+            Items that are not dicts are ignored.
+        name_key: Key under which each row stores its name. Rows whose name
+            is missing, empty, or not a string are ignored.
+        limit: Maximum number of rows to return; values below zero are
+            treated as zero.
+
+    Returns:
+        Rows sorted by ``value`` descending (ties keep input order), each
+        reduced to ``{"name": <str>, "value": <float>}``. A ``"value"`` that
+        is missing, unparseable, or NaN is reported as ``0.0``.
+    """
+    ranked: list[dict[str, Any]] = []
+    for entry in entries or []:
+        if not isinstance(entry, dict):
+            continue
+        name = entry.get(name_key)
+        if not isinstance(name, str) or not name:
+            continue
+        try:
+            numeric = float(entry.get("value"))
+        except (TypeError, ValueError):
+            numeric = 0.0
+        if numeric != numeric:
+            # float() accepts "NaN"; a NaN key makes every comparison false and
+            # leaves the sort order arbitrary, so fold it into the same 0.0
+            # fallback used for unparseable values.
+            numeric = 0.0
+        ranked.append({"name": name, "value": numeric})
+    # reverse=True keeps the sort stable, so equal values stay in input order.
+    ranked.sort(key=lambda item: item["value"], reverse=True)
+    # A negative limit would otherwise slice from the tail instead of capping.
+    return ranked[: max(limit, 0)]
+
+
+def _cross_node_metric_top(metrics: dict[str, Any], node_context: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Pair the top nodes of each usage metric with their per-node context.
+
+    For every list-valued series under ``metrics["node_usage"]`` (cpu, ram,
+    net, io, disk) the leading ``_CROSS_NODE_TOP`` nodes are emitted, one row
+    each. A row carries the metric name, the node, its ranked value, and the
+    matching ``node_context`` entry's fields; a field is ``None`` when the
+    context entry lacks it or no entry exists for that node.
+    """
+    # Fields copied verbatim from the node's context entry, in output order.
+    context_fields = (
+        "cpu",
+        "ram",
+        "net",
+        "io",
+        "disk",
+        "load_index",
+        "pods_total",
+        "hardware",
+        "roles",
+        "pressure_flags",
+    )
+    node_usage = metrics.get("node_usage")
+    if not isinstance(node_usage, dict):
+        node_usage = {}
+    # Later entries win when two context entries share a node name.
+    context_by_node: dict[Any, dict[str, Any]] = {}
+    for item in node_context:
+        if isinstance(item, dict):
+            context_by_node[item.get("node")] = item
+    rows: list[dict[str, Any]] = []
+    for metric in ("cpu", "ram", "net", "io", "disk"):
+        series = node_usage.get(metric)
+        if not isinstance(series, list):
+            continue
+        for ranked in _top_named_entries(series, "node", _CROSS_NODE_TOP):
+            node = ranked.get("name")
+            if not node:
+                continue
+            context = context_by_node.get(node, {})
+            row: dict[str, Any] = {
+                "metric": metric,
+                "node": node,
+                "value": ranked.get("value"),
+            }
+            for field in context_fields:
+                row[field] = context.get(field)
+            rows.append(row)
+    return rows
+
+
+def _cross_namespace_metric_top(
+    metrics: dict[str, Any],
+    namespace_context: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """Pair the top namespaces of each metric with their namespace context.
+
+    For every list-valued series under ``metrics["namespace_top"]`` (cpu, mem,
+    net, io, restarts) the leading ``_CROSS_NAMESPACE_TOP`` namespaces are
+    emitted, one row each. A row carries the metric name, the namespace, its
+    ranked value, and the matching ``namespace_context`` entry's fields.
+    Missing fields are ``None``, except ``nodes_top``, which falls back to an
+    empty list.
+    """
+    # Fields copied verbatim from the namespace's context entry, in output order.
+    context_fields = ("pods_total", "pods_running", "cpu_ratio", "mem_ratio", "primary_node")
+    namespace_top = metrics.get("namespace_top")
+    if not isinstance(namespace_top, dict):
+        namespace_top = {}
+    # Context entries without a namespace name cannot be looked up; skip them.
+    context_by_namespace: dict[Any, dict[str, Any]] = {}
+    for item in namespace_context:
+        if isinstance(item, dict) and item.get("namespace"):
+            context_by_namespace[item.get("namespace")] = item
+    rows: list[dict[str, Any]] = []
+    for metric in ("cpu", "mem", "net", "io", "restarts"):
+        series = namespace_top.get(metric)
+        if not isinstance(series, list):
+            continue
+        for ranked in _top_named_entries(series, "namespace", _CROSS_NAMESPACE_TOP):
+            namespace = ranked.get("name")
+            if not namespace:
+                continue
+            context = context_by_namespace.get(namespace, {})
+            row: dict[str, Any] = {
+                "metric": metric,
+                "namespace": namespace,
+                "value": ranked.get("value"),
+            }
+            for field in context_fields:
+                row[field] = context.get(field)
+            row["nodes_top"] = context.get("nodes_top") or []
+            rows.append(row)
+    return rows
+
+
+def _build_cross_stats(
+    metrics: dict[str, Any],
+    node_context: list[dict[str, Any]],
+    namespace_context: list[dict[str, Any]],
+    workloads: list[dict[str, Any]],
+) -> dict[str, Any]:
+    """Assemble the cross-referenced "top" statistics.
+
+    Returns a dict with four keys: ``node_metric_top`` and
+    ``namespace_metric_top`` (per-metric leaders joined with their context),
+    ``pvc_top`` (the ``_pvc_top`` result for ``metrics["pvc_usage_top"]``, cut
+    to ``_CROSS_PVC_TOP`` items), and ``workload_top`` (the
+    ``_workload_nodes_top`` result for ``workloads``).
+    """
+    node_rows = _cross_node_metric_top(metrics, node_context)
+    namespace_rows = _cross_namespace_metric_top(metrics, namespace_context)
+    pvc_rows = _pvc_top(metrics.get("pvc_usage_top", []))[:_CROSS_PVC_TOP]
+    # NOTE(review): the workload helper is given _CROSS_NAMESPACE_TOP,
+    # presumably as its limit; there is no workload-specific constant.
+    workload_rows = _workload_nodes_top(workloads, _CROSS_NAMESPACE_TOP)
+    return {
+        "node_metric_top": node_rows,
+        "namespace_metric_top": namespace_rows,
+        "pvc_top": pvc_rows,
+        "workload_top": workload_rows,
+    }
+
+
 def _node_context(
    node_details: list[dict[str, Any]],
    node_load: list[dict[str, Any]],
@ -3444,6 +3579,8 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
        "anomalies": anomalies,
        "health_bullets": _health_bullets(metrics, node_summary, workload_health, anomalies),
        "events": _events_summary(events),
+        "lexicon": _build_lexicon(),
+        "cross_stats": _build_cross_stats(metrics, node_context, namespace_context, workloads),
        "unknowns": errors,
    }