From 281118b810c6ffe0ea3b1b7209dbe229070e098c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 29 Jan 2026 02:57:59 -0300 Subject: [PATCH] atlasbot: add pod resource top metrics --- ariadne/services/cluster_state.py | 12 ++++++++++++ tests/test_cluster_state.py | 2 ++ 2 files changed, 14 insertions(+) diff --git a/ariadne/services/cluster_state.py b/ariadne/services/cluster_state.py index 4223cce..9f9c0ce 100644 --- a/ariadne/services/cluster_state.py +++ b/ariadne/services/cluster_state.py @@ -1073,6 +1073,16 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]: metrics["top_restarts_1h"] = _vm_vector( f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{_RESTARTS_WINDOW}])))" ) + metrics["pod_cpu_top"] = _filter_namespace_vector( + _vm_vector( + f'topk(5, sum by (namespace,pod) (rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}])))' + ) + ) + metrics["pod_mem_top"] = _filter_namespace_vector( + _vm_vector( + "topk(5, sum by (namespace,pod) (container_memory_working_set_bytes{namespace!=\"\"}))" + ) + ) except Exception as exc: errors.append(f"vm: {exc}") metrics["postgres_connections"] = _postgres_connections(errors) @@ -1106,6 +1116,8 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]: "io": "bytes_per_sec", "disk": "percent", "restarts": "count", + "pod_cpu": "cores", + "pod_mem": "bytes", "namespace_cpu": "cores", "namespace_mem": "bytes", "pvc_used_percent": "percent", diff --git a/tests/test_cluster_state.py b/tests/test_cluster_state.py index b0e0780..42f5a4a 100644 --- a/tests/test_cluster_state.py +++ b/tests/test_cluster_state.py @@ -124,6 +124,8 @@ def test_collect_cluster_state(monkeypatch) -> None: assert "node_usage_stats" in snapshot["metrics"] assert snapshot["metrics"]["namespace_cpu_top"] == [] assert snapshot["metrics"]["namespace_mem_top"] == [] + assert snapshot["metrics"]["pod_cpu_top"] == [] + assert snapshot["metrics"]["pod_mem_top"] == [] assert snapshot["metrics"]["pvc_usage_top"] == [] assert summary.nodes_total == 2 assert summary.nodes_ready == 1