atlasbot: add pod resource top metrics

This commit is contained in:
Brad Stein 2026-01-29 02:57:59 -03:00
parent 2370aa4e5d
commit 281118b810
2 changed files with 14 additions and 0 deletions

View File

@@ -1073,6 +1073,16 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
metrics["top_restarts_1h"] = _vm_vector(
f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{_RESTARTS_WINDOW}])))"
)
metrics["pod_cpu_top"] = _filter_namespace_vector(
_vm_vector(
f'topk(5, sum by (namespace,pod) (rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}])))'
)
)
metrics["pod_mem_top"] = _filter_namespace_vector(
_vm_vector(
"topk(5, sum by (namespace,pod) (container_memory_working_set_bytes{namespace!=\"\"}))"
)
)
except Exception as exc:
errors.append(f"vm: {exc}")
metrics["postgres_connections"] = _postgres_connections(errors)
@@ -1106,6 +1116,8 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
"io": "bytes_per_sec",
"disk": "percent",
"restarts": "count",
"pod_cpu": "cores",
"pod_mem": "bytes",
"namespace_cpu": "cores",
"namespace_mem": "bytes",
"pvc_used_percent": "percent",

View File

@@ -124,6 +124,8 @@ def test_collect_cluster_state(monkeypatch) -> None:
assert "node_usage_stats" in snapshot["metrics"]
assert snapshot["metrics"]["namespace_cpu_top"] == []
assert snapshot["metrics"]["namespace_mem_top"] == []
assert snapshot["metrics"]["pod_cpu_top"] == []
assert snapshot["metrics"]["pod_mem_top"] == []
assert snapshot["metrics"]["pvc_usage_top"] == []
assert summary.nodes_total == 2
assert summary.nodes_ready == 1