snapshot: add namespace usage stats

This commit is contained in:
Brad Stein 2026-01-28 20:41:17 -03:00
parent bbc36d57c4
commit 558f5c1270
2 changed files with 57 additions and 0 deletions

View File

@ -445,6 +445,21 @@ def _vm_vector(expr: str) -> list[dict[str, Any]]:
return output
def _filter_namespace_vector(entries: list[dict[str, Any]]) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for item in entries:
if not isinstance(item, dict):
continue
metric = item.get("metric") if isinstance(item.get("metric"), dict) else {}
namespace = metric.get("namespace")
if not isinstance(namespace, str) or not namespace:
continue
if namespace in _SYSTEM_NAMESPACES:
continue
output.append(item)
return output
def _vm_topk(expr: str, label_key: str) -> dict[str, Any] | None:
result = _vm_vector(expr)
if not result:
@ -538,6 +553,24 @@ def _node_usage(errors: list[str]) -> dict[str, Any]:
return usage
def _usage_stats(series: list[dict[str, Any]]) -> dict[str, float]:
values: list[float] = []
for entry in series:
if not isinstance(entry, dict):
continue
try:
values.append(float(entry.get("value")))
except (TypeError, ValueError):
continue
if not values:
return {}
return {
"min": min(values),
"max": max(values),
"avg": sum(values) / len(values),
}
def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
metrics: dict[str, Any] = {}
try:
@ -557,12 +590,33 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
metrics["postgres_connections"] = _postgres_connections(errors)
metrics["hottest_nodes"] = _hottest_nodes(errors)
metrics["node_usage"] = _node_usage(errors)
metrics["node_usage_stats"] = {
"cpu": _usage_stats(metrics.get("node_usage", {}).get("cpu", [])),
"ram": _usage_stats(metrics.get("node_usage", {}).get("ram", [])),
"net": _usage_stats(metrics.get("node_usage", {}).get("net", [])),
"io": _usage_stats(metrics.get("node_usage", {}).get("io", [])),
}
try:
metrics["namespace_cpu_top"] = _filter_namespace_vector(
_vm_vector(
f'topk(5, sum by (namespace) (rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}])))'
)
)
metrics["namespace_mem_top"] = _filter_namespace_vector(
_vm_vector(
"topk(5, sum by (namespace) (container_memory_working_set_bytes{namespace!=\"\"}))"
)
)
except Exception as exc:
errors.append(f"namespace_usage: {exc}")
metrics["units"] = {
"cpu": "percent",
"ram": "percent",
"net": "bytes_per_sec",
"io": "bytes_per_sec",
"restarts": "count",
"namespace_cpu": "cores",
"namespace_mem": "bytes",
}
metrics["windows"] = {
"rates": _RATE_WINDOW,

View File

@ -82,6 +82,9 @@ def test_collect_cluster_state(monkeypatch) -> None:
assert snapshot["workloads"]
assert snapshot["namespace_pods"]
assert snapshot["namespace_pods"][0]["namespace"] == "media"
assert "node_usage_stats" in snapshot["metrics"]
assert snapshot["metrics"]["namespace_cpu_top"] == []
assert snapshot["metrics"]["namespace_mem_top"] == []
assert summary.nodes_total == 2
assert summary.nodes_ready == 1
assert summary.pods_running == 5.0