snapshot: add namespace usage stats
This commit is contained in:
parent
bbc36d57c4
commit
558f5c1270
@ -445,6 +445,21 @@ def _vm_vector(expr: str) -> list[dict[str, Any]]:
|
||||
return output
|
||||
|
||||
|
||||
def _filter_namespace_vector(entries: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
||||
output: list[dict[str, Any]] = []
|
||||
for item in entries:
|
||||
if not isinstance(item, dict):
|
||||
continue
|
||||
metric = item.get("metric") if isinstance(item.get("metric"), dict) else {}
|
||||
namespace = metric.get("namespace")
|
||||
if not isinstance(namespace, str) or not namespace:
|
||||
continue
|
||||
if namespace in _SYSTEM_NAMESPACES:
|
||||
continue
|
||||
output.append(item)
|
||||
return output
|
||||
|
||||
|
||||
def _vm_topk(expr: str, label_key: str) -> dict[str, Any] | None:
|
||||
result = _vm_vector(expr)
|
||||
if not result:
|
||||
@ -538,6 +553,24 @@ def _node_usage(errors: list[str]) -> dict[str, Any]:
|
||||
return usage
|
||||
|
||||
|
||||
def _usage_stats(series: list[dict[str, Any]]) -> dict[str, float]:
|
||||
values: list[float] = []
|
||||
for entry in series:
|
||||
if not isinstance(entry, dict):
|
||||
continue
|
||||
try:
|
||||
values.append(float(entry.get("value")))
|
||||
except (TypeError, ValueError):
|
||||
continue
|
||||
if not values:
|
||||
return {}
|
||||
return {
|
||||
"min": min(values),
|
||||
"max": max(values),
|
||||
"avg": sum(values) / len(values),
|
||||
}
|
||||
|
||||
|
||||
def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
|
||||
metrics: dict[str, Any] = {}
|
||||
try:
|
||||
@ -557,12 +590,33 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
|
||||
metrics["postgres_connections"] = _postgres_connections(errors)
|
||||
metrics["hottest_nodes"] = _hottest_nodes(errors)
|
||||
metrics["node_usage"] = _node_usage(errors)
|
||||
metrics["node_usage_stats"] = {
|
||||
"cpu": _usage_stats(metrics.get("node_usage", {}).get("cpu", [])),
|
||||
"ram": _usage_stats(metrics.get("node_usage", {}).get("ram", [])),
|
||||
"net": _usage_stats(metrics.get("node_usage", {}).get("net", [])),
|
||||
"io": _usage_stats(metrics.get("node_usage", {}).get("io", [])),
|
||||
}
|
||||
try:
|
||||
metrics["namespace_cpu_top"] = _filter_namespace_vector(
|
||||
_vm_vector(
|
||||
f'topk(5, sum by (namespace) (rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}])))'
|
||||
)
|
||||
)
|
||||
metrics["namespace_mem_top"] = _filter_namespace_vector(
|
||||
_vm_vector(
|
||||
"topk(5, sum by (namespace) (container_memory_working_set_bytes{namespace!=\"\"}))"
|
||||
)
|
||||
)
|
||||
except Exception as exc:
|
||||
errors.append(f"namespace_usage: {exc}")
|
||||
metrics["units"] = {
|
||||
"cpu": "percent",
|
||||
"ram": "percent",
|
||||
"net": "bytes_per_sec",
|
||||
"io": "bytes_per_sec",
|
||||
"restarts": "count",
|
||||
"namespace_cpu": "cores",
|
||||
"namespace_mem": "bytes",
|
||||
}
|
||||
metrics["windows"] = {
|
||||
"rates": _RATE_WINDOW,
|
||||
|
||||
@ -82,6 +82,9 @@ def test_collect_cluster_state(monkeypatch) -> None:
|
||||
assert snapshot["workloads"]
|
||||
assert snapshot["namespace_pods"]
|
||||
assert snapshot["namespace_pods"][0]["namespace"] == "media"
|
||||
assert "node_usage_stats" in snapshot["metrics"]
|
||||
assert snapshot["metrics"]["namespace_cpu_top"] == []
|
||||
assert snapshot["metrics"]["namespace_mem_top"] == []
|
||||
assert summary.nodes_total == 2
|
||||
assert summary.nodes_ready == 1
|
||||
assert summary.pods_running == 5.0
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user