diff --git a/ariadne/services/cluster_state.py b/ariadne/services/cluster_state.py
index 8f75cc4..127aaa9 100644
--- a/ariadne/services/cluster_state.py
+++ b/ariadne/services/cluster_state.py
@@ -445,6 +445,30 @@ def _vm_vector(expr: str) -> list[dict[str, Any]]:
     return output
 
 
+def _filter_namespace_vector(entries: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Keep only vector entries labelled with a non-system namespace.
+
+    Entries that are not dicts, whose ``metric`` is not a dict, or whose
+    ``namespace`` label is missing, empty, or not a string are skipped, as
+    are entries whose namespace is listed in ``_SYSTEM_NAMESPACES``. The
+    order of the remaining entries is preserved.
+    """
+    output: list[dict[str, Any]] = []
+    for item in entries:
+        if not isinstance(item, dict):
+            continue
+        metric = item.get("metric")
+        if not isinstance(metric, dict):
+            continue
+        namespace = metric.get("namespace")
+        if not isinstance(namespace, str) or not namespace:
+            continue
+        if namespace in _SYSTEM_NAMESPACES:
+            continue
+        output.append(item)
+    return output
+
+
 def _vm_topk(expr: str, label_key: str) -> dict[str, Any] | None:
     result = _vm_vector(expr)
     if not result:
@@ -538,6 +562,35 @@ def _node_usage(errors: list[str]) -> dict[str, Any]:
     return usage
 
 
+def _usage_stats(series: list[dict[str, Any]]) -> dict[str, float]:
+    """Return the min, max, and mean of the ``value`` fields in *series*.
+
+    Entries that are not dicts, or whose ``value`` cannot be converted to a
+    finite float, are ignored. An empty dict is returned when no usable
+    value remains.
+    """
+    import math  # function-scope import keeps this helper self-contained
+
+    values: list[float] = []
+    for entry in series:
+        if not isinstance(entry, dict):
+            continue
+        try:
+            value = float(entry.get("value"))
+        except (TypeError, ValueError):
+            continue
+        # NaN/inf samples would make min/max/avg meaningless, so drop them.
+        if math.isfinite(value):
+            values.append(value)
+    if not values:
+        return {}
+    return {
+        "min": min(values),
+        "max": max(values),
+        "avg": sum(values) / len(values),
+    }
+
+
 def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
     metrics: dict[str, Any] = {}
     try:
@@ -557,12 +610,37 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
     metrics["postgres_connections"] = _postgres_connections(errors)
     metrics["hottest_nodes"] = _hottest_nodes(errors)
     metrics["node_usage"] = _node_usage(errors)
+    node_usage_series = metrics.get("node_usage") or {}
+    metrics["node_usage_stats"] = {
+        usage_key: _usage_stats(node_usage_series.get(usage_key) or [])
+        for usage_key in ("cpu", "ram", "net", "io")
+    }
+    # NOTE(review): topk(5) is evaluated by the query before system namespaces
+    # are filtered out here, so fewer than five entries may remain.
+    namespace_queries = {
+        "namespace_cpu_top": (
+            f'topk(5, sum by (namespace) (rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}])))'
+        ),
+        "namespace_mem_top": (
+            "topk(5, sum by (namespace) (container_memory_working_set_bytes{namespace!=\"\"}))"
+        ),
+    }
+    for ns_key, ns_expr in namespace_queries.items():
+        # Run each query independently and always set the key, so one failing
+        # query neither discards the other result nor leaves a key missing.
+        try:
+            metrics[ns_key] = _filter_namespace_vector(_vm_vector(ns_expr))
+        except Exception as exc:
+            metrics[ns_key] = []
+            errors.append(f"namespace_usage: {exc}")
     metrics["units"] = {
         "cpu": "percent",
         "ram": "percent",
         "net": "bytes_per_sec",
         "io": "bytes_per_sec",
         "restarts": "count",
+        "namespace_cpu": "cores",
+        "namespace_mem": "bytes",
     }
     metrics["windows"] = {
         "rates": _RATE_WINDOW,
diff --git a/tests/test_cluster_state.py b/tests/test_cluster_state.py
index 86bcc18..4877a61 100644
--- a/tests/test_cluster_state.py
+++ b/tests/test_cluster_state.py
@@ -82,6 +82,9 @@ def test_collect_cluster_state(monkeypatch) -> None:
     assert snapshot["workloads"]
     assert snapshot["namespace_pods"]
     assert snapshot["namespace_pods"][0]["namespace"] == "media"
+    assert "node_usage_stats" in snapshot["metrics"]
+    assert snapshot["metrics"]["namespace_cpu_top"] == []
+    assert snapshot["metrics"]["namespace_mem_top"] == []
     assert summary.nodes_total == 2
     assert summary.nodes_ready == 1
     assert summary.pods_running == 5.0