diff --git a/ariadne/services/cluster_state.py b/ariadne/services/cluster_state.py index a720f23..ba03392 100644 --- a/ariadne/services/cluster_state.py +++ b/ariadne/services/cluster_state.py @@ -1373,11 +1373,21 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]: f'topk(5, sum by (namespace,pod) (rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}])))' ) ) + metrics["pod_cpu_top_node"] = _filter_namespace_vector( + _vm_vector( + f'topk(5, sum by (node,namespace,pod) (rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}]) * on (namespace,pod) group_left(node) kube_pod_info))' + ) + ) metrics["pod_mem_top"] = _filter_namespace_vector( _vm_vector( "topk(5, sum by (namespace,pod) (container_memory_working_set_bytes{namespace!=\"\"}))" ) ) + metrics["pod_mem_top_node"] = _filter_namespace_vector( + _vm_vector( + "topk(5, sum by (node,namespace,pod) (container_memory_working_set_bytes{namespace!=\"\"} * on (namespace,pod) group_left(node) kube_pod_info))" + ) + ) metrics["job_failures_24h"] = _vm_vector( "topk(5, sum by (namespace,job_name) (increase(kube_job_status_failed[24h])))" ) @@ -1436,6 +1446,8 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]: "restarts": "count", "pod_cpu": "cores", "pod_mem": "bytes", + "pod_cpu_top_node": "cores", + "pod_mem_top_node": "bytes", "job_failures_24h": "count", "namespace_cpu": "cores", "namespace_mem": "bytes",