diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index fbed073..0916969 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -178,7 +178,7 @@ def namespace_ram_share_expr(): def namespace_gpu_share_expr(): - selected = f"(( {NAMESPACE_GPU_RAW} ) or on(namespace) ( {NAMESPACE_COMBINED_FILTER} * 0)) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" + selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)" return f"100 * ( {selected} ) / {total}" @@ -223,8 +223,9 @@ NAMESPACE_RAM_RAW = ( 'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)' ) NAMESPACE_GPU_RAW = ( - 'sum(kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"} ' - 'or kube_pod_resource_request{namespace!="",resource="nvidia.com/gpu"}) by (namespace)' + 'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}' + ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})' + ') by (namespace)' ) NAMESPACE_GPU_WEIGHT = ( "(( " diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index eb3f11d..f6d42c1 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -722,7 +722,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -764,7 +764,7 @@ }, "targets": [ { - "expr": "100 * ( (( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) * 0)) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -806,7 +806,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index e7ddd48..bf7b710 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -731,7 +731,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -773,7 +773,7 @@ data: }, "targets": [ { - "expr": "100 * ( (( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) * 0)) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -815,7 +815,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" }