diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 1248984..1c4aef2 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -167,17 +167,20 @@ def node_io_expr(scope=""): def namespace_cpu_share_expr(): selected = f"( {NAMESPACE_CPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" - return f"100 * ( {selected} ) / sum( {NAMESPACE_CPU_RAW} )" + total = f"clamp_min(sum( {NAMESPACE_CPU_RAW} ), 1)" + return f"100 * ( {selected} ) / {total}" def namespace_ram_share_expr(): selected = f"( {NAMESPACE_RAM_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" - return f"100 * ( {selected} ) / sum( {NAMESPACE_RAM_RAW} )" + total = f"clamp_min(sum( {NAMESPACE_RAM_RAW} ), 1)" + return f"100 * ( {selected} ) / {total}" def namespace_gpu_share_expr(): selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" - return f"100 * ( {selected} ) / sum( {NAMESPACE_GPU_RAW} )" + total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)" + return f"100 * ( {selected} ) / {total}" PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))' @@ -222,12 +225,17 @@ NAMESPACE_RAM_RAW = ( NAMESPACE_GPU_RAW = ( 'sum(rate(container_gpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)' ) +NAMESPACE_GPU_RAW = ( + 'sum(kube_pod_container_resource_requests{resource="nvidia.com/gpu",namespace!=""}) by (namespace)' +) NAMESPACE_COMBINED_FILTER = ( 'topk(10, (' + NAMESPACE_CPU_RAW + ") + (" + NAMESPACE_RAM_RAW - + ' / 1e9))' + + ' / 1e9) + (' + + NAMESPACE_GPU_RAW + + ' * 10))' ) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" NET_INGRESS_EXPR = ( @@ -518,24 +526,24 @@ def build_overview(): panels.append( pie_panel( 11, - "Namespace CPU share", - namespace_cpu_share_expr(), + "Namespace GPU share", + namespace_gpu_share_expr(), {"h": 9, "w": 8, "x": 0, "y": 10}, ) ) panels.append( pie_panel( 12, - "Namespace RAM share", - namespace_ram_share_expr(), + "Namespace CPU share", + namespace_cpu_share_expr(), {"h": 9, "w": 8, "x": 8, "y": 10}, ) ) panels.append( pie_panel( 13, - "Namespace GPU share", - namespace_gpu_share_expr(), + "Namespace RAM share", + namespace_ram_share_expr(), {"h": 9, "w": 8, "x": 16, "y": 10}, ) ) diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 47aa5da..f833b89 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -709,7 +709,7 @@ { "id": 11, "type": "piechart", - "title": "Namespace CPU share", + "title": "Namespace GPU share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -722,7 +722,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )", + "expr": "100 * ( ( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -751,7 +751,7 @@ { "id": 12, "type": "piechart", - "title": "Namespace RAM share", + "title": "Namespace CPU share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -764,7 +764,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -793,7 +793,7 @@ { "id": 13, "type": "piechart", - "title": "Namespace GPU share", + "title": "Namespace RAM share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -806,7 +806,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 96136d7..fb4e13a 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -718,7 +718,7 @@ data: { "id": 11, "type": "piechart", - "title": "Namespace CPU share", + "title": "Namespace GPU share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -731,7 +731,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )", + "expr": "100 * ( ( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -760,7 +760,7 @@ data: { "id": 12, "type": "piechart", - "title": "Namespace RAM share", + "title": "Namespace CPU share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -773,7 +773,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -802,7 +802,7 @@ data: { "id": 13, "type": "piechart", - "title": "Namespace GPU share", + "title": "Namespace RAM share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -815,7 +815,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" }