From 0708522b280fb3d0978f75458e979360f353f740 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 23:12:16 -0300 Subject: [PATCH] monitoring: add namespace gpu share --- scripts/render_dashboards.py | 46 ++++++++---- .../monitoring/dashboards/atlas-overview.json | 72 +++++++++++++++---- .../grafana-dashboard-overview.yaml | 72 +++++++++++++++---- 3 files changed, 145 insertions(+), 45 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 4e8e5a5..1248984 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -175,6 +175,11 @@ def namespace_ram_share_expr(): return f"100 * ( {selected} ) / sum( {NAMESPACE_RAM_RAW} )" +def namespace_gpu_share_expr(): + selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" + return f"100 * ( {selected} ) / sum( {NAMESPACE_GPU_RAW} )" + + PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))' CRASHLOOP_EXPR = ( 'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason' @@ -214,6 +219,9 @@ NAMESPACE_CPU_RAW = ( NAMESPACE_RAM_RAW = ( 'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)' ) +NAMESPACE_GPU_RAW = ( + 'sum(rate(container_gpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)' +) NAMESPACE_COMBINED_FILTER = ( 'topk(10, (' + NAMESPACE_CPU_RAW @@ -512,7 +520,7 @@ def build_overview(): 11, "Namespace CPU share", namespace_cpu_share_expr(), - {"h": 9, "w": 12, "x": 0, "y": 10}, + {"h": 9, "w": 8, "x": 0, "y": 10}, ) ) panels.append( @@ -520,14 +528,22 @@ def build_overview(): 12, "Namespace RAM share", namespace_ram_share_expr(), - {"h": 9, "w": 12, "x": 12, "y": 10}, + {"h": 9, "w": 8, "x": 8, "y": 10}, + ) + ) + panels.append( + pie_panel( + 13, + "Namespace GPU share", + namespace_gpu_share_expr(), + {"h": 9, "w": 8, "x": 16, "y": 10}, ) ) worker_filter = f"{WORKER_REGEX}" panels.append( timeseries_panel( - 13, + 14, "Worker node CPU", node_cpu_expr(worker_filter), {"h": 8, "w": 12, "x": 0, "y": 19}, @@ -541,7 +557,7 @@ def build_overview(): ) panels.append( timeseries_panel( - 14, + 15, "Worker node RAM", node_mem_expr(worker_filter), {"h": 8, "w": 12, "x": 12, "y": 19}, @@ -556,7 +572,7 @@ def build_overview(): panels.append( timeseries_panel( - 15, + 16, "Control plane CPU", node_cpu_expr(CONTROL_REGEX), {"h": 7, "w": 12, "x": 0, "y": 27}, @@ -568,7 +584,7 @@ def build_overview(): ) panels.append( timeseries_panel( - 16, + 17, "Control plane RAM", node_mem_expr(CONTROL_REGEX), {"h": 7, "w": 12, "x": 12, "y": 27}, @@ -581,7 +597,7 @@ def build_overview(): panels.append( timeseries_panel( - 17, + 18, "Cluster ingress throughput", NET_INGRESS_EXPR, {"h": 7, "w": 12, "x": 0, "y": 34}, @@ -593,7 +609,7 @@ def build_overview(): ) panels.append( timeseries_panel( - 18, + 19, "Cluster egress throughput", NET_EGRESS_EXPR, {"h": 7, "w": 12, "x": 12, "y": 34}, @@ -606,7 +622,7 @@ def build_overview(): panels.append( timeseries_panel( - 19, + 20, "Root filesystem usage", root_usage_expr(), {"h": 8, "w": 12, "x": 0, "y": 41}, @@ -621,7 +637,7 @@ def build_overview(): ) panels.append( { - "id": 20, + "id": 21, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": PROM_DS, @@ -655,10 +671,10 @@ def build_overview(): ) storage_panels = [ - (21, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"), - (22, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"), - (23, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"), - (24, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"), + (22, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"), + (23, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"), + (24, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"), + (25, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"), ] for idx, (panel_id, title, expr, unit) in enumerate(storage_panels): panels.append( @@ -675,7 +691,7 @@ def build_overview(): panels.append( text_panel( - 25, + 26, "About this dashboard", textwrap.dedent( """\ diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 55c1909..47aa5da 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -716,7 +716,7 @@ }, "gridPos": { "h": 9, - "w": 12, + "w": 8, "x": 0, "y": 10 }, @@ -758,8 +758,8 @@ }, "gridPos": { "h": 9, - "w": 12, - "x": 12, + "w": 8, + "x": 8, "y": 10 }, "targets": [ @@ -792,6 +792,48 @@ }, { "id": 13, + "type": "piechart", + "title": "Namespace GPU share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 10 + }, + "targets": [ + { + "expr": "100 * ( ( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 14, "type": "timeseries", "title": "Worker node CPU", "datasource": { @@ -838,7 +880,7 @@ ] }, { - "id": 14, + "id": 15, "type": "timeseries", "title": "Worker node RAM", "datasource": { @@ -885,7 +927,7 @@ ] }, { - "id": 15, + "id": 16, "type": "timeseries", "title": "Control plane CPU", "datasource": { @@ -922,7 +964,7 @@ } }, { - "id": 16, + "id": 17, "type": "timeseries", "title": "Control plane RAM", "datasource": { @@ -959,7 +1001,7 @@ } }, { - "id": 17, + "id": 18, "type": "timeseries", "title": "Cluster ingress throughput", "datasource": { @@ -1002,7 +1044,7 @@ ] }, { - "id": 18, + "id": 19, "type": "timeseries", "title": "Cluster egress throughput", "datasource": { @@ -1045,7 +1087,7 @@ ] }, { - "id": 19, + "id": 20, "type": "timeseries", "title": "Root filesystem usage", "datasource": { @@ -1093,7 +1135,7 @@ ] }, { - "id": 20, + "id": 21, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": { @@ -1162,7 +1204,7 @@ ] }, { - "id": 21, + "id": 22, "type": "stat", "title": "Astreae usage", "datasource": { @@ -1233,7 +1275,7 @@ ] }, { - "id": 22, + "id": 23, "type": "stat", "title": "Asteria usage", "datasource": { @@ -1304,7 +1346,7 @@ ] }, { - "id": 23, + "id": 24, "type": "stat", "title": "Astreae free", "datasource": { @@ -1371,7 +1413,7 @@ ] }, { - "id": 24, + "id": 25, "type": "stat", "title": "Asteria free", "datasource": { @@ -1438,7 +1480,7 @@ ] }, { - "id": 25, + "id": 26, "type": "text", "title": "About this dashboard", "gridPos": { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index deeeacc..96136d7 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -725,7 +725,7 @@ data: }, "gridPos": { "h": 9, - "w": 12, + "w": 8, "x": 0, "y": 10 }, @@ -767,8 +767,8 @@ data: }, "gridPos": { "h": 9, - "w": 12, - "x": 12, + "w": 8, + "x": 8, "y": 10 }, "targets": [ @@ -801,6 +801,48 @@ data: }, { "id": 13, + "type": "piechart", + "title": "Namespace GPU share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 10 + }, + "targets": [ + { + "expr": "100 * ( ( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 14, "type": "timeseries", "title": "Worker node CPU", "datasource": { @@ -847,7 +889,7 @@ data: ] }, { - "id": 14, + "id": 15, "type": "timeseries", "title": "Worker node RAM", "datasource": { @@ -894,7 +936,7 @@ data: ] }, { - "id": 15, + "id": 16, "type": "timeseries", "title": "Control plane CPU", "datasource": { @@ -931,7 +973,7 @@ data: } }, { - "id": 16, + "id": 17, "type": "timeseries", "title": "Control plane RAM", "datasource": { @@ -968,7 +1010,7 @@ data: } }, { - "id": 17, + "id": 18, "type": "timeseries", "title": "Cluster ingress throughput", "datasource": { @@ -1011,7 +1053,7 @@ data: ] }, { - "id": 18, + "id": 19, "type": "timeseries", "title": "Cluster egress throughput", "datasource": { @@ -1054,7 +1096,7 @@ data: ] }, { - "id": 19, + "id": 20, "type": "timeseries", "title": "Root filesystem usage", "datasource": { @@ -1102,7 +1144,7 @@ data: ] }, { - "id": 20, + "id": 21, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": { @@ -1171,7 +1213,7 @@ data: ] }, { - "id": 21, + "id": 22, "type": "stat", "title": "Astreae usage", "datasource": { @@ -1242,7 +1284,7 @@ data: ] }, { - "id": 22, + "id": 23, "type": "stat", "title": "Asteria usage", "datasource": { @@ -1313,7 +1355,7 @@ data: ] }, { - "id": 23, + "id": 24, "type": "stat", "title": "Astreae free", "datasource": { @@ -1380,7 +1422,7 @@ data: ] }, { - "id": 24, + "id": 25, "type": "stat", "title": "Asteria free", "datasource": { @@ -1447,7 +1489,7 @@ data: ] }, { - "id": 25, + "id": 26, "type": "text", "title": "About this dashboard", "gridPos": {