From 6388ef5c6d797eae0f75331732ba0ec36f460b0e Mon Sep 17 00:00:00 2001 From: jenkins Date: Fri, 22 May 2026 03:08:27 -0300 Subject: [PATCH] monitoring(gpu): add pool utilization counters --- scripts/dashboards_render_atlas.py | 88 +++++++++- scripts/tests/test_dashboards_render_atlas.py | 8 + services/monitoring/dashboards/atlas-gpu.json | 141 ++++++++++++++++ .../monitoring/dashboards/atlas-overview.json | 155 ++++++++++++++++++ .../monitoring/grafana-dashboard-gpu.yaml | 141 ++++++++++++++++ .../grafana-dashboard-overview.yaml | 155 ++++++++++++++++++ 6 files changed, 687 insertions(+), 1 deletion(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 2993ee32..744419ec 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -366,6 +366,18 @@ def gpu_capacity_percent(): return f"(({process_capacity}) or ({legacy_capacity}) or on() vector(0))" +def gpu_active_devices_expr(): + process_active = "sum(nvidia_gpu_device_utilization_percent > bool 0)" + legacy_active = f"sum(({gpu_util_by_node()}) > bool 0) unless on() nvidia_gpu_device_utilization_percent" + return f"(({process_active}) or ({legacy_active}) or on() vector(0))" + + +def gpu_total_devices_expr(): + process_total = "count(nvidia_gpu_device_utilization_percent)" + legacy_total = f"count({gpu_util_by_node()}) unless on() nvidia_gpu_device_utilization_percent" + return f"(({process_total}) or ({legacy_total}) or on() vector(0))" + + def unattributed_gpu_usage(): return ( 'label_replace((sum(' @@ -375,8 +387,17 @@ def unattributed_gpu_usage(): ) +def gpu_utilization_raw(scope_var): + return f"({nvidia_process_gpu_usage_by_namespace(scope_var)}) or ({unattributed_gpu_usage()})" + + +def gpu_pool_used_expr(scope_var): + raw_total = f"(sum({gpu_utilization_raw(scope_var)}) or on() vector(0))" + return f"100 * {raw_total} / clamp_min({gpu_capacity_percent()}, 1)" + + def namespace_gpu_share_expr(scope_var): - utilization_raw = f"({nvidia_process_gpu_usage_by_namespace(scope_var)}) or ({unattributed_gpu_usage()})" + utilization_raw = gpu_utilization_raw(scope_var) total_raw = f"(sum({utilization_raw}) or on() vector(0))" capacity = gpu_capacity_percent() utilization = f"100 * ({utilization_raw}) / clamp_min({capacity}, 1)" @@ -1851,6 +1872,8 @@ OVERVIEW_PANEL_DESCRIPTIONS = { "Postgres Connections Used": "Current Postgres connections; lower leaves room for apps during spikes.", "Postgres Hottest Connections": "Database with the most active connections; high values identify the pressure source.", "Namespace CPU Share": "CPU share by namespace in the selected scope; big slices show who is using compute.", + "GPU Pool Used": "Current process-level GPU utilization across the monitored NVIDIA GPU pool.", + "GPU Active Devices": "Active NVIDIA GPUs compared with total monitored NVIDIA GPUs.", "Namespace GPU Utilization": "Instant NVIDIA process-level GPU utilization normalized to the monitored GPU pool. Host covers non-Kubernetes processes; unused fills remaining capacity; idle appears only at zero activity.", "Namespace RAM Share": "Memory share by namespace in the selected scope; big slices show who may drive pressure.", "Worker Node CPU": "Worker CPU over time; lower is calmer, sustained high load may need rescheduling.", @@ -2851,6 +2874,39 @@ def build_overview(): gpu_scope = "$namespace_scope_gpu" ram_scope = "$namespace_scope_ram" + panels.append( + stat_panel( + 48, + "GPU Pool Used", + gpu_pool_used_expr(gpu_scope), + {"h": 2, "w": 4, "x": 8, "y": 21}, + unit="percent", + decimals=1, + instant=True, + thresholds=PERCENT_THRESHOLDS, + links=overview_link("atlas-gpu"), + description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Pool Used"], + ) + ) + panels.append( + stat_panel( + 49, + "GPU Active Devices", + "", + {"h": 2, "w": 4, "x": 12, "y": 21}, + unit="none", + decimals=0, + text_mode="name_and_value", + instant=True, + targets=[ + {"expr": gpu_active_devices_expr(), "refId": "A", "legendFormat": "active"}, + {"expr": gpu_total_devices_expr(), "refId": "B", "legendFormat": "total"}, + ], + links=overview_link("atlas-gpu"), + description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Active Devices"], + ) + ) + panels.append( pie_panel( 11, @@ -5430,6 +5486,36 @@ def build_gpu_dashboard(): description="DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value.", ) ) + panels.append( + stat_panel( + 5, + "GPU Pool Used", + gpu_pool_used_expr(gpu_scope), + {"h": 3, "w": 6, "x": 0, "y": 16}, + unit="percent", + decimals=1, + instant=True, + thresholds=PERCENT_THRESHOLDS, + description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Pool Used"], + ) + ) + panels.append( + stat_panel( + 6, + "GPU Active Devices", + "", + {"h": 3, "w": 6, "x": 6, "y": 16}, + unit="none", + decimals=0, + text_mode="name_and_value", + instant=True, + targets=[ + {"expr": gpu_active_devices_expr(), "refId": "A", "legendFormat": "active"}, + {"expr": gpu_total_devices_expr(), "refId": "B", "legendFormat": "total"}, + ], + description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Active Devices"], + ) + ) return { "uid": "atlas-gpu", "title": "Atlas GPU", diff --git a/scripts/tests/test_dashboards_render_atlas.py b/scripts/tests/test_dashboards_render_atlas.py index bf7e7c1f..f2200423 100644 --- a/scripts/tests/test_dashboards_render_atlas.py +++ b/scripts/tests/test_dashboards_render_atlas.py @@ -166,6 +166,14 @@ def test_overview_uses_readable_quality_power_and_gitops_panels(): assert 'namespace", "idle"' in gpu_expr assert panels_by_title["Namespace GPU Utilization"]["targets"][0]["instant"] is True + gpu_pool_expr = panels_by_title["GPU Pool Used"]["targets"][0]["expr"] + assert "nvidia_namespace_gpu_sm_util_percent" in gpu_pool_expr + assert "nvidia_gpu_device_utilization_percent" in gpu_pool_expr + assert panels_by_title["GPU Pool Used"]["targets"][0]["instant"] is True + active_targets = panels_by_title["GPU Active Devices"]["targets"] + assert any("nvidia_gpu_device_utilization_percent > bool 0" in target["expr"] for target in active_targets) + assert any("count(nvidia_gpu_device_utilization_percent)" in target["expr"] for target in active_targets) + def test_overview_and_testing_panels_all_have_concise_descriptions(): mod = load_module() diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index a99ab173..d7169807 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -189,6 +189,147 @@ } ], "description": "DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value." + }, + { + "id": 5, + "type": "stat", + "title": "GPU Pool Used", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 0, + "y": 16 + }, + "targets": [ + { + "expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-green", + "value": null + }, + { + "color": "dark-yellow", + "value": 50 + }, + { + "color": "dark-orange", + "value": 75 + }, + { + "color": "dark-red", + "value": 91.5 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "description": "Current process-level GPU utilization across the monitored NVIDIA GPU pool." + }, + { + "id": 6, + "type": "stat", + "title": "GPU Active Devices", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 6, + "y": 16 + }, + "targets": [ + { + "expr": "((sum(nvidia_gpu_device_utilization_percent > bool 0)) or (sum((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) > bool 0) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))", + "refId": "A", + "legendFormat": "active", + "instant": true + }, + { + "expr": "((count(nvidia_gpu_device_utilization_percent)) or (count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))", + "refId": "B", + "legendFormat": "total", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "dark-green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "name_and_value" + }, + "description": "Active NVIDIA GPUs compared with total monitored NVIDIA GPUs." } ], "time": { diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index fe8527e6..cfc82950 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -3643,6 +3643,161 @@ }, "description": "Database with the most active connections; high values identify the pressure source." }, + { + "id": 48, + "type": "stat", + "title": "GPU Pool Used", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 8, + "y": 21 + }, + "targets": [ + { + "expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-green", + "value": null + }, + { + "color": "dark-yellow", + "value": 50 + }, + { + "color": "dark-orange", + "value": 75 + }, + { + "color": "dark-red", + "value": 91.5 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-gpu dashboard", + "url": "/d/atlas-gpu", + "targetBlank": true + } + ], + "description": "Current process-level GPU utilization across the monitored NVIDIA GPU pool." + }, + { + "id": 49, + "type": "stat", + "title": "GPU Active Devices", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 12, + "y": 21 + }, + "targets": [ + { + "expr": "((sum(nvidia_gpu_device_utilization_percent > bool 0)) or (sum((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) > bool 0) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))", + "refId": "A", + "legendFormat": "active", + "instant": true + }, + { + "expr": "((count(nvidia_gpu_device_utilization_percent)) or (count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))", + "refId": "B", + "legendFormat": "total", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "dark-green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "name_and_value" + }, + "links": [ + { + "title": "Open atlas-gpu dashboard", + "url": "/d/atlas-gpu", + "targetBlank": true + } + ], + "description": "Active NVIDIA GPUs compared with total monitored NVIDIA GPUs." + }, { "id": 11, "type": "piechart", diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index d01d40ae..64577ee2 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -198,6 +198,147 @@ data: } ], "description": "DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value." + }, + { + "id": 5, + "type": "stat", + "title": "GPU Pool Used", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 0, + "y": 16 + }, + "targets": [ + { + "expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-green", + "value": null + }, + { + "color": "dark-yellow", + "value": 50 + }, + { + "color": "dark-orange", + "value": 75 + }, + { + "color": "dark-red", + "value": 91.5 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "description": "Current process-level GPU utilization across the monitored NVIDIA GPU pool." + }, + { + "id": 6, + "type": "stat", + "title": "GPU Active Devices", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 6, + "y": 16 + }, + "targets": [ + { + "expr": "((sum(nvidia_gpu_device_utilization_percent > bool 0)) or (sum((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) > bool 0) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))", + "refId": "A", + "legendFormat": "active", + "instant": true + }, + { + "expr": "((count(nvidia_gpu_device_utilization_percent)) or (count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))", + "refId": "B", + "legendFormat": "total", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "dark-green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "name_and_value" + }, + "description": "Active NVIDIA GPUs compared with total monitored NVIDIA GPUs." } ], "time": { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 87f65f87..23bf6927 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -3652,6 +3652,161 @@ data: }, "description": "Database with the most active connections; high values identify the pressure source." }, + { + "id": 48, + "type": "stat", + "title": "GPU Pool Used", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 8, + "y": 21 + }, + "targets": [ + { + "expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-green", + "value": null + }, + { + "color": "dark-yellow", + "value": 50 + }, + { + "color": "dark-orange", + "value": 75 + }, + { + "color": "dark-red", + "value": 91.5 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-gpu dashboard", + "url": "/d/atlas-gpu", + "targetBlank": true + } + ], + "description": "Current process-level GPU utilization across the monitored NVIDIA GPU pool." + }, + { + "id": 49, + "type": "stat", + "title": "GPU Active Devices", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 12, + "y": 21 + }, + "targets": [ + { + "expr": "((sum(nvidia_gpu_device_utilization_percent > bool 0)) or (sum((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) > bool 0) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))", + "refId": "A", + "legendFormat": "active", + "instant": true + }, + { + "expr": "((count(nvidia_gpu_device_utilization_percent)) or (count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))", + "refId": "B", + "legendFormat": "total", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "dark-green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "name_and_value" + }, + "links": [ + { + "title": "Open atlas-gpu dashboard", + "url": "/d/atlas-gpu", + "targetBlank": true + } + ], + "description": "Active NVIDIA GPUs compared with total monitored NVIDIA GPUs." + }, { "id": 11, "type": "piechart",