From 6388ef5c6d797eae0f75331732ba0ec36f460b0e Mon Sep 17 00:00:00 2001
From: jenkins <jenkins@bstein.dev>
Date: Fri, 22 May 2026 03:08:27 -0300
Subject: [PATCH] monitoring(gpu): add pool utilization counters

---
 scripts/dashboards_render_atlas.py            |  88 +++++++++-
 scripts/tests/test_dashboards_render_atlas.py |   8 +
 services/monitoring/dashboards/atlas-gpu.json | 141 ++++++++++++++++
 .../monitoring/dashboards/atlas-overview.json | 155 ++++++++++++++++++
 .../monitoring/grafana-dashboard-gpu.yaml     | 141 ++++++++++++++++
 .../grafana-dashboard-overview.yaml           | 155 ++++++++++++++++++
 6 files changed, 687 insertions(+), 1 deletion(-)

diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index 2993ee32..744419ec 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -366,6 +366,18 @@ def gpu_capacity_percent():
     return f"(({process_capacity}) or ({legacy_capacity}) or on() vector(0))"
 
 
+def gpu_active_devices_expr():  # PromQL: how many monitored GPUs report utilization above zero.
+    per_device = "sum(nvidia_gpu_device_utilization_percent > bool 0)"  # `> bool` maps each series to 0/1
+    per_node = f"sum(({gpu_util_by_node()}) > bool 0) unless on() nvidia_gpu_device_utilization_percent"
+    return "(" + " or ".join((f"({per_device})", f"({per_node})", "on() vector(0)")) + ")"  # 0 if no data
+
+
+def gpu_total_devices_expr():  # PromQL: how many GPU utilization series are currently monitored.
+    per_device = "count(nvidia_gpu_device_utilization_percent)"
+    per_node = f"count({gpu_util_by_node()}) unless on() nvidia_gpu_device_utilization_percent"
+    return "(" + " or ".join((f"({per_device})", f"({per_node})", "on() vector(0)")) + ")"  # 0 if no data
+
+
 def unattributed_gpu_usage():
     return (
         'label_replace((sum('
@@ -375,8 +387,17 @@ def unattributed_gpu_usage():
     )
 
 
+def gpu_utilization_raw(scope_var):  # PromQL `or` of scoped per-namespace usage and the unattributed series.
+    return " or ".join(f"({part})" for part in (nvidia_process_gpu_usage_by_namespace(scope_var), unattributed_gpu_usage()))
+
+
+def gpu_pool_used_expr(scope_var):  # PromQL: summed raw GPU usage as a percentage of pool capacity.
+    summed_usage = "(sum(" + gpu_utilization_raw(scope_var) + ") or on() vector(0))"  # 0 when nothing reports
+    return "100 * " + summed_usage + " / clamp_min(" + gpu_capacity_percent() + ", 1)"  # clamp avoids divide-by-zero
+
+
 def namespace_gpu_share_expr(scope_var):
-    utilization_raw = f"({nvidia_process_gpu_usage_by_namespace(scope_var)}) or ({unattributed_gpu_usage()})"
+    utilization_raw = gpu_utilization_raw(scope_var)
     total_raw = f"(sum({utilization_raw}) or on() vector(0))"
     capacity = gpu_capacity_percent()
     utilization = f"100 * ({utilization_raw}) / clamp_min({capacity}, 1)"
@@ -1851,6 +1872,8 @@ OVERVIEW_PANEL_DESCRIPTIONS = {
     "Postgres Connections Used": "Current Postgres connections; lower leaves room for apps during spikes.",
     "Postgres Hottest Connections": "Database with the most active connections; high values identify the pressure source.",
     "Namespace CPU Share": "CPU share by namespace in the selected scope; big slices show who is using compute.",
+    "GPU Pool Used": "Current process-level GPU utilization across the monitored NVIDIA GPU pool.",
+    "GPU Active Devices": "Active NVIDIA GPUs compared with total monitored NVIDIA GPUs.",
     "Namespace GPU Utilization": "Instant NVIDIA process-level GPU utilization normalized to the monitored GPU pool. Host covers non-Kubernetes processes; unused fills remaining capacity; idle appears only at zero activity.",
     "Namespace RAM Share": "Memory share by namespace in the selected scope; big slices show who may drive pressure.",
     "Worker Node CPU": "Worker CPU over time; lower is calmer, sustained high load may need rescheduling.",
@@ -2851,6 +2874,39 @@ def build_overview():
     gpu_scope = "$namespace_scope_gpu"
     ram_scope = "$namespace_scope_ram"
 
+    panels.append(
+        stat_panel(
+            48,
+            "GPU Pool Used",
+            gpu_pool_used_expr(gpu_scope),
+            {"h": 2, "w": 4, "x": 8, "y": 21},
+            unit="percent",
+            decimals=1,
+            instant=True,
+            thresholds=PERCENT_THRESHOLDS,
+            links=overview_link("atlas-gpu"),
+            description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Pool Used"],
+        )
+    )
+    panels.append(
+        stat_panel(
+            49,
+            "GPU Active Devices",
+            "",
+            {"h": 2, "w": 4, "x": 12, "y": 21},
+            unit="none",
+            decimals=0,
+            text_mode="name_and_value",
+            instant=True,
+            targets=[
+                {"expr": gpu_active_devices_expr(), "refId": "A", "legendFormat": "active"},
+                {"expr": gpu_total_devices_expr(), "refId": "B", "legendFormat": "total"},
+            ],
+            links=overview_link("atlas-gpu"),
+            description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Active Devices"],
+        )
+    )
+
     panels.append(
         pie_panel(
             11,
@@ -5430,6 +5486,36 @@ def build_gpu_dashboard():
             description="DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value.",
         )
     )
+    panels.append(
+        stat_panel(
+            5,
+            "GPU Pool Used",
+            gpu_pool_used_expr(gpu_scope),
+            {"h": 3, "w": 6, "x": 0, "y": 16},
+            unit="percent",
+            decimals=1,
+            instant=True,
+            thresholds=PERCENT_THRESHOLDS,
+            description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Pool Used"],
+        )
+    )
+    panels.append(
+        stat_panel(
+            6,
+            "GPU Active Devices",
+            "",
+            {"h": 3, "w": 6, "x": 6, "y": 16},
+            unit="none",
+            decimals=0,
+            text_mode="name_and_value",
+            instant=True,
+            targets=[
+                {"expr": gpu_active_devices_expr(), "refId": "A", "legendFormat": "active"},
+                {"expr": gpu_total_devices_expr(), "refId": "B", "legendFormat": "total"},
+            ],
+            description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Active Devices"],
+        )
+    )
     return {
         "uid": "atlas-gpu",
         "title": "Atlas GPU",
diff --git a/scripts/tests/test_dashboards_render_atlas.py b/scripts/tests/test_dashboards_render_atlas.py
index bf7e7c1f..f2200423 100644
--- a/scripts/tests/test_dashboards_render_atlas.py
+++ b/scripts/tests/test_dashboards_render_atlas.py
@@ -166,6 +166,14 @@ def test_overview_uses_readable_quality_power_and_gitops_panels():
     assert 'namespace", "idle"' in gpu_expr
     assert panels_by_title["Namespace GPU Utilization"]["targets"][0]["instant"] is True
 
+    gpu_pool_expr = panels_by_title["GPU Pool Used"]["targets"][0]["expr"]
+    assert "nvidia_namespace_gpu_sm_util_percent" in gpu_pool_expr
+    assert "nvidia_gpu_device_utilization_percent" in gpu_pool_expr
+    assert panels_by_title["GPU Pool Used"]["targets"][0]["instant"] is True
+    active_targets = panels_by_title["GPU Active Devices"]["targets"]
+    assert any("nvidia_gpu_device_utilization_percent > bool 0" in target["expr"] for target in active_targets)
+    assert any("count(nvidia_gpu_device_utilization_percent)" in target["expr"] for target in active_targets)
+
 
 def test_overview_and_testing_panels_all_have_concise_descriptions():
     mod = load_module()
diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json
index a99ab173..d7169807 100644
--- a/services/monitoring/dashboards/atlas-gpu.json
+++ b/services/monitoring/dashboards/atlas-gpu.json
@@ -189,6 +189,147 @@
         }
       ],
       "description": "DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value."
+    },
+    {
+      "id": 5,
+      "type": "stat",
+      "title": "GPU Pool Used",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 3,
+        "w": 6,
+        "x": 0,
+        "y": 16
+      },
+      "targets": [
+        {
+          "expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)",
+          "refId": "A",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "dark-green",
+                "value": null
+              },
+              {
+                "color": "dark-yellow",
+                "value": 50
+              },
+              {
+                "color": "dark-orange",
+                "value": 75
+              },
+              {
+                "color": "dark-red",
+                "value": 91.5
+              }
+            ]
+          },
+          "unit": "percent",
+          "custom": {
+            "displayMode": "auto"
+          },
+          "decimals": 1
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      },
+      "description": "Current process-level GPU utilization across the monitored NVIDIA GPU pool."
+    },
+    {
+      "id": 6,
+      "type": "stat",
+      "title": "GPU Active Devices",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 3,
+        "w": 6,
+        "x": 6,
+        "y": 16
+      },
+      "targets": [
+        {
+          "expr": "((sum(nvidia_gpu_device_utilization_percent > bool 0)) or (sum((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) > bool 0) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
+          "refId": "A",
+          "legendFormat": "active",
+          "instant": true
+        },
+        {
+          "expr": "((count(nvidia_gpu_device_utilization_percent)) or (count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
+          "refId": "B",
+          "legendFormat": "total",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "dark-green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto"
+          },
+          "decimals": 0
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "name_and_value"
+      },
+      "description": "Active NVIDIA GPUs compared with total monitored NVIDIA GPUs."
     }
   ],
   "time": {
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index fe8527e6..cfc82950 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -3643,6 +3643,161 @@
       },
       "description": "Database with the most active connections; high values identify the pressure source."
     },
+    {
+      "id": 48,
+      "type": "stat",
+      "title": "GPU Pool Used",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 2,
+        "w": 4,
+        "x": 8,
+        "y": 21
+      },
+      "targets": [
+        {
+          "expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)",
+          "refId": "A",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "dark-green",
+                "value": null
+              },
+              {
+                "color": "dark-yellow",
+                "value": 50
+              },
+              {
+                "color": "dark-orange",
+                "value": 75
+              },
+              {
+                "color": "dark-red",
+                "value": 91.5
+              }
+            ]
+          },
+          "unit": "percent",
+          "custom": {
+            "displayMode": "auto"
+          },
+          "decimals": 1
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      },
+      "links": [
+        {
+          "title": "Open atlas-gpu dashboard",
+          "url": "/d/atlas-gpu",
+          "targetBlank": true
+        }
+      ],
+      "description": "Current process-level GPU utilization across the monitored NVIDIA GPU pool."
+    },
+    {
+      "id": 49,
+      "type": "stat",
+      "title": "GPU Active Devices",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 2,
+        "w": 4,
+        "x": 12,
+        "y": 21
+      },
+      "targets": [
+        {
+          "expr": "((sum(nvidia_gpu_device_utilization_percent > bool 0)) or (sum((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) > bool 0) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
+          "refId": "A",
+          "legendFormat": "active",
+          "instant": true
+        },
+        {
+          "expr": "((count(nvidia_gpu_device_utilization_percent)) or (count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
+          "refId": "B",
+          "legendFormat": "total",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "dark-green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto"
+          },
+          "decimals": 0
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "name_and_value"
+      },
+      "links": [
+        {
+          "title": "Open atlas-gpu dashboard",
+          "url": "/d/atlas-gpu",
+          "targetBlank": true
+        }
+      ],
+      "description": "Active NVIDIA GPUs compared with total monitored NVIDIA GPUs."
+    },
     {
       "id": 11,
       "type": "piechart",
diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml
index d01d40ae..64577ee2 100644
--- a/services/monitoring/grafana-dashboard-gpu.yaml
+++ b/services/monitoring/grafana-dashboard-gpu.yaml
@@ -198,6 +198,147 @@ data:
             }
           ],
           "description": "DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value."
+        },
+        {
+          "id": 5,
+          "type": "stat",
+          "title": "GPU Pool Used",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 3,
+            "w": 6,
+            "x": 0,
+            "y": 16
+          },
+          "targets": [
+            {
+              "expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)",
+              "refId": "A",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "thresholds"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "dark-green",
+                    "value": null
+                  },
+                  {
+                    "color": "dark-yellow",
+                    "value": 50
+                  },
+                  {
+                    "color": "dark-orange",
+                    "value": 75
+                  },
+                  {
+                    "color": "dark-red",
+                    "value": 91.5
+                  }
+                ]
+              },
+              "unit": "percent",
+              "custom": {
+                "displayMode": "auto"
+              },
+              "decimals": 1
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          },
+          "description": "Current process-level GPU utilization across the monitored NVIDIA GPU pool."
+        },
+        {
+          "id": 6,
+          "type": "stat",
+          "title": "GPU Active Devices",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 3,
+            "w": 6,
+            "x": 6,
+            "y": 16
+          },
+          "targets": [
+            {
+              "expr": "((sum(nvidia_gpu_device_utilization_percent > bool 0)) or (sum((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) > bool 0) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
+              "refId": "A",
+              "legendFormat": "active",
+              "instant": true
+            },
+            {
+              "expr": "((count(nvidia_gpu_device_utilization_percent)) or (count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
+              "refId": "B",
+              "legendFormat": "total",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "thresholds"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "dark-green",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto"
+              },
+              "decimals": 0
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "name_and_value"
+          },
+          "description": "Active NVIDIA GPUs compared with total monitored NVIDIA GPUs."
         }
       ],
       "time": {
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 87f65f87..23bf6927 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -3652,6 +3652,161 @@ data:
           },
           "description": "Database with the most active connections; high values identify the pressure source."
         },
+        {
+          "id": 48,
+          "type": "stat",
+          "title": "GPU Pool Used",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 2,
+            "w": 4,
+            "x": 8,
+            "y": 21
+          },
+          "targets": [
+            {
+              "expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)",
+              "refId": "A",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "thresholds"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "dark-green",
+                    "value": null
+                  },
+                  {
+                    "color": "dark-yellow",
+                    "value": 50
+                  },
+                  {
+                    "color": "dark-orange",
+                    "value": 75
+                  },
+                  {
+                    "color": "dark-red",
+                    "value": 91.5
+                  }
+                ]
+              },
+              "unit": "percent",
+              "custom": {
+                "displayMode": "auto"
+              },
+              "decimals": 1
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          },
+          "links": [
+            {
+              "title": "Open atlas-gpu dashboard",
+              "url": "/d/atlas-gpu",
+              "targetBlank": true
+            }
+          ],
+          "description": "Current process-level GPU utilization across the monitored NVIDIA GPU pool."
+        },
+        {
+          "id": 49,
+          "type": "stat",
+          "title": "GPU Active Devices",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 2,
+            "w": 4,
+            "x": 12,
+            "y": 21
+          },
+          "targets": [
+            {
+              "expr": "((sum(nvidia_gpu_device_utilization_percent > bool 0)) or (sum((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) > bool 0) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
+              "refId": "A",
+              "legendFormat": "active",
+              "instant": true
+            },
+            {
+              "expr": "((count(nvidia_gpu_device_utilization_percent)) or (count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
+              "refId": "B",
+              "legendFormat": "total",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "thresholds"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "dark-green",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto"
+              },
+              "decimals": 0
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "name_and_value"
+          },
+          "links": [
+            {
+              "title": "Open atlas-gpu dashboard",
+              "url": "/d/atlas-gpu",
+              "targetBlank": true
+            }
+          ],
+          "description": "Active NVIDIA GPUs compared with total monitored NVIDIA GPUs."
+        },
         {
           "id": 11,
           "type": "piechart",