monitoring(gpu): show activity share by namespace

2026-05-22 04:21:46 -03:00 · 2026-05-22 04:21:46 -03:00 · 5e27384ea2
commit 5e27384ea2
parent ec972a52f1
5 changed files with 14 additions and 679 deletions
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@ -395,22 +395,14 @@ def gpu_pool_used_expr(scope_var):


 def namespace_gpu_share_expr(scope_var):
-    utilization_raw = gpu_utilization_raw(scope_var)
-    total_raw = f"(sum({utilization_raw}) or on() vector(0))"
-    capacity = gpu_capacity_percent()
-    utilization = f"100 * ({utilization_raw}) / clamp_min({capacity}, 1)"
-    total = f"(sum({utilization}) or on() vector(0))"
-    unused = (
-        'label_replace(clamp_min('
-        f"100 - {total}"
-        ', 0), "namespace", "unused", "", "") '
-        f"and on() ({total_raw} > 0)"
-    )
+    activity = gpu_utilization_raw(scope_var)
+    total = f"(sum({activity}) or on() vector(0))"
+    share = f"100 * ({activity}) / clamp_min({total}, 1)"
    idle = (
        'label_replace(vector(100), "namespace", "idle", "", "") '
-        f"and on() ({total_raw} == 0)"
+        f"and on() ({total} == 0)"
    )
-    return f"({utilization}) or ({unused}) or ({idle})"
+    return f"({share}) or ({idle})"


 PROBLEM_PODS_EXPR = (
@ -1870,9 +1862,7 @@ OVERVIEW_PANEL_DESCRIPTIONS = {
    "Postgres Connections Used": "Current Postgres connections; lower leaves room for apps during spikes.",
    "Postgres Hottest Connections": "Database with the most active connections; high values identify the pressure source.",
    "Namespace CPU Share": "CPU share by namespace in the selected scope; big slices show who is using compute.",
-    "GPU Pool Used": "Current GPU utilization across the monitored GPU pool, normalized by active telemetry sources.",
-    "GPU Active Devices": "Active GPU devices compared with total monitored GPU devices.",
-    "Namespace GPU Utilization": "Instant GPU utilization normalized to the monitored GPU pool. Process-level namespace slices appear where available; unattributed covers GPU nodes without process attribution.",
+    "Namespace GPU Utilization": "Instant share of observed GPU compute activity by namespace. Host covers GPU work outside Kubernetes pods; idle appears only when observed GPU activity is zero.",
    "Namespace RAM Share": "Memory share by namespace in the selected scope; big slices show who may drive pressure.",
    "Worker Node CPU": "Worker CPU over time; lower is calmer, sustained high load may need rescheduling.",
    "Worker Node RAM": "Worker memory over time; lower is safer, sustained high use risks evictions.",
@ -2872,39 +2862,6 @@ def build_overview():
    gpu_scope = "$namespace_scope_gpu"
    ram_scope = "$namespace_scope_ram"

-    panels.append(
-        stat_panel(
-            48,
-            "GPU Pool Used",
-            gpu_pool_used_expr(gpu_scope),
-            {"h": 2, "w": 4, "x": 8, "y": 21},
-            unit="percent",
-            decimals=1,
-            instant=True,
-            thresholds=PERCENT_THRESHOLDS,
-            links=overview_link("atlas-gpu"),
-            description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Pool Used"],
-        )
-    )
-    panels.append(
-        stat_panel(
-            49,
-            "GPU Active Devices",
-            "",
-            {"h": 2, "w": 4, "x": 12, "y": 21},
-            unit="none",
-            decimals=0,
-            text_mode="name_and_value",
-            instant=True,
-            targets=[
-                {"expr": gpu_active_devices_expr(), "refId": "A", "legendFormat": "active"},
-                {"expr": gpu_total_devices_expr(), "refId": "B", "legendFormat": "total"},
-            ],
-            links=overview_link("atlas-gpu"),
-            description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Active Devices"],
-        )
-    )
-
    panels.append(
        pie_panel(
            11,
@ -5484,36 +5441,6 @@ def build_gpu_dashboard():
            description="DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value.",
        )
    )
-    panels.append(
-        stat_panel(
-            5,
-            "GPU Pool Used",
-            gpu_pool_used_expr(gpu_scope),
-            {"h": 3, "w": 6, "x": 0, "y": 16},
-            unit="percent",
-            decimals=1,
-            instant=True,
-            thresholds=PERCENT_THRESHOLDS,
-            description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Pool Used"],
-        )
-    )
-    panels.append(
-        stat_panel(
-            6,
-            "GPU Active Devices",
-            "",
-            {"h": 3, "w": 6, "x": 6, "y": 16},
-            unit="none",
-            decimals=0,
-            text_mode="name_and_value",
-            instant=True,
-            targets=[
-                {"expr": gpu_active_devices_expr(), "refId": "A", "legendFormat": "active"},
-                {"expr": gpu_total_devices_expr(), "refId": "B", "legendFormat": "total"},
-            ],
-            description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Active Devices"],
-        )
-    )
    return {
        "uid": "atlas-gpu",
        "title": "Atlas GPU",
--- a/services/monitoring/dashboards/atlas-gpu.json
+++ b/services/monitoring/dashboards/atlas-gpu.json
@ -20,7 +20,7 @@
      },
      "targets": [
        {
-          "expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or (label_replace(clamp_min(100 - (sum(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))",
+          "expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))",
          "refId": "A",
          "legendFormat": "{{namespace}}",
          "instant": true
@ -72,7 +72,7 @@
          "targetBlank": false
        }
      ],
-      "description": "Instant GPU utilization normalized to the monitored GPU pool. Process-level namespace slices appear where available; unattributed covers GPU nodes without process attribution."
+      "description": "Instant share of observed GPU compute activity by namespace. Host covers GPU work outside Kubernetes pods; idle appears only when observed GPU activity is zero."
    },
    {
      "id": 2,
@ -189,147 +189,6 @@
        }
      ],
      "description": "DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value."
-    },
-    {
-      "id": 5,
-      "type": "stat",
-      "title": "GPU Pool Used",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 3,
-        "w": 6,
-        "x": 0,
-        "y": 16
-      },
-      "targets": [
-        {
-          "expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)",
-          "refId": "A",
-          "instant": true
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "dark-green",
-                "value": null
-              },
-              {
-                "color": "dark-yellow",
-                "value": 50
-              },
-              {
-                "color": "dark-orange",
-                "value": 75
-              },
-              {
-                "color": "dark-red",
-                "value": 91.5
-              }
-            ]
-          },
-          "unit": "percent",
-          "custom": {
-            "displayMode": "auto"
-          },
-          "decimals": 1
-        },
-        "overrides": []
-      },
-      "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "justifyMode": "center",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "textMode": "value"
-      },
-      "description": "Current GPU utilization across the monitored GPU pool, normalized by active telemetry sources."
-    },
-    {
-      "id": 6,
-      "type": "stat",
-      "title": "GPU Active Devices",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 3,
-        "w": 6,
-        "x": 6,
-        "y": 16
-      },
-      "targets": [
-        {
-          "expr": "((sum((max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) > bool 0) or on() vector(0)) + (sum((((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) > bool 0) or on() vector(0)))",
-          "refId": "A",
-          "legendFormat": "active",
-          "instant": true
-        },
-        {
-          "expr": "((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))",
-          "refId": "B",
-          "legendFormat": "total",
-          "instant": true
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "rgba(115, 115, 115, 1)",
-                "value": null
-              },
-              {
-                "color": "dark-green",
-                "value": 1
-              }
-            ]
-          },
-          "unit": "none",
-          "custom": {
-            "displayMode": "auto"
-          },
-          "decimals": 0
-        },
-        "overrides": []
-      },
-      "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "justifyMode": "center",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "textMode": "name_and_value"
-      },
-      "description": "Active GPU devices compared with total monitored GPU devices."
    }
  ],
  "time": {
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@ -3643,161 +3643,6 @@
      },
      "description": "Database with the most active connections; high values identify the pressure source."
    },
-    {
-      "id": 48,
-      "type": "stat",
-      "title": "GPU Pool Used",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 2,
-        "w": 4,
-        "x": 8,
-        "y": 21
-      },
-      "targets": [
-        {
-          "expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)",
-          "refId": "A",
-          "instant": true
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "dark-green",
-                "value": null
-              },
-              {
-                "color": "dark-yellow",
-                "value": 50
-              },
-              {
-                "color": "dark-orange",
-                "value": 75
-              },
-              {
-                "color": "dark-red",
-                "value": 91.5
-              }
-            ]
-          },
-          "unit": "percent",
-          "custom": {
-            "displayMode": "auto"
-          },
-          "decimals": 1
-        },
-        "overrides": []
-      },
-      "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "justifyMode": "center",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "textMode": "value"
-      },
-      "links": [
-        {
-          "title": "Open atlas-gpu dashboard",
-          "url": "/d/atlas-gpu",
-          "targetBlank": true
-        }
-      ],
-      "description": "Current GPU utilization across the monitored GPU pool, normalized by active telemetry sources."
-    },
-    {
-      "id": 49,
-      "type": "stat",
-      "title": "GPU Active Devices",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 2,
-        "w": 4,
-        "x": 12,
-        "y": 21
-      },
-      "targets": [
-        {
-          "expr": "((sum((max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) > bool 0) or on() vector(0)) + (sum((((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) > bool 0) or on() vector(0)))",
-          "refId": "A",
-          "legendFormat": "active",
-          "instant": true
-        },
-        {
-          "expr": "((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))",
-          "refId": "B",
-          "legendFormat": "total",
-          "instant": true
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "rgba(115, 115, 115, 1)",
-                "value": null
-              },
-              {
-                "color": "dark-green",
-                "value": 1
-              }
-            ]
-          },
-          "unit": "none",
-          "custom": {
-            "displayMode": "auto"
-          },
-          "decimals": 0
-        },
-        "overrides": []
-      },
-      "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "justifyMode": "center",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "textMode": "name_and_value"
-      },
-      "links": [
-        {
-          "title": "Open atlas-gpu dashboard",
-          "url": "/d/atlas-gpu",
-          "targetBlank": true
-        }
-      ],
-      "description": "Active GPU devices compared with total monitored GPU devices."
-    },
    {
      "id": 11,
      "type": "piechart",
@ -3883,7 +3728,7 @@
      },
      "targets": [
        {
-          "expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or (label_replace(clamp_min(100 - (sum(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))",
+          "expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))",
          "refId": "A",
          "legendFormat": "{{namespace}}",
          "instant": true
@ -3935,7 +3780,7 @@
          "targetBlank": false
        }
      ],
-      "description": "Instant GPU utilization normalized to the monitored GPU pool. Process-level namespace slices appear where available; unattributed covers GPU nodes without process attribution."
+      "description": "Instant share of observed GPU compute activity by namespace. Host covers GPU work outside Kubernetes pods; idle appears only when observed GPU activity is zero."
    },
    {
      "id": 13,
--- a/services/monitoring/grafana-dashboard-gpu.yaml
+++ b/services/monitoring/grafana-dashboard-gpu.yaml
@ -29,7 +29,7 @@ data:
          },
          "targets": [
            {
-              "expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or (label_replace(clamp_min(100 - (sum(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))",
+              "expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))",
              "refId": "A",
              "legendFormat": "{{namespace}}",
              "instant": true
@ -81,7 +81,7 @@ data:
              "targetBlank": false
            }
          ],
-          "description": "Instant GPU utilization normalized to the monitored GPU pool. Process-level namespace slices appear where available; unattributed covers GPU nodes without process attribution."
+          "description": "Instant share of observed GPU compute activity by namespace. Host covers GPU work outside Kubernetes pods; idle appears only when observed GPU activity is zero."
        },
        {
          "id": 2,
@ -198,147 +198,6 @@ data:
            }
          ],
          "description": "DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value."
-        },
-        {
-          "id": 5,
-          "type": "stat",
-          "title": "GPU Pool Used",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 3,
-            "w": 6,
-            "x": 0,
-            "y": 16
-          },
-          "targets": [
-            {
-              "expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)",
-              "refId": "A",
-              "instant": true
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "color": {
-                "mode": "thresholds"
-              },
-              "mappings": [],
-              "thresholds": {
-                "mode": "absolute",
-                "steps": [
-                  {
-                    "color": "dark-green",
-                    "value": null
-                  },
-                  {
-                    "color": "dark-yellow",
-                    "value": 50
-                  },
-                  {
-                    "color": "dark-orange",
-                    "value": 75
-                  },
-                  {
-                    "color": "dark-red",
-                    "value": 91.5
-                  }
-                ]
-              },
-              "unit": "percent",
-              "custom": {
-                "displayMode": "auto"
-              },
-              "decimals": 1
-            },
-            "overrides": []
-          },
-          "options": {
-            "colorMode": "value",
-            "graphMode": "area",
-            "justifyMode": "center",
-            "reduceOptions": {
-              "calcs": [
-                "lastNotNull"
-              ],
-              "fields": "",
-              "values": false
-            },
-            "textMode": "value"
-          },
-          "description": "Current GPU utilization across the monitored GPU pool, normalized by active telemetry sources."
-        },
-        {
-          "id": 6,
-          "type": "stat",
-          "title": "GPU Active Devices",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 3,
-            "w": 6,
-            "x": 6,
-            "y": 16
-          },
-          "targets": [
-            {
-              "expr": "((sum((max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) > bool 0) or on() vector(0)) + (sum((((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) > bool 0) or on() vector(0)))",
-              "refId": "A",
-              "legendFormat": "active",
-              "instant": true
-            },
-            {
-              "expr": "((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))",
-              "refId": "B",
-              "legendFormat": "total",
-              "instant": true
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "color": {
-                "mode": "thresholds"
-              },
-              "mappings": [],
-              "thresholds": {
-                "mode": "absolute",
-                "steps": [
-                  {
-                    "color": "rgba(115, 115, 115, 1)",
-                    "value": null
-                  },
-                  {
-                    "color": "dark-green",
-                    "value": 1
-                  }
-                ]
-              },
-              "unit": "none",
-              "custom": {
-                "displayMode": "auto"
-              },
-              "decimals": 0
-            },
-            "overrides": []
-          },
-          "options": {
-            "colorMode": "value",
-            "graphMode": "area",
-            "justifyMode": "center",
-            "reduceOptions": {
-              "calcs": [
-                "lastNotNull"
-              ],
-              "fields": "",
-              "values": false
-            },
-            "textMode": "name_and_value"
-          },
-          "description": "Active GPU devices compared with total monitored GPU devices."
        }
      ],
      "time": {
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@ -3652,161 +3652,6 @@ data:
          },
          "description": "Database with the most active connections; high values identify the pressure source."
        },
-        {
-          "id": 48,
-          "type": "stat",
-          "title": "GPU Pool Used",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 2,
-            "w": 4,
-            "x": 8,
-            "y": 21
-          },
-          "targets": [
-            {
-              "expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)",
-              "refId": "A",
-              "instant": true
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "color": {
-                "mode": "thresholds"
-              },
-              "mappings": [],
-              "thresholds": {
-                "mode": "absolute",
-                "steps": [
-                  {
-                    "color": "dark-green",
-                    "value": null
-                  },
-                  {
-                    "color": "dark-yellow",
-                    "value": 50
-                  },
-                  {
-                    "color": "dark-orange",
-                    "value": 75
-                  },
-                  {
-                    "color": "dark-red",
-                    "value": 91.5
-                  }
-                ]
-              },
-              "unit": "percent",
-              "custom": {
-                "displayMode": "auto"
-              },
-              "decimals": 1
-            },
-            "overrides": []
-          },
-          "options": {
-            "colorMode": "value",
-            "graphMode": "area",
-            "justifyMode": "center",
-            "reduceOptions": {
-              "calcs": [
-                "lastNotNull"
-              ],
-              "fields": "",
-              "values": false
-            },
-            "textMode": "value"
-          },
-          "links": [
-            {
-              "title": "Open atlas-gpu dashboard",
-              "url": "/d/atlas-gpu",
-              "targetBlank": true
-            }
-          ],
-          "description": "Current GPU utilization across the monitored GPU pool, normalized by active telemetry sources."
-        },
-        {
-          "id": 49,
-          "type": "stat",
-          "title": "GPU Active Devices",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 2,
-            "w": 4,
-            "x": 12,
-            "y": 21
-          },
-          "targets": [
-            {
-              "expr": "((sum((max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) > bool 0) or on() vector(0)) + (sum((((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) > bool 0) or on() vector(0)))",
-              "refId": "A",
-              "legendFormat": "active",
-              "instant": true
-            },
-            {
-              "expr": "((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))",
-              "refId": "B",
-              "legendFormat": "total",
-              "instant": true
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "color": {
-                "mode": "thresholds"
-              },
-              "mappings": [],
-              "thresholds": {
-                "mode": "absolute",
-                "steps": [
-                  {
-                    "color": "rgba(115, 115, 115, 1)",
-                    "value": null
-                  },
-                  {
-                    "color": "dark-green",
-                    "value": 1
-                  }
-                ]
-              },
-              "unit": "none",
-              "custom": {
-                "displayMode": "auto"
-              },
-              "decimals": 0
-            },
-            "overrides": []
-          },
-          "options": {
-            "colorMode": "value",
-            "graphMode": "area",
-            "justifyMode": "center",
-            "reduceOptions": {
-              "calcs": [
-                "lastNotNull"
-              ],
-              "fields": "",
-              "values": false
-            },
-            "textMode": "name_and_value"
-          },
-          "links": [
-            {
-              "title": "Open atlas-gpu dashboard",
-              "url": "/d/atlas-gpu",
-              "targetBlank": true
-            }
-          ],
-          "description": "Active GPU devices compared with total monitored GPU devices."
-        },
        {
          "id": 11,
          "type": "piechart",
@ -3892,7 +3737,7 @@ data:
          },
          "targets": [
            {
-              "expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or (label_replace(clamp_min(100 - (sum(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))",
+              "expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))",
              "refId": "A",
              "legendFormat": "{{namespace}}",
              "instant": true
@ -3944,7 +3789,7 @@ data:
              "targetBlank": false
            }
          ],
-          "description": "Instant GPU utilization normalized to the monitored GPU pool. Process-level namespace slices appear where available; unattributed covers GPU nodes without process attribution."
+          "description": "Instant share of observed GPU compute activity by namespace. Host covers GPU work outside Kubernetes pods; idle appears only when observed GPU activity is zero."
        },
        {
          "id": 13,