monitoring(gpu): add pool utilization counters

2026-05-22 03:08:27 -03:00 · 2026-05-22 03:08:27 -03:00 · 6388ef5c6d
commit 6388ef5c6d
parent 4ce5a67b94
6 changed files with 687 additions and 1 deletions
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@ -366,6 +366,18 @@ def gpu_capacity_percent():
    return f"(({process_capacity}) or ({legacy_capacity}) or on() vector(0))"
 def gpu_active_devices_expr():
    """Return the PromQL expression for the number of active GPU devices.

    Three alternatives are chained with ``or``; the first one that yields a
    sample is the one that ends up in the result:

    1. the count of ``nvidia_gpu_device_utilization_percent`` series whose
       value is above zero (``> bool 0`` turns each series into 0/1, and
       ``sum`` adds them up);
    2. the same count over the legacy per-node expression from
       ``gpu_util_by_node()``, suppressed by ``unless on()`` whenever the
       device-level metric has any series;
    3. a constant ``vector(0)`` so the query never comes back empty.
    """
    # Preferred source: the device-level utilization metric.
    from_device_metric = "sum(nvidia_gpu_device_utilization_percent > bool 0)"
    # Legacy source, only allowed through when the device metric is absent.
    node_util = gpu_util_by_node()
    from_node_util = f"sum(({node_util}) > bool 0) unless on() nvidia_gpu_device_utilization_percent"
    alternatives = (
        f"({from_device_metric})",
        f"({from_node_util})",
        "on() vector(0)",
    )
    return "(" + " or ".join(alternatives) + ")"
 def gpu_total_devices_expr():
    """Return the PromQL expression for the total number of monitored GPUs.

    Mirrors the fallback chain used for the active-device count:

    1. ``count`` of ``nvidia_gpu_device_utilization_percent`` series;
    2. ``count`` over the legacy per-node expression from
       ``gpu_util_by_node()``, dropped by ``unless on()`` as soon as the
       device-level metric has any series;
    3. a constant ``vector(0)`` so the query always returns a value.
    """
    # Preferred source: one series per device in the device-level metric.
    from_device_metric = "count(nvidia_gpu_device_utilization_percent)"
    # Legacy source, only allowed through when the device metric is absent.
    node_util = gpu_util_by_node()
    from_node_util = f"count({node_util}) unless on() nvidia_gpu_device_utilization_percent"
    alternatives = (
        f"({from_device_metric})",
        f"({from_node_util})",
        "on() vector(0)",
    )
    return "(" + " or ".join(alternatives) + ")"
 def unattributed_gpu_usage():
    return (
        'label_replace((sum('
@ -375,8 +387,17 @@ def unattributed_gpu_usage():
    )
 def gpu_utilization_raw(scope_var):
    """Return the un-normalized GPU utilization PromQL for ``scope_var``.

    The expression is the per-namespace process usage for the given scope,
    ``or``-ed with the unattributed usage expression so that the latter only
    contributes series the former does not already provide.

    ``scope_var`` is forwarded unchanged to
    ``nvidia_process_gpu_usage_by_namespace``.
    """
    by_namespace = nvidia_process_gpu_usage_by_namespace(scope_var)
    unattributed = unattributed_gpu_usage()
    return "({}) or ({})".format(by_namespace, unattributed)
 def gpu_pool_used_expr(scope_var):
    """Return the PromQL expression for the "GPU Pool Used" percentage.

    The numerator is the sum of the raw utilization expression for
    ``scope_var``, falling back to ``vector(0)`` when nothing is reported.
    The denominator is the pool capacity from ``gpu_capacity_percent()``,
    clamped to a minimum of 1 so the division never hits zero.
    """
    # Summed raw utilization; "or on() vector(0)" keeps the query non-empty.
    numerator = "(sum({}) or on() vector(0))".format(gpu_utilization_raw(scope_var))
    # Capacity floor of 1 guards against dividing by a zero capacity.
    denominator = "clamp_min({}, 1)".format(gpu_capacity_percent())
    return f"100 * {numerator} / {denominator}"
 def namespace_gpu_share_expr(scope_var):
-    utilization_raw = f"({nvidia_process_gpu_usage_by_namespace(scope_var)}) or ({unattributed_gpu_usage()})"
+    utilization_raw = gpu_utilization_raw(scope_var)
    total_raw = f"(sum({utilization_raw}) or on() vector(0))"
    capacity = gpu_capacity_percent()
    utilization = f"100 * ({utilization_raw}) / clamp_min({capacity}, 1)"
@ -1851,6 +1872,8 @@ OVERVIEW_PANEL_DESCRIPTIONS = {
    "Postgres Connections Used": "Current Postgres connections; lower leaves room for apps during spikes.",
    "Postgres Hottest Connections": "Database with the most active connections; high values identify the pressure source.",
    "Namespace CPU Share": "CPU share by namespace in the selected scope; big slices show who is using compute.",
    "GPU Pool Used": "Current process-level GPU utilization across the monitored NVIDIA GPU pool.",
    "GPU Active Devices": "Active NVIDIA GPUs compared with total monitored NVIDIA GPUs.",
    "Namespace GPU Utilization": "Instant NVIDIA process-level GPU utilization normalized to the monitored GPU pool. Host covers non-Kubernetes processes; unused fills remaining capacity; idle appears only at zero activity.",
    "Namespace RAM Share": "Memory share by namespace in the selected scope; big slices show who may drive pressure.",
    "Worker Node CPU": "Worker CPU over time; lower is calmer, sustained high load may need rescheduling.",
@ -2851,6 +2874,39 @@ def build_overview():
    gpu_scope = "$namespace_scope_gpu"
    ram_scope = "$namespace_scope_ram"
    panels.append(
        stat_panel(
            48,
            "GPU Pool Used",
            gpu_pool_used_expr(gpu_scope),
            {"h": 2, "w": 4, "x": 8, "y": 21},
            unit="percent",
            decimals=1,
            instant=True,
            thresholds=PERCENT_THRESHOLDS,
            links=overview_link("atlas-gpu"),
            description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Pool Used"],
        )
    )
    panels.append(
        stat_panel(
            49,
            "GPU Active Devices",
            "",
            {"h": 2, "w": 4, "x": 12, "y": 21},
            unit="none",
            decimals=0,
            text_mode="name_and_value",
            instant=True,
            targets=[
                {"expr": gpu_active_devices_expr(), "refId": "A", "legendFormat": "active"},
                {"expr": gpu_total_devices_expr(), "refId": "B", "legendFormat": "total"},
            ],
            links=overview_link("atlas-gpu"),
            description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Active Devices"],
        )
    )
    panels.append(
        pie_panel(
            11,
@ -5430,6 +5486,36 @@ def build_gpu_dashboard():
            description="DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value.",
        )
    )
    panels.append(
        stat_panel(
            5,
            "GPU Pool Used",
            gpu_pool_used_expr(gpu_scope),
            {"h": 3, "w": 6, "x": 0, "y": 16},
            unit="percent",
            decimals=1,
            instant=True,
            thresholds=PERCENT_THRESHOLDS,
            description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Pool Used"],
        )
    )
    panels.append(
        stat_panel(
            6,
            "GPU Active Devices",
            "",
            {"h": 3, "w": 6, "x": 6, "y": 16},
            unit="none",
            decimals=0,
            text_mode="name_and_value",
            instant=True,
            targets=[
                {"expr": gpu_active_devices_expr(), "refId": "A", "legendFormat": "active"},
                {"expr": gpu_total_devices_expr(), "refId": "B", "legendFormat": "total"},
            ],
            description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Active Devices"],
        )
    )
    return {
        "uid": "atlas-gpu",
        "title": "Atlas GPU",
--- a/scripts/tests/test_dashboards_render_atlas.py
+++ b/scripts/tests/test_dashboards_render_atlas.py
@ -166,6 +166,14 @@ def test_overview_uses_readable_quality_power_and_gitops_panels():
    assert 'namespace", "idle"' in gpu_expr
    assert panels_by_title["Namespace GPU Utilization"]["targets"][0]["instant"] is True
    gpu_pool_expr = panels_by_title["GPU Pool Used"]["targets"][0]["expr"]
    assert "nvidia_namespace_gpu_sm_util_percent" in gpu_pool_expr
    assert "nvidia_gpu_device_utilization_percent" in gpu_pool_expr
    assert panels_by_title["GPU Pool Used"]["targets"][0]["instant"] is True
    active_targets = panels_by_title["GPU Active Devices"]["targets"]
    assert any("nvidia_gpu_device_utilization_percent > bool 0" in target["expr"] for target in active_targets)
    assert any("count(nvidia_gpu_device_utilization_percent)" in target["expr"] for target in active_targets)
 def test_overview_and_testing_panels_all_have_concise_descriptions():
    mod = load_module()
--- a/services/monitoring/dashboards/atlas-gpu.json
+++ b/services/monitoring/dashboards/atlas-gpu.json
@ -189,6 +189,147 @@
        }
      ],
      "description": "DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value."
    },
    {
      "id": 5,
      "type": "stat",
      "title": "GPU Pool Used",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
      },
      "gridPos": {
        "h": 3,
        "w": 6,
        "x": 0,
        "y": 16
      },
      "targets": [
        {
          "expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)",
          "refId": "A",
          "instant": true
        }
      ],
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "dark-green",
                "value": null
              },
              {
                "color": "dark-yellow",
                "value": 50
              },
              {
                "color": "dark-orange",
                "value": 75
              },
              {
                "color": "dark-red",
                "value": 91.5
              }
            ]
          },
          "unit": "percent",
          "custom": {
            "displayMode": "auto"
          },
          "decimals": 1
        },
        "overrides": []
      },
      "options": {
        "colorMode": "value",
        "graphMode": "area",
        "justifyMode": "center",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "textMode": "value"
      },
      "description": "Current process-level GPU utilization across the monitored NVIDIA GPU pool."
    },
    {
      "id": 6,
      "type": "stat",
      "title": "GPU Active Devices",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
      },
      "gridPos": {
        "h": 3,
        "w": 6,
        "x": 6,
        "y": 16
      },
      "targets": [
        {
          "expr": "((sum(nvidia_gpu_device_utilization_percent > bool 0)) or (sum((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) > bool 0) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
          "refId": "A",
          "legendFormat": "active",
          "instant": true
        },
        {
          "expr": "((count(nvidia_gpu_device_utilization_percent)) or (count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
          "refId": "B",
          "legendFormat": "total",
          "instant": true
        }
      ],
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "rgba(115, 115, 115, 1)",
                "value": null
              },
              {
                "color": "dark-green",
                "value": 1
              }
            ]
          },
          "unit": "none",
          "custom": {
            "displayMode": "auto"
          },
          "decimals": 0
        },
        "overrides": []
      },
      "options": {
        "colorMode": "value",
        "graphMode": "area",
        "justifyMode": "center",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "textMode": "name_and_value"
      },
      "description": "Active NVIDIA GPUs compared with total monitored NVIDIA GPUs."
    }
  ],
  "time": {
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@ -3643,6 +3643,161 @@
      },
      "description": "Database with the most active connections; high values identify the pressure source."
    },
    {
      "id": 48,
      "type": "stat",
      "title": "GPU Pool Used",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
      },
      "gridPos": {
        "h": 2,
        "w": 4,
        "x": 8,
        "y": 21
      },
      "targets": [
        {
          "expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)",
          "refId": "A",
          "instant": true
        }
      ],
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "dark-green",
                "value": null
              },
              {
                "color": "dark-yellow",
                "value": 50
              },
              {
                "color": "dark-orange",
                "value": 75
              },
              {
                "color": "dark-red",
                "value": 91.5
              }
            ]
          },
          "unit": "percent",
          "custom": {
            "displayMode": "auto"
          },
          "decimals": 1
        },
        "overrides": []
      },
      "options": {
        "colorMode": "value",
        "graphMode": "area",
        "justifyMode": "center",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "textMode": "value"
      },
      "links": [
        {
          "title": "Open atlas-gpu dashboard",
          "url": "/d/atlas-gpu",
          "targetBlank": true
        }
      ],
      "description": "Current process-level GPU utilization across the monitored NVIDIA GPU pool."
    },
    {
      "id": 49,
      "type": "stat",
      "title": "GPU Active Devices",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
      },
      "gridPos": {
        "h": 2,
        "w": 4,
        "x": 12,
        "y": 21
      },
      "targets": [
        {
          "expr": "((sum(nvidia_gpu_device_utilization_percent > bool 0)) or (sum((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) > bool 0) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
          "refId": "A",
          "legendFormat": "active",
          "instant": true
        },
        {
          "expr": "((count(nvidia_gpu_device_utilization_percent)) or (count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
          "refId": "B",
          "legendFormat": "total",
          "instant": true
        }
      ],
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "rgba(115, 115, 115, 1)",
                "value": null
              },
              {
                "color": "dark-green",
                "value": 1
              }
            ]
          },
          "unit": "none",
          "custom": {
            "displayMode": "auto"
          },
          "decimals": 0
        },
        "overrides": []
      },
      "options": {
        "colorMode": "value",
        "graphMode": "area",
        "justifyMode": "center",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "textMode": "name_and_value"
      },
      "links": [
        {
          "title": "Open atlas-gpu dashboard",
          "url": "/d/atlas-gpu",
          "targetBlank": true
        }
      ],
      "description": "Active NVIDIA GPUs compared with total monitored NVIDIA GPUs."
    },
    {
      "id": 11,
      "type": "piechart",
--- a/services/monitoring/grafana-dashboard-gpu.yaml
+++ b/services/monitoring/grafana-dashboard-gpu.yaml
@ -198,6 +198,147 @@ data:
            }
          ],
          "description": "DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value."
        },
        {
          "id": 5,
          "type": "stat",
          "title": "GPU Pool Used",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
          },
          "gridPos": {
            "h": 3,
            "w": 6,
            "x": 0,
            "y": 16
          },
          "targets": [
            {
              "expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)",
              "refId": "A",
              "instant": true
            }
          ],
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "thresholds"
              },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "dark-green",
                    "value": null
                  },
                  {
                    "color": "dark-yellow",
                    "value": 50
                  },
                  {
                    "color": "dark-orange",
                    "value": 75
                  },
                  {
                    "color": "dark-red",
                    "value": 91.5
                  }
                ]
              },
              "unit": "percent",
              "custom": {
                "displayMode": "auto"
              },
              "decimals": 1
            },
            "overrides": []
          },
          "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": {
              "calcs": [
                "lastNotNull"
              ],
              "fields": "",
              "values": false
            },
            "textMode": "value"
          },
          "description": "Current process-level GPU utilization across the monitored NVIDIA GPU pool."
        },
        {
          "id": 6,
          "type": "stat",
          "title": "GPU Active Devices",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
          },
          "gridPos": {
            "h": 3,
            "w": 6,
            "x": 6,
            "y": 16
          },
          "targets": [
            {
              "expr": "((sum(nvidia_gpu_device_utilization_percent > bool 0)) or (sum((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) > bool 0) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
              "refId": "A",
              "legendFormat": "active",
              "instant": true
            },
            {
              "expr": "((count(nvidia_gpu_device_utilization_percent)) or (count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
              "refId": "B",
              "legendFormat": "total",
              "instant": true
            }
          ],
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "thresholds"
              },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "rgba(115, 115, 115, 1)",
                    "value": null
                  },
                  {
                    "color": "dark-green",
                    "value": 1
                  }
                ]
              },
              "unit": "none",
              "custom": {
                "displayMode": "auto"
              },
              "decimals": 0
            },
            "overrides": []
          },
          "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": {
              "calcs": [
                "lastNotNull"
              ],
              "fields": "",
              "values": false
            },
            "textMode": "name_and_value"
          },
          "description": "Active NVIDIA GPUs compared with total monitored NVIDIA GPUs."
        }
      ],
      "time": {
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@ -3652,6 +3652,161 @@ data:
          },
          "description": "Database with the most active connections; high values identify the pressure source."
        },
        {
          "id": 48,
          "type": "stat",
          "title": "GPU Pool Used",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
          },
          "gridPos": {
            "h": 2,
            "w": 4,
            "x": 8,
            "y": 21
          },
          "targets": [
            {
              "expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)",
              "refId": "A",
              "instant": true
            }
          ],
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "thresholds"
              },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "dark-green",
                    "value": null
                  },
                  {
                    "color": "dark-yellow",
                    "value": 50
                  },
                  {
                    "color": "dark-orange",
                    "value": 75
                  },
                  {
                    "color": "dark-red",
                    "value": 91.5
                  }
                ]
              },
              "unit": "percent",
              "custom": {
                "displayMode": "auto"
              },
              "decimals": 1
            },
            "overrides": []
          },
          "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": {
              "calcs": [
                "lastNotNull"
              ],
              "fields": "",
              "values": false
            },
            "textMode": "value"
          },
          "links": [
            {
              "title": "Open atlas-gpu dashboard",
              "url": "/d/atlas-gpu",
              "targetBlank": true
            }
          ],
          "description": "Current process-level GPU utilization across the monitored NVIDIA GPU pool."
        },
        {
          "id": 49,
          "type": "stat",
          "title": "GPU Active Devices",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
          },
          "gridPos": {
            "h": 2,
            "w": 4,
            "x": 12,
            "y": 21
          },
          "targets": [
            {
              "expr": "((sum(nvidia_gpu_device_utilization_percent > bool 0)) or (sum((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) > bool 0) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
              "refId": "A",
              "legendFormat": "active",
              "instant": true
            },
            {
              "expr": "((count(nvidia_gpu_device_utilization_percent)) or (count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
              "refId": "B",
              "legendFormat": "total",
              "instant": true
            }
          ],
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "thresholds"
              },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "rgba(115, 115, 115, 1)",
                    "value": null
                  },
                  {
                    "color": "dark-green",
                    "value": 1
                  }
                ]
              },
              "unit": "none",
              "custom": {
                "displayMode": "auto"
              },
              "decimals": 0
            },
            "overrides": []
          },
          "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": {
              "calcs": [
                "lastNotNull"
              ],
              "fields": "",
              "values": false
            },
            "textMode": "name_and_value"
          },
          "links": [
            {
              "title": "Open atlas-gpu dashboard",
              "url": "/d/atlas-gpu",
              "targetBlank": true
            }
          ],
          "description": "Active NVIDIA GPUs compared with total monitored NVIDIA GPUs."
        },
        {
          "id": 11,
          "type": "piechart",