diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 166e427b..7f9a5921 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -395,22 +395,14 @@ def gpu_pool_used_expr(scope_var): def namespace_gpu_share_expr(scope_var): - utilization_raw = gpu_utilization_raw(scope_var) - total_raw = f"(sum({utilization_raw}) or on() vector(0))" - capacity = gpu_capacity_percent() - utilization = f"100 * ({utilization_raw}) / clamp_min({capacity}, 1)" - total = f"(sum({utilization}) or on() vector(0))" - unused = ( - 'label_replace(clamp_min(' - f"100 - {total}" - ', 0), "namespace", "unused", "", "") ' - f"and on() ({total_raw} > 0)" - ) + activity = gpu_utilization_raw(scope_var) + total = f"(sum({activity}) or on() vector(0))" + share = f"100 * ({activity}) / clamp_min({total}, 1)" idle = ( 'label_replace(vector(100), "namespace", "idle", "", "") ' - f"and on() ({total_raw} == 0)" + f"and on() ({total} == 0)" ) - return f"({utilization}) or ({unused}) or ({idle})" + return f"({share}) or ({idle})" PROBLEM_PODS_EXPR = ( @@ -1870,9 +1862,7 @@ OVERVIEW_PANEL_DESCRIPTIONS = { "Postgres Connections Used": "Current Postgres connections; lower leaves room for apps during spikes.", "Postgres Hottest Connections": "Database with the most active connections; high values identify the pressure source.", "Namespace CPU Share": "CPU share by namespace in the selected scope; big slices show who is using compute.", - "GPU Pool Used": "Current GPU utilization across the monitored GPU pool, normalized by active telemetry sources.", - "GPU Active Devices": "Active GPU devices compared with total monitored GPU devices.", - "Namespace GPU Utilization": "Instant GPU utilization normalized to the monitored GPU pool. Process-level namespace slices appear where available; unattributed covers GPU nodes without process attribution.", + "Namespace GPU Utilization": "Instant share of observed GPU compute activity by namespace. Host covers GPU work outside Kubernetes pods; idle appears only when observed GPU activity is zero.", "Namespace RAM Share": "Memory share by namespace in the selected scope; big slices show who may drive pressure.", "Worker Node CPU": "Worker CPU over time; lower is calmer, sustained high load may need rescheduling.", "Worker Node RAM": "Worker memory over time; lower is safer, sustained high use risks evictions.", @@ -2872,39 +2862,6 @@ def build_overview(): gpu_scope = "$namespace_scope_gpu" ram_scope = "$namespace_scope_ram" - panels.append( - stat_panel( - 48, - "GPU Pool Used", - gpu_pool_used_expr(gpu_scope), - {"h": 2, "w": 4, "x": 8, "y": 21}, - unit="percent", - decimals=1, - instant=True, - thresholds=PERCENT_THRESHOLDS, - links=overview_link("atlas-gpu"), - description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Pool Used"], - ) - ) - panels.append( - stat_panel( - 49, - "GPU Active Devices", - "", - {"h": 2, "w": 4, "x": 12, "y": 21}, - unit="none", - decimals=0, - text_mode="name_and_value", - instant=True, - targets=[ - {"expr": gpu_active_devices_expr(), "refId": "A", "legendFormat": "active"}, - {"expr": gpu_total_devices_expr(), "refId": "B", "legendFormat": "total"}, - ], - links=overview_link("atlas-gpu"), - description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Active Devices"], - ) - ) - panels.append( pie_panel( 11, @@ -5484,36 +5441,6 @@ def build_gpu_dashboard(): description="DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value.", ) ) - panels.append( - stat_panel( - 5, - "GPU Pool Used", - gpu_pool_used_expr(gpu_scope), - {"h": 3, "w": 6, "x": 0, "y": 16}, - unit="percent", - decimals=1, - instant=True, - thresholds=PERCENT_THRESHOLDS, - description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Pool Used"], - ) - ) - panels.append( - stat_panel( - 6, - "GPU Active Devices", - "", - {"h": 3, "w": 6, "x": 6, "y": 16}, - unit="none", - decimals=0, - text_mode="name_and_value", - instant=True, - targets=[ - {"expr": gpu_active_devices_expr(), "refId": "A", "legendFormat": "active"}, - {"expr": gpu_total_devices_expr(), "refId": "B", "legendFormat": "total"}, - ], - description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Active Devices"], - ) - ) return { "uid": "atlas-gpu", "title": "Atlas GPU", diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index d872424c..6bc000fd 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or (label_replace(clamp_min(100 - (sum(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))", + "expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}", "instant": true @@ -72,7 +72,7 @@ "targetBlank": false } ], - "description": "Instant GPU utilization normalized to the monitored GPU pool. Process-level namespace slices appear where available; unattributed covers GPU nodes without process attribution." + "description": "Instant share of observed GPU compute activity by namespace. Host covers GPU work outside Kubernetes pods; idle appears only when observed GPU activity is zero." }, { "id": 2, @@ -189,147 +189,6 @@ } ], "description": "DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value." - }, - { - "id": 5, - "type": "stat", - "title": "GPU Pool Used", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 3, - "w": 6, - "x": 0, - "y": 16 - }, - "targets": [ - { - "expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "dark-green", - "value": null - }, - { - "color": "dark-yellow", - "value": 50 - }, - { - "color": "dark-orange", - "value": 75 - }, - { - "color": "dark-red", - "value": 91.5 - } - ] - }, - "unit": "percent", - "custom": { - "displayMode": "auto" - }, - "decimals": 1 - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - }, - "description": "Current GPU utilization across the monitored GPU pool, normalized by active telemetry sources." - }, - { - "id": 6, - "type": "stat", - "title": "GPU Active Devices", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 3, - "w": 6, - "x": 6, - "y": 16 - }, - "targets": [ - { - "expr": "((sum((max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) > bool 0) or on() vector(0)) + (sum((((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) > bool 0) or on() vector(0)))", - "refId": "A", - "legendFormat": "active", - "instant": true - }, - { - "expr": "((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))", - "refId": "B", - "legendFormat": "total", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "dark-green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - }, - "decimals": 0 - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "name_and_value" - }, - "description": "Active GPU devices compared with total monitored GPU devices." } ], "time": { diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 04d6b83f..cefbb039 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -3643,161 +3643,6 @@ }, "description": "Database with the most active connections; high values identify the pressure source." }, - { - "id": 48, - "type": "stat", - "title": "GPU Pool Used", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 2, - "w": 4, - "x": 8, - "y": 21 - }, - "targets": [ - { - "expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "dark-green", - "value": null - }, - { - "color": "dark-yellow", - "value": 50 - }, - { - "color": "dark-orange", - "value": 75 - }, - { - "color": "dark-red", - "value": 91.5 - } - ] - }, - "unit": "percent", - "custom": { - "displayMode": "auto" - }, - "decimals": 1 - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - }, - "links": [ - { - "title": "Open atlas-gpu dashboard", - "url": "/d/atlas-gpu", - "targetBlank": true - } - ], - "description": "Current GPU utilization across the monitored GPU pool, normalized by active telemetry sources." - }, - { - "id": 49, - "type": "stat", - "title": "GPU Active Devices", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 2, - "w": 4, - "x": 12, - "y": 21 - }, - "targets": [ - { - "expr": "((sum((max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) > bool 0) or on() vector(0)) + (sum((((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) > bool 0) or on() vector(0)))", - "refId": "A", - "legendFormat": "active", - "instant": true - }, - { - "expr": "((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))", - "refId": "B", - "legendFormat": "total", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "dark-green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - }, - "decimals": 0 - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "name_and_value" - }, - "links": [ - { - "title": "Open atlas-gpu dashboard", - "url": "/d/atlas-gpu", - "targetBlank": true - } - ], - "description": "Active GPU devices compared with total monitored GPU devices." - }, { "id": 11, "type": "piechart", @@ -3883,7 +3728,7 @@ }, "targets": [ { - "expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or (label_replace(clamp_min(100 - (sum(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))", + "expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}", "instant": true @@ -3935,7 +3780,7 @@ "targetBlank": false } ], - "description": "Instant GPU utilization normalized to the monitored GPU pool. Process-level namespace slices appear where available; unattributed covers GPU nodes without process attribution." + "description": "Instant share of observed GPU compute activity by namespace. Host covers GPU work outside Kubernetes pods; idle appears only when observed GPU activity is zero." }, { "id": 13, diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index cde2665a..40692ea7 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or (label_replace(clamp_min(100 - (sum(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))", + "expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}", "instant": true @@ -81,7 +81,7 @@ data: "targetBlank": false } ], - "description": "Instant GPU utilization normalized to the monitored GPU pool. Process-level namespace slices appear where available; unattributed covers GPU nodes without process attribution." + "description": "Instant share of observed GPU compute activity by namespace. Host covers GPU work outside Kubernetes pods; idle appears only when observed GPU activity is zero." }, { "id": 2, @@ -198,147 +198,6 @@ data: } ], "description": "DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value." - }, - { - "id": 5, - "type": "stat", - "title": "GPU Pool Used", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 3, - "w": 6, - "x": 0, - "y": 16 - }, - "targets": [ - { - "expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "dark-green", - "value": null - }, - { - "color": "dark-yellow", - "value": 50 - }, - { - "color": "dark-orange", - "value": 75 - }, - { - "color": "dark-red", - "value": 91.5 - } - ] - }, - "unit": "percent", - "custom": { - "displayMode": "auto" - }, - "decimals": 1 - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - }, - "description": "Current GPU utilization across the monitored GPU pool, normalized by active telemetry sources." - }, - { - "id": 6, - "type": "stat", - "title": "GPU Active Devices", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 3, - "w": 6, - "x": 6, - "y": 16 - }, - "targets": [ - { - "expr": "((sum((max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) > bool 0) or on() vector(0)) + (sum((((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) > bool 0) or on() vector(0)))", - "refId": "A", - "legendFormat": "active", - "instant": true - }, - { - "expr": "((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))", - "refId": "B", - "legendFormat": "total", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "dark-green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - }, - "decimals": 0 - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "name_and_value" - }, - "description": "Active GPU devices compared with total monitored GPU devices." } ], "time": { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 6a4cc3b6..fa649b3a 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -3652,161 +3652,6 @@ data: }, "description": "Database with the most active connections; high values identify the pressure source." }, - { - "id": 48, - "type": "stat", - "title": "GPU Pool Used", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 2, - "w": 4, - "x": 8, - "y": 21 - }, - "targets": [ - { - "expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "dark-green", - "value": null - }, - { - "color": "dark-yellow", - "value": 50 - }, - { - "color": "dark-orange", - "value": 75 - }, - { - "color": "dark-red", - "value": 91.5 - } - ] - }, - "unit": "percent", - "custom": { - "displayMode": "auto" - }, - "decimals": 1 - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - }, - "links": [ - { - "title": "Open atlas-gpu dashboard", - "url": "/d/atlas-gpu", - "targetBlank": true - } - ], - "description": "Current GPU utilization across the monitored GPU pool, normalized by active telemetry sources." - }, - { - "id": 49, - "type": "stat", - "title": "GPU Active Devices", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 2, - "w": 4, - "x": 12, - "y": 21 - }, - "targets": [ - { - "expr": "((sum((max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) > bool 0) or on() vector(0)) + (sum((((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) > bool 0) or on() vector(0)))", - "refId": "A", - "legendFormat": "active", - "instant": true - }, - { - "expr": "((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))", - "refId": "B", - "legendFormat": "total", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "dark-green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - }, - "decimals": 0 - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "name_and_value" - }, - "links": [ - { - "title": "Open atlas-gpu dashboard", - "url": "/d/atlas-gpu", - "targetBlank": true - } - ], - "description": "Active GPU devices compared with total monitored GPU devices." - }, { "id": 11, "type": "piechart", @@ -3892,7 +3737,7 @@ data: }, "targets": [ { - "expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or (label_replace(clamp_min(100 - (sum(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))", + "expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}", "instant": true @@ -3944,7 +3789,7 @@ data: "targetBlank": false } ], - "description": "Instant GPU utilization normalized to the monitored GPU pool. Process-level namespace slices appear where available; unattributed covers GPU nodes without process attribution." + "description": "Instant share of observed GPU compute activity by namespace. Host covers GPU work outside Kubernetes pods; idle appears only when observed GPU activity is zero." }, { "id": 13,