From 4e82df689188ccef10fc32236616cd6970baf69e Mon Sep 17 00:00:00 2001 From: jenkins Date: Thu, 21 May 2026 15:26:02 -0300 Subject: [PATCH] monitoring(gpu): show utilization with idle fallback --- scripts/dashboards_render_atlas.py | 52 +++++++++++++++---- scripts/tests/test_dashboards_render_atlas.py | 8 +-- services/monitoring/dashboards/atlas-gpu.json | 6 +-- .../monitoring/dashboards/atlas-overview.json | 6 +-- .../monitoring/grafana-dashboard-gpu.yaml | 6 +-- .../grafana-dashboard-overview.yaml | 6 +-- 6 files changed, 59 insertions(+), 25 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index f5202fb7..426baed3 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -308,6 +308,40 @@ def gpu_requests_by_namespace(scope_var): return f"sum by (namespace) ({gpu_requests_by_namespace_node(scope_var)})" +def dcgm_gpu_util_metric(scope_var): + return f'DCGM_FI_DEV_GPU_UTIL{{pod!="",namespace!="",{scope_var}}}' + + +def dcgm_gpu_namespace_count_by_uuid(scope_var): + dcgm = dcgm_gpu_util_metric(scope_var) + return f"count by (UUID) (count by (UUID,namespace) ({dcgm}))" + + +def dcgm_gpu_utilization_by_namespace(scope_var): + dcgm = dcgm_gpu_util_metric(scope_var) + namespace_count = dcgm_gpu_namespace_count_by_uuid(scope_var) + unambiguous = ( + "sum by (namespace) (" + "avg_over_time((" + f"(max by (UUID,namespace) ({dcgm})) " + f"and on(UUID) ({namespace_count} == 1)" + ")[$__range:$__interval])" + ")" + ) + shared = ( + 'label_replace(sum(avg_over_time((' + f"(max by (UUID) ({dcgm})) " + f"and on(UUID) ({namespace_count} > 1)" + ')[$__range:$__interval])), "namespace", "shared", "", "")' + ) + return f"({unambiguous}) or ({shared})" + + +def dcgm_gpu_utilization_present(scope_var): + dcgm = dcgm_gpu_util_metric(scope_var) + return f"(sum(max_over_time((max by (UUID) ({dcgm}))[$__range:$__interval])) or on() vector(0))" + + def gpu_usage_by_namespace(scope_var): requests_by_ns = gpu_requests_by_namespace_node(scope_var) total_by_node = f"sum by (node) ({requests_by_ns})" @@ -344,10 +378,10 @@ def namespace_ram_share_expr(scope_var): def namespace_gpu_share_expr(scope_var): - reservation = f"max_over_time(({gpu_requests_by_namespace(scope_var)})[$__range:$__interval])" - total = f"(sum({reservation}) or on() vector(0))" - present = f"(count({reservation}) or on() vector(0))" - share = f"100 * ({reservation}) / clamp_min({total}, 1)" + utilization = dcgm_gpu_utilization_by_namespace(scope_var) + total = f"(sum({utilization}) or on() vector(0))" + present = dcgm_gpu_utilization_present(scope_var) + share = f"100 * ({utilization}) / clamp_min({total}, 1)" idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + present + " == 0)" return f"({share}) or ({idle})" @@ -1806,7 +1840,7 @@ OVERVIEW_PANEL_DESCRIPTIONS = { "Postgres Connections Used": "Current Postgres connections; lower leaves room for apps during spikes.", "Postgres Hottest Connections": "Database with the most active connections; high values identify the pressure source.", "Namespace CPU Share": "CPU share by namespace in the selected scope; big slices show who is using compute.", - "Namespace GPU Reservation": "GPU reservation share by namespace in the selected scope. This is allocation, not per-process GPU utilization.", + "Namespace GPU Utilization": "Measured GPU activity share by namespace in the selected scope. Ambiguous shared-device activity is grouped as shared; idle appears only when utilization is zero.", "Namespace RAM Share": "Memory share by namespace in the selected scope; big slices show who may drive pressure.", "Worker Node CPU": "Worker CPU over time; lower is calmer, sustained high load may need rescheduling.", "Worker Node RAM": "Worker memory over time; lower is safer, sustained high use risks evictions.", @@ -2819,11 +2853,11 @@ def build_overview(): panels.append( pie_panel( 12, - "Namespace GPU Reservation", + "Namespace GPU Utilization", namespace_gpu_share_expr(gpu_scope), {"h": 9, "w": 8, "x": 8, "y": 23}, links=namespace_scope_links("namespace_scope_gpu"), - description="Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization.", + description="Shares are based on measured DCGM GPU utilization. If multiple namespaces share one physical GPU and DCGM cannot attribute work safely, activity is grouped as shared. Idle appears only when utilization is zero.", ) ) panels.append( @@ -5340,11 +5374,11 @@ def build_gpu_dashboard(): panels.append( pie_panel( 1, - "Namespace GPU Reservation", + "Namespace GPU Utilization", namespace_gpu_share_expr(gpu_scope), {"h": 8, "w": 12, "x": 0, "y": 0}, links=namespace_scope_links("namespace_scope_gpu"), - description="Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization.", + description="Shares are based on measured DCGM GPU utilization. If multiple namespaces share one physical GPU and DCGM cannot attribute work safely, activity is grouped as shared. Idle appears only when utilization is zero.", ) ) panels.append( diff --git a/scripts/tests/test_dashboards_render_atlas.py b/scripts/tests/test_dashboards_render_atlas.py index 8dfc3575..918449bf 100644 --- a/scripts/tests/test_dashboards_render_atlas.py +++ b/scripts/tests/test_dashboards_render_atlas.py @@ -154,12 +154,12 @@ def test_overview_uses_readable_quality_power_and_gitops_panels(): assert "backup-telemetry-missing" in pvc_backup_expr assert 'pvc_backup_(count|last_success_timestamp_seconds|health_reason)' in pvc_backup_expr - gpu_expr = panels_by_title["Namespace GPU Reservation"]["targets"][0]["expr"] - assert 'resource=~"nvidia(_com_|[.]com/)gpu.*"' in gpu_expr + gpu_expr = panels_by_title["Namespace GPU Utilization"]["targets"][0]["expr"] + assert "DCGM_FI_DEV_GPU_UTIL" in gpu_expr assert "sum by (namespace)" in gpu_expr - assert "kube_node_status_allocatable" in gpu_expr + assert 'namespace", "shared"' in gpu_expr assert "kube_node_labels" not in gpu_expr - assert "count(max_over_time(" in gpu_expr + assert "sum(max_over_time(" in gpu_expr assert 'namespace", "idle"' in gpu_expr diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index 79416381..3b6fdfdb 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -7,7 +7,7 @@ { "id": 1, "type": "piechart", - "title": "Namespace GPU Reservation", + "title": "Namespace GPU Utilization", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "(100 * (max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)) == 0))", + "expr": "(100 * ((sum by (namespace) (avg_over_time(((max by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) == 1))[$__range:$__interval]))) or (label_replace(sum(avg_over_time(((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) > 1))[$__range:$__interval])), \"namespace\", \"shared\", \"\", \"\"))) / clamp_min((sum((sum by (namespace) (avg_over_time(((max by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) == 1))[$__range:$__interval]))) or (label_replace(sum(avg_over_time(((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) > 1))[$__range:$__interval])), \"namespace\", \"shared\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(max_over_time((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu}))[$__range:$__interval])) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -71,7 +71,7 @@ "targetBlank": false } ], - "description": "Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization." + "description": "Shares are based on measured DCGM GPU utilization. If multiple namespaces share one physical GPU and DCGM cannot attribute work safely, activity is grouped as shared. Idle appears only when utilization is zero." }, { "id": 2, diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index ba70cf27..71d36424 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -3715,7 +3715,7 @@ { "id": 12, "type": "piechart", - "title": "Namespace GPU Reservation", + "title": "Namespace GPU Utilization", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -3728,7 +3728,7 @@ }, "targets": [ { - "expr": "(100 * (max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)) == 0))", + "expr": "(100 * ((sum by (namespace) (avg_over_time(((max by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) == 1))[$__range:$__interval]))) or (label_replace(sum(avg_over_time(((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) > 1))[$__range:$__interval])), \"namespace\", \"shared\", \"\", \"\"))) / clamp_min((sum((sum by (namespace) (avg_over_time(((max by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) == 1))[$__range:$__interval]))) or (label_replace(sum(avg_over_time(((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) > 1))[$__range:$__interval])), \"namespace\", \"shared\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(max_over_time((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu}))[$__range:$__interval])) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -3779,7 +3779,7 @@ "targetBlank": false } ], - "description": "Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization." + "description": "Shares are based on measured DCGM GPU utilization. If multiple namespaces share one physical GPU and DCGM cannot attribute work safely, activity is grouped as shared. Idle appears only when utilization is zero." }, { "id": 13, diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index 5b8ca9ff..06dd4d5a 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -16,7 +16,7 @@ data: { "id": 1, "type": "piechart", - "title": "Namespace GPU Reservation", + "title": "Namespace GPU Utilization", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "(100 * (max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)) == 0))", + "expr": "(100 * ((sum by (namespace) (avg_over_time(((max by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) == 1))[$__range:$__interval]))) or (label_replace(sum(avg_over_time(((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) > 1))[$__range:$__interval])), \"namespace\", \"shared\", \"\", \"\"))) / clamp_min((sum((sum by (namespace) (avg_over_time(((max by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) == 1))[$__range:$__interval]))) or (label_replace(sum(avg_over_time(((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) > 1))[$__range:$__interval])), \"namespace\", \"shared\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(max_over_time((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu}))[$__range:$__interval])) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -80,7 +80,7 @@ data: "targetBlank": false } ], - "description": "Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization." + "description": "Shares are based on measured DCGM GPU utilization. If multiple namespaces share one physical GPU and DCGM cannot attribute work safely, activity is grouped as shared. Idle appears only when utilization is zero." }, { "id": 2, diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 9a3c8eaf..3fabf36e 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -3724,7 +3724,7 @@ data: { "id": 12, "type": "piechart", - "title": "Namespace GPU Reservation", + "title": "Namespace GPU Utilization", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -3737,7 +3737,7 @@ data: }, "targets": [ { - "expr": "(100 * (max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)) == 0))", + "expr": "(100 * ((sum by (namespace) (avg_over_time(((max by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) == 1))[$__range:$__interval]))) or (label_replace(sum(avg_over_time(((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) > 1))[$__range:$__interval])), \"namespace\", \"shared\", \"\", \"\"))) / clamp_min((sum((sum by (namespace) (avg_over_time(((max by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) == 1))[$__range:$__interval]))) or (label_replace(sum(avg_over_time(((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) > 1))[$__range:$__interval])), \"namespace\", \"shared\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(max_over_time((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu}))[$__range:$__interval])) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -3788,7 +3788,7 @@ data: "targetBlank": false } ], - "description": "Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization." + "description": "Shares are based on measured DCGM GPU utilization. If multiple namespaces share one physical GPU and DCGM cannot attribute work safely, activity is grouped as shared. Idle appears only when utilization is zero." }, { "id": 13,