diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 3b788ff5..f5202fb7 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -304,6 +304,10 @@ def gpu_requests_by_namespace_node(scope_var): ) +def gpu_requests_by_namespace(scope_var): + return f"sum by (namespace) ({gpu_requests_by_namespace_node(scope_var)})" + + def gpu_usage_by_namespace(scope_var): requests_by_ns = gpu_requests_by_namespace_node(scope_var) total_by_node = f"sum by (node) ({requests_by_ns})" @@ -340,10 +344,10 @@ def namespace_ram_share_expr(scope_var): def namespace_gpu_share_expr(scope_var): - usage = f"max_over_time(({namespace_gpu_usage_instant(scope_var)})[$__range:$__interval])" - total = f"(sum({usage}) or on() vector(0))" - present = f"(count({usage}) or on() vector(0))" - share = f"100 * ({usage}) / clamp_min({total}, 1)" + reservation = f"max_over_time(({gpu_requests_by_namespace(scope_var)})[$__range:$__interval])" + total = f"(sum({reservation}) or on() vector(0))" + present = f"(count({reservation}) or on() vector(0))" + share = f"100 * ({reservation}) / clamp_min({total}, 1)" idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + present + " == 0)" return f"({share}) or ({idle})" @@ -1802,7 +1806,7 @@ OVERVIEW_PANEL_DESCRIPTIONS = { "Postgres Connections Used": "Current Postgres connections; lower leaves room for apps during spikes.", "Postgres Hottest Connections": "Database with the most active connections; high values identify the pressure source.", "Namespace CPU Share": "CPU share by namespace in the selected scope; big slices show who is using compute.", - "Namespace GPU Share": "GPU share by namespace in the selected scope; idle is good unless GPU work is expected.", + "Namespace GPU Reservation": "GPU reservation share by namespace in the selected scope. This is allocation, not per-process GPU utilization.", "Namespace RAM Share": "Memory share by namespace in the selected scope; big slices show who may drive pressure.", "Worker Node CPU": "Worker CPU over time; lower is calmer, sustained high load may need rescheduling.", "Worker Node RAM": "Worker memory over time; lower is safer, sustained high use risks evictions.", @@ -2815,11 +2819,11 @@ def build_overview(): panels.append( pie_panel( 12, - "Namespace GPU Share", + "Namespace GPU Reservation", namespace_gpu_share_expr(gpu_scope), {"h": 9, "w": 8, "x": 8, "y": 23}, links=namespace_scope_links("namespace_scope_gpu"), - description="Shares are normalized within the selected filter. Switching scope changes the denominator.", + description="Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization.", ) ) panels.append( @@ -5336,23 +5340,24 @@ def build_gpu_dashboard(): panels.append( pie_panel( 1, - "Namespace GPU Share", + "Namespace GPU Reservation", namespace_gpu_share_expr(gpu_scope), {"h": 8, "w": 12, "x": 0, "y": 0}, links=namespace_scope_links("namespace_scope_gpu"), - description="Shares are normalized within the selected filter. Switching scope changes the denominator.", + description="Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization.", ) ) panels.append( timeseries_panel( 2, - "GPU Util by Namespace", + "GPU Activity by Reservation", namespace_gpu_usage_instant(gpu_scope), {"h": 8, "w": 12, "x": 12, "y": 0}, unit="percent", legend="{{namespace}}", legend_display="table", legend_placement="right", + description="Node/device GPU activity attributed by each namespace's GPU reservation on that node.", ) ) panels.append( @@ -5370,11 +5375,12 @@ def build_gpu_dashboard(): panels.append( table_panel( 4, - "Top Pods by GPU Util", + "GPU Pods Reporting Device Util", 'topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=""}) by (namespace,pod,Hostname))', {"h": 8, "w": 12, "x": 12, "y": 8}, unit="percent", transformations=[{"id": "labelsToFields", "options": {}}], + description="DCGM labels the device utilization sample with GPU-consuming pods; shared-GPU pods can report the same device value.", ) ) return { diff --git a/scripts/tests/test_dashboards_render_atlas.py b/scripts/tests/test_dashboards_render_atlas.py index eac67c62..8dfc3575 100644 --- a/scripts/tests/test_dashboards_render_atlas.py +++ b/scripts/tests/test_dashboards_render_atlas.py @@ -154,9 +154,9 @@ def test_overview_uses_readable_quality_power_and_gitops_panels(): assert "backup-telemetry-missing" in pvc_backup_expr assert 'pvc_backup_(count|last_success_timestamp_seconds|health_reason)' in pvc_backup_expr - gpu_expr = panels_by_title["Namespace GPU Share"]["targets"][0]["expr"] + gpu_expr = panels_by_title["Namespace GPU Reservation"]["targets"][0]["expr"] assert 'resource=~"nvidia(_com_|[.]com/)gpu.*"' in gpu_expr - assert "/ on(node) group_left() clamp_min" in gpu_expr + assert "sum by (namespace)" in gpu_expr assert "kube_node_status_allocatable" in gpu_expr assert "kube_node_labels" not in gpu_expr assert "count(max_over_time(" in gpu_expr diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index b1c9e58d..79416381 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -7,7 +7,7 @@ { "id": 1, "type": "piechart", - "title": "Namespace GPU Share", + "title": "Namespace GPU Reservation", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "(100 * (max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)) == 0))", + "expr": "(100 * (max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -71,12 +71,12 @@ "targetBlank": false } ], - "description": "Shares are normalized within the selected filter. Switching scope changes the denominator." + "description": "Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization." }, { "id": 2, "type": "timeseries", - "title": "GPU Util by Namespace", + "title": "GPU Activity by Reservation", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -108,7 +108,8 @@ "tooltip": { "mode": "multi" } - } + }, + "description": "Node/device GPU activity attributed by each namespace's GPU reservation on that node." }, { "id": 3, @@ -150,7 +151,7 @@ { "id": 4, "type": "table", - "title": "Top Pods by GPU Util", + "title": "GPU Pods Reporting Device Util", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -185,7 +186,8 @@ "id": "labelsToFields", "options": {} } - ] + ], + "description": "DCGM labels the device utilization sample with GPU-consuming pods; shared-GPU pods can report the same device value." } ], "time": { diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 15dd6474..ba70cf27 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -3715,7 +3715,7 @@ { "id": 12, "type": "piechart", - "title": "Namespace GPU Share", + "title": "Namespace GPU Reservation", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -3728,7 +3728,7 @@ }, "targets": [ { - "expr": "(100 * (max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)) == 0))", + "expr": "(100 * (max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -3779,7 +3779,7 @@ "targetBlank": false } ], - "description": "Shares are normalized within the selected filter. Switching scope changes the denominator." + "description": "Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization." }, { "id": 13, diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index 8d8f13dc..5b8ca9ff 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -16,7 +16,7 @@ data: { "id": 1, "type": "piechart", - "title": "Namespace GPU Share", + "title": "Namespace GPU Reservation", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "(100 * (max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)) == 0))", + "expr": "(100 * (max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -80,12 +80,12 @@ data: "targetBlank": false } ], - "description": "Shares are normalized within the selected filter. Switching scope changes the denominator." + "description": "Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization." }, { "id": 2, "type": "timeseries", - "title": "GPU Util by Namespace", + "title": "GPU Activity by Reservation", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -117,7 +117,8 @@ data: "tooltip": { "mode": "multi" } - } + }, + "description": "Node/device GPU activity attributed by each namespace's GPU reservation on that node." }, { "id": 3, @@ -159,7 +160,7 @@ data: { "id": 4, "type": "table", - "title": "Top Pods by GPU Util", + "title": "GPU Pods Reporting Device Util", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -194,7 +195,8 @@ data: "id": "labelsToFields", "options": {} } - ] + ], + "description": "DCGM labels the device utilization sample with GPU-consuming pods; shared-GPU pods can report the same device value." } ], "time": { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 648efa66..9a3c8eaf 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -3724,7 +3724,7 @@ data: { "id": 12, "type": "piechart", - "title": "Namespace GPU Share", + "title": "Namespace GPU Reservation", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -3737,7 +3737,7 @@ data: }, "targets": [ { - "expr": "(100 * (max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)) == 0))", + "expr": "(100 * (max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -3788,7 +3788,7 @@ data: "targetBlank": false } ], - "description": "Shares are normalized within the selected filter. Switching scope changes the denominator." + "description": "Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization." }, { "id": 13,