monitoring(gpu): clarify reservation accounting

This commit is contained in:
jenkins 2026-05-21 13:04:26 -03:00
parent 39db0471d7
commit d9955af899
6 changed files with 43 additions and 33 deletions

View File

@ -304,6 +304,10 @@ def gpu_requests_by_namespace_node(scope_var):
)
def gpu_requests_by_namespace(scope_var):
    """Return a PromQL expression summing GPU requests per namespace.

    The per-(namespace, node) expression produced by
    ``gpu_requests_by_namespace_node(scope_var)`` is wrapped in a
    ``sum by (namespace)`` aggregation, collapsing the ``node`` dimension.
    """
    per_namespace_node = gpu_requests_by_namespace_node(scope_var)
    return f"sum by (namespace) ({per_namespace_node})"
def gpu_usage_by_namespace(scope_var):
requests_by_ns = gpu_requests_by_namespace_node(scope_var)
total_by_node = f"sum by (node) ({requests_by_ns})"
@ -340,10 +344,10 @@ def namespace_ram_share_expr(scope_var):
def namespace_gpu_share_expr(scope_var):
    """Return the PromQL expression for the per-namespace GPU reservation pie.

    The expression is built from ``gpu_requests_by_namespace(scope_var)``
    (GPU requests, not device utilization) and has two branches joined
    with ``or``:

    * ``share`` -- each namespace's peak reservation over the dashboard
      range, as a percentage of the sum of all namespaces' peaks.
    * ``idle`` -- a single synthetic series labelled ``namespace="idle"``
      with value 100, emitted only when no reservation series exist.

    The earlier utilization-based ``usage`` intermediates were overwritten
    before being read, so they are not computed here.
    """
    # Peak reservation per namespace across the selected range, evaluated
    # as a subquery at the panel interval.
    reservation = f"max_over_time(({gpu_requests_by_namespace(scope_var)})[$__range:$__interval])"
    # `or on() vector(0)` keeps both aggregates defined when nothing matches.
    total = f"(sum({reservation}) or on() vector(0))"
    present = f"(count({reservation}) or on() vector(0))"
    # clamp_min(..., 1) guards the division when the total is zero.
    share = f"100 * ({reservation}) / clamp_min({total}, 1)"
    # The idle slice only survives the `and on()` when the series count is 0.
    idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + present + " == 0)"
    return f"({share}) or ({idle})"
@ -1802,7 +1806,7 @@ OVERVIEW_PANEL_DESCRIPTIONS = {
"Postgres Connections Used": "Current Postgres connections; lower leaves room for apps during spikes.",
"Postgres Hottest Connections": "Database with the most active connections; high values identify the pressure source.",
"Namespace CPU Share": "CPU share by namespace in the selected scope; big slices show who is using compute.",
"Namespace GPU Share": "GPU share by namespace in the selected scope; idle is good unless GPU work is expected.",
"Namespace GPU Reservation": "GPU reservation share by namespace in the selected scope. This is allocation, not per-process GPU utilization.",
"Namespace RAM Share": "Memory share by namespace in the selected scope; big slices show who may drive pressure.",
"Worker Node CPU": "Worker CPU over time; lower is calmer, sustained high load may need rescheduling.",
"Worker Node RAM": "Worker memory over time; lower is safer, sustained high use risks evictions.",
@ -2815,11 +2819,11 @@ def build_overview():
panels.append(
pie_panel(
12,
"Namespace GPU Share",
"Namespace GPU Reservation",
namespace_gpu_share_expr(gpu_scope),
{"h": 9, "w": 8, "x": 8, "y": 23},
links=namespace_scope_links("namespace_scope_gpu"),
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
description="Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization.",
)
)
panels.append(
@ -5336,23 +5340,24 @@ def build_gpu_dashboard():
panels.append(
pie_panel(
1,
"Namespace GPU Share",
"Namespace GPU Reservation",
namespace_gpu_share_expr(gpu_scope),
{"h": 8, "w": 12, "x": 0, "y": 0},
links=namespace_scope_links("namespace_scope_gpu"),
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
description="Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization.",
)
)
panels.append(
timeseries_panel(
2,
"GPU Util by Namespace",
"GPU Activity by Reservation",
namespace_gpu_usage_instant(gpu_scope),
{"h": 8, "w": 12, "x": 12, "y": 0},
unit="percent",
legend="{{namespace}}",
legend_display="table",
legend_placement="right",
description="Node/device GPU activity attributed by each namespace's GPU reservation on that node.",
)
)
panels.append(
@ -5370,11 +5375,12 @@ def build_gpu_dashboard():
panels.append(
table_panel(
4,
"Top Pods by GPU Util",
"GPU Pods Reporting Device Util",
'topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=""}) by (namespace,pod,Hostname))',
{"h": 8, "w": 12, "x": 12, "y": 8},
unit="percent",
transformations=[{"id": "labelsToFields", "options": {}}],
description="DCGM labels the device utilization sample with GPU-consuming pods; shared-GPU pods can report the same device value.",
)
)
return {

View File

@ -154,9 +154,9 @@ def test_overview_uses_readable_quality_power_and_gitops_panels():
assert "backup-telemetry-missing" in pvc_backup_expr
assert 'pvc_backup_(count|last_success_timestamp_seconds|health_reason)' in pvc_backup_expr
gpu_expr = panels_by_title["Namespace GPU Share"]["targets"][0]["expr"]
gpu_expr = panels_by_title["Namespace GPU Reservation"]["targets"][0]["expr"]
assert 'resource=~"nvidia(_com_|[.]com/)gpu.*"' in gpu_expr
assert "/ on(node) group_left() clamp_min" in gpu_expr
assert "sum by (namespace)" in gpu_expr
assert "kube_node_status_allocatable" in gpu_expr
assert "kube_node_labels" not in gpu_expr
assert "count(max_over_time(" in gpu_expr

View File

@ -7,7 +7,7 @@
{
"id": 1,
"type": "piechart",
"title": "Namespace GPU Share",
"title": "Namespace GPU Reservation",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -20,7 +20,7 @@
},
"targets": [
{
"expr": "(100 * (max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) 
kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)) == 0))",
"expr": "(100 * (max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)) == 0))",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -71,12 +71,12 @@
"targetBlank": false
}
],
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator."
"description": "Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization."
},
{
"id": 2,
"type": "timeseries",
"title": "GPU Util by Namespace",
"title": "GPU Activity by Reservation",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -108,7 +108,8 @@
"tooltip": {
"mode": "multi"
}
}
},
"description": "Node/device GPU activity attributed by each namespace's GPU reservation on that node."
},
{
"id": 3,
@ -150,7 +151,7 @@
{
"id": 4,
"type": "table",
"title": "Top Pods by GPU Util",
"title": "GPU Pods Reporting Device Util",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -185,7 +186,8 @@
"id": "labelsToFields",
"options": {}
}
]
],
"description": "DCGM labels the device utilization sample with GPU-consuming pods; shared-GPU pods can report the same device value."
}
],
"time": {

View File

@ -3715,7 +3715,7 @@
{
"id": 12,
"type": "piechart",
"title": "Namespace GPU Share",
"title": "Namespace GPU Reservation",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -3728,7 +3728,7 @@
},
"targets": [
{
"expr": "(100 * (max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) 
kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)) == 0))",
"expr": "(100 * (max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)) == 0))",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -3779,7 +3779,7 @@
"targetBlank": false
}
],
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator."
"description": "Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization."
},
{
"id": 13,

View File

@ -16,7 +16,7 @@ data:
{
"id": 1,
"type": "piechart",
"title": "Namespace GPU Share",
"title": "Namespace GPU Reservation",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -29,7 +29,7 @@ data:
},
"targets": [
{
"expr": "(100 * (max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) 
kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)) == 0))",
"expr": "(100 * (max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)) == 0))",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -80,12 +80,12 @@ data:
"targetBlank": false
}
],
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator."
"description": "Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization."
},
{
"id": 2,
"type": "timeseries",
"title": "GPU Util by Namespace",
"title": "GPU Activity by Reservation",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -117,7 +117,8 @@ data:
"tooltip": {
"mode": "multi"
}
}
},
"description": "Node/device GPU activity attributed by each namespace's GPU reservation on that node."
},
{
"id": 3,
@ -159,7 +160,7 @@ data:
{
"id": 4,
"type": "table",
"title": "Top Pods by GPU Util",
"title": "GPU Pods Reporting Device Util",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -194,7 +195,8 @@ data:
"id": "labelsToFields",
"options": {}
}
]
],
"description": "DCGM labels the device utilization sample with GPU-consuming pods; shared-GPU pods can report the same device value."
}
],
"time": {

View File

@ -3724,7 +3724,7 @@ data:
{
"id": 12,
"type": "piechart",
"title": "Namespace GPU Share",
"title": "Namespace GPU Reservation",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -3737,7 +3737,7 @@ data:
},
"targets": [
{
"expr": "(100 * (max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) 
kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)) == 0))",
"expr": "(100 * (max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)) == 0))",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -3788,7 +3788,7 @@ data:
"targetBlank": false
}
],
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator."
"description": "Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization."
},
{
"id": 13,