monitoring(gpu): clarify reservation accounting
This commit is contained in:
parent
39db0471d7
commit
d9955af899
@ -304,6 +304,10 @@ def gpu_requests_by_namespace_node(scope_var):
|
||||
)
|
||||
|
||||
|
||||
def gpu_requests_by_namespace(scope_var):
|
||||
return f"sum by (namespace) ({gpu_requests_by_namespace_node(scope_var)})"
|
||||
|
||||
|
||||
def gpu_usage_by_namespace(scope_var):
|
||||
requests_by_ns = gpu_requests_by_namespace_node(scope_var)
|
||||
total_by_node = f"sum by (node) ({requests_by_ns})"
|
||||
@ -340,10 +344,10 @@ def namespace_ram_share_expr(scope_var):
|
||||
|
||||
|
||||
def namespace_gpu_share_expr(scope_var):
|
||||
usage = f"max_over_time(({namespace_gpu_usage_instant(scope_var)})[$__range:$__interval])"
|
||||
total = f"(sum({usage}) or on() vector(0))"
|
||||
present = f"(count({usage}) or on() vector(0))"
|
||||
share = f"100 * ({usage}) / clamp_min({total}, 1)"
|
||||
reservation = f"max_over_time(({gpu_requests_by_namespace(scope_var)})[$__range:$__interval])"
|
||||
total = f"(sum({reservation}) or on() vector(0))"
|
||||
present = f"(count({reservation}) or on() vector(0))"
|
||||
share = f"100 * ({reservation}) / clamp_min({total}, 1)"
|
||||
idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + present + " == 0)"
|
||||
return f"({share}) or ({idle})"
|
||||
|
||||
@ -1802,7 +1806,7 @@ OVERVIEW_PANEL_DESCRIPTIONS = {
|
||||
"Postgres Connections Used": "Current Postgres connections; lower leaves room for apps during spikes.",
|
||||
"Postgres Hottest Connections": "Database with the most active connections; high values identify the pressure source.",
|
||||
"Namespace CPU Share": "CPU share by namespace in the selected scope; big slices show who is using compute.",
|
||||
"Namespace GPU Share": "GPU share by namespace in the selected scope; idle is good unless GPU work is expected.",
|
||||
"Namespace GPU Reservation": "GPU reservation share by namespace in the selected scope. This is allocation, not per-process GPU utilization.",
|
||||
"Namespace RAM Share": "Memory share by namespace in the selected scope; big slices show who may drive pressure.",
|
||||
"Worker Node CPU": "Worker CPU over time; lower is calmer, sustained high load may need rescheduling.",
|
||||
"Worker Node RAM": "Worker memory over time; lower is safer, sustained high use risks evictions.",
|
||||
@ -2815,11 +2819,11 @@ def build_overview():
|
||||
panels.append(
|
||||
pie_panel(
|
||||
12,
|
||||
"Namespace GPU Share",
|
||||
"Namespace GPU Reservation",
|
||||
namespace_gpu_share_expr(gpu_scope),
|
||||
{"h": 9, "w": 8, "x": 8, "y": 23},
|
||||
links=namespace_scope_links("namespace_scope_gpu"),
|
||||
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
|
||||
description="Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization.",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
@ -5336,23 +5340,24 @@ def build_gpu_dashboard():
|
||||
panels.append(
|
||||
pie_panel(
|
||||
1,
|
||||
"Namespace GPU Share",
|
||||
"Namespace GPU Reservation",
|
||||
namespace_gpu_share_expr(gpu_scope),
|
||||
{"h": 8, "w": 12, "x": 0, "y": 0},
|
||||
links=namespace_scope_links("namespace_scope_gpu"),
|
||||
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
|
||||
description="Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization.",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
2,
|
||||
"GPU Util by Namespace",
|
||||
"GPU Activity by Reservation",
|
||||
namespace_gpu_usage_instant(gpu_scope),
|
||||
{"h": 8, "w": 12, "x": 12, "y": 0},
|
||||
unit="percent",
|
||||
legend="{{namespace}}",
|
||||
legend_display="table",
|
||||
legend_placement="right",
|
||||
description="Node/device GPU activity attributed by each namespace's GPU reservation on that node.",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
@ -5370,11 +5375,12 @@ def build_gpu_dashboard():
|
||||
panels.append(
|
||||
table_panel(
|
||||
4,
|
||||
"Top Pods by GPU Util",
|
||||
"GPU Pods Reporting Device Util",
|
||||
'topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=""}) by (namespace,pod,Hostname))',
|
||||
{"h": 8, "w": 12, "x": 12, "y": 8},
|
||||
unit="percent",
|
||||
transformations=[{"id": "labelsToFields", "options": {}}],
|
||||
description="DCGM labels the device utilization sample with GPU-consuming pods; shared-GPU pods can report the same device value.",
|
||||
)
|
||||
)
|
||||
return {
|
||||
|
||||
@ -154,9 +154,9 @@ def test_overview_uses_readable_quality_power_and_gitops_panels():
|
||||
assert "backup-telemetry-missing" in pvc_backup_expr
|
||||
assert 'pvc_backup_(count|last_success_timestamp_seconds|health_reason)' in pvc_backup_expr
|
||||
|
||||
gpu_expr = panels_by_title["Namespace GPU Share"]["targets"][0]["expr"]
|
||||
gpu_expr = panels_by_title["Namespace GPU Reservation"]["targets"][0]["expr"]
|
||||
assert 'resource=~"nvidia(_com_|[.]com/)gpu.*"' in gpu_expr
|
||||
assert "/ on(node) group_left() clamp_min" in gpu_expr
|
||||
assert "sum by (namespace)" in gpu_expr
|
||||
assert "kube_node_status_allocatable" in gpu_expr
|
||||
assert "kube_node_labels" not in gpu_expr
|
||||
assert "count(max_over_time(" in gpu_expr
|
||||
|
||||
@ -7,7 +7,7 @@
|
||||
{
|
||||
"id": 1,
|
||||
"type": "piechart",
|
||||
"title": "Namespace GPU Share",
|
||||
"title": "Namespace GPU Reservation",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -20,7 +20,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(100 * (max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)) == 0))",
|
||||
"expr": "(100 * (max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)) == 0))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -71,12 +71,12 @@
|
||||
"targetBlank": false
|
||||
}
|
||||
],
|
||||
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator."
|
||||
"description": "Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization."
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "timeseries",
|
||||
"title": "GPU Util by Namespace",
|
||||
"title": "GPU Activity by Reservation",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -108,7 +108,8 @@
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
}
|
||||
},
|
||||
"description": "Node/device GPU activity attributed by each namespace's GPU reservation on that node."
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
@ -150,7 +151,7 @@
|
||||
{
|
||||
"id": 4,
|
||||
"type": "table",
|
||||
"title": "Top Pods by GPU Util",
|
||||
"title": "GPU Pods Reporting Device Util",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -185,7 +186,8 @@
|
||||
"id": "labelsToFields",
|
||||
"options": {}
|
||||
}
|
||||
]
|
||||
],
|
||||
"description": "DCGM labels the device utilization sample with GPU-consuming pods; shared-GPU pods can report the same device value."
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
|
||||
@ -3715,7 +3715,7 @@
|
||||
{
|
||||
"id": 12,
|
||||
"type": "piechart",
|
||||
"title": "Namespace GPU Share",
|
||||
"title": "Namespace GPU Reservation",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -3728,7 +3728,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(100 * (max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)) == 0))",
|
||||
"expr": "(100 * (max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)) == 0))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -3779,7 +3779,7 @@
|
||||
"targetBlank": false
|
||||
}
|
||||
],
|
||||
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator."
|
||||
"description": "Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization."
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
|
||||
@ -16,7 +16,7 @@ data:
|
||||
{
|
||||
"id": 1,
|
||||
"type": "piechart",
|
||||
"title": "Namespace GPU Share",
|
||||
"title": "Namespace GPU Reservation",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -29,7 +29,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(100 * (max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)) == 0))",
|
||||
"expr": "(100 * (max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)) == 0))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -80,12 +80,12 @@ data:
|
||||
"targetBlank": false
|
||||
}
|
||||
],
|
||||
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator."
|
||||
"description": "Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization."
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "timeseries",
|
||||
"title": "GPU Util by Namespace",
|
||||
"title": "GPU Activity by Reservation",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -117,7 +117,8 @@ data:
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
}
|
||||
},
|
||||
"description": "Node/device GPU activity attributed by each namespace's GPU reservation on that node."
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
@ -159,7 +160,7 @@ data:
|
||||
{
|
||||
"id": 4,
|
||||
"type": "table",
|
||||
"title": "Top Pods by GPU Util",
|
||||
"title": "GPU Pods Reporting Device Util",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -194,7 +195,8 @@ data:
|
||||
"id": "labelsToFields",
|
||||
"options": {}
|
||||
}
|
||||
]
|
||||
],
|
||||
"description": "DCGM labels the device utilization sample with GPU-consuming pods; shared-GPU pods can report the same device value."
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
|
||||
@ -3724,7 +3724,7 @@ data:
|
||||
{
|
||||
"id": 12,
|
||||
"type": "piechart",
|
||||
"title": "Namespace GPU Share",
|
||||
"title": "Namespace GPU Reservation",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -3737,7 +3737,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(100 * (max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval])) or on() vector(0)) == 0))",
|
||||
"expr": "(100 * (max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) / clamp_min((sum(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count(max_over_time((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))))[$__range:$__interval])) or on() vector(0)) == 0))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -3788,7 +3788,7 @@ data:
|
||||
"targetBlank": false
|
||||
}
|
||||
],
|
||||
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator."
|
||||
"description": "Shares are based on Kubernetes GPU requests seen in the selected range, not true per-process GPU utilization."
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user