monitoring(gpu): attribute utilization to namespaces
This commit is contained in:
parent
26af225f06
commit
72e4dcd84b
@ -308,40 +308,6 @@ def gpu_requests_by_namespace(scope_var):
|
||||
return f"sum by (namespace) ({gpu_requests_by_namespace_node(scope_var)})"
|
||||
|
||||
|
||||
def dcgm_gpu_util_metric(scope_var):
    """Return the PromQL selector for pod-attributed DCGM GPU utilization.

    The selector keeps only ``DCGM_FI_DEV_GPU_UTIL`` samples whose ``pod``
    and ``namespace`` labels are non-empty.

    Args:
        scope_var: Text spliced verbatim into the selector as extra label
            matcher(s), e.g. a dashboard variable such as
            ``$namespace_scope_gpu``.

    Returns:
        The selector as a string.
    """
    matchers = f'pod!="",namespace!="",{scope_var}'
    return "DCGM_FI_DEV_GPU_UTIL{" + matchers + "}"
|
||||
|
||||
|
||||
def dcgm_gpu_namespace_count_by_uuid(scope_var):
    """Return PromQL counting distinct namespaces per GPU ``UUID``.

    The inner ``count by (UUID,namespace)`` collapses the DCGM selector to
    one series per (UUID, namespace) pair; the outer ``count by (UUID)``
    then counts those pairs for each UUID.

    Args:
        scope_var: Label-matcher text forwarded to ``dcgm_gpu_util_metric``.

    Returns:
        The PromQL expression as a string.
    """
    pairs = f"count by (UUID,namespace) ({dcgm_gpu_util_metric(scope_var)})"
    return f"count by (UUID) ({pairs})"
|
||||
|
||||
|
||||
def dcgm_gpu_utilization_by_namespace(scope_var):
    """Return PromQL for range-averaged DCGM utilization per namespace.

    The expression is the ``or`` of two parts:

    * UUIDs that report exactly one namespace: utilization is averaged over
      ``[$__range:$__interval]`` and summed by ``namespace``.
    * UUIDs that report more than one namespace: the per-UUID maximum is
      averaged over the same window, summed, and relabelled with
      ``namespace="shared"``.

    Args:
        scope_var: Label-matcher text forwarded to the DCGM selector helpers.

    Returns:
        The PromQL expression as a string.
    """
    series = dcgm_gpu_util_metric(scope_var)
    namespaces_per_gpu = dcgm_gpu_namespace_count_by_uuid(scope_var)
    window = "[$__range:$__interval]"

    # GPUs observed with a single namespace keep their namespace label.
    exclusive_inner = (
        f"(max by (UUID,namespace) ({series})) "
        f"and on(UUID) ({namespaces_per_gpu} == 1)"
    )
    exclusive = f"sum by (namespace) (avg_over_time(({exclusive_inner}){window}))"

    # GPUs observed with several namespaces are pooled under "shared".
    pooled_inner = (
        f"(max by (UUID) ({series})) "
        f"and on(UUID) ({namespaces_per_gpu} > 1)"
    )
    pooled = (
        f"label_replace(sum(avg_over_time(({pooled_inner}){window})), "
        '"namespace", "shared", "", "")'
    )

    return "(" + exclusive + ") or (" + pooled + ")"
|
||||
|
||||
|
||||
def dcgm_gpu_utilization_present(scope_var):
    """Return PromQL summing each GPU's peak DCGM utilization over the range.

    For every ``UUID`` the maximum over ``[$__range:$__interval]`` is taken
    and the results are summed; ``or on() vector(0)`` supplies ``0`` when the
    sum yields no series.

    Args:
        scope_var: Label-matcher text forwarded to ``dcgm_gpu_util_metric``.

    Returns:
        The PromQL expression as a string.
    """
    per_device = f"max by (UUID) ({dcgm_gpu_util_metric(scope_var)})"
    peak_total = f"sum(max_over_time(({per_device})[$__range:$__interval]))"
    return f"({peak_total} or on() vector(0))"
|
||||
|
||||
|
||||
def gpu_usage_by_namespace(scope_var):
|
||||
requests_by_ns = gpu_requests_by_namespace_node(scope_var)
|
||||
total_by_node = f"sum by (node) ({requests_by_ns})"
|
||||
@ -377,13 +343,21 @@ def namespace_ram_share_expr(scope_var):
|
||||
return namespace_share_expr(namespace_ram_raw(scope_var))
|
||||
|
||||
|
||||
def current_gpu_claim_count(scope_var):
    """Return PromQL counting namespaces whose GPU requests are above zero.

    ``or on() vector(0)`` makes the expression evaluate to ``0`` instead of
    an empty result when no namespace passes the ``> 0`` filter.

    Args:
        scope_var: Label-matcher text forwarded to
            ``gpu_requests_by_namespace``.

    Returns:
        The PromQL expression as a string.
    """
    claiming = f"({gpu_requests_by_namespace(scope_var)}) > 0"
    return f"(count({claiming}) or on() vector(0))"
|
||||
|
||||
|
||||
def namespace_gpu_share_expr(scope_var):
    """Return PromQL for the "Namespace GPU Utilization" panel.

    The expression is the ``or`` of two parts:

    * ``gpu_usage_by_namespace`` averaged over ``[$__range:$__interval]``,
      kept only for namespaces whose ``gpu_requests_by_namespace`` value is
      currently above zero.
    * A constant ``100`` series labelled ``namespace="idle"``, emitted only
      when ``current_gpu_claim_count`` equals zero.

    Args:
        scope_var: Label-matcher text forwarded to the helper expressions.

    Returns:
        The PromQL expression as a string.
    """
    # Single implementation: a second, earlier body that returned a
    # DCGM-share expression made this claim-based logic unreachable.
    utilization = (
        f"avg_over_time(({gpu_usage_by_namespace(scope_var)})[$__range:$__interval]) "
        f"and on(namespace) (({gpu_requests_by_namespace(scope_var)}) > 0)"
    )
    # Placeholder slice so the panel is not empty when nothing claims a GPU.
    idle = (
        'label_replace(vector(100), "namespace", "idle", "", "") '
        f"and on() ({current_gpu_claim_count(scope_var)} == 0)"
    )
    return f"({utilization}) or ({idle})"
|
||||
|
||||
|
||||
PROBLEM_PODS_EXPR = (
|
||||
@ -1840,7 +1814,7 @@ OVERVIEW_PANEL_DESCRIPTIONS = {
|
||||
"Postgres Connections Used": "Current Postgres connections; lower leaves room for apps during spikes.",
|
||||
"Postgres Hottest Connections": "Database with the most active connections; high values identify the pressure source.",
|
||||
"Namespace CPU Share": "CPU share by namespace in the selected scope; big slices show who is using compute.",
|
||||
"Namespace GPU Utilization": "Measured GPU activity share by namespace in the selected scope. Ambiguous shared-device activity is grouped as shared; idle appears only when utilization is zero.",
|
||||
"Namespace GPU Utilization": "Measured GPU utilization attributed to namespaces with current GPU claims. Idle appears only when no namespace in scope currently claims a GPU.",
|
||||
"Namespace RAM Share": "Memory share by namespace in the selected scope; big slices show who may drive pressure.",
|
||||
"Worker Node CPU": "Worker CPU over time; lower is calmer, sustained high load may need rescheduling.",
|
||||
"Worker Node RAM": "Worker memory over time; lower is safer, sustained high use risks evictions.",
|
||||
@ -2857,7 +2831,7 @@ def build_overview():
|
||||
namespace_gpu_share_expr(gpu_scope),
|
||||
{"h": 9, "w": 8, "x": 8, "y": 23},
|
||||
links=namespace_scope_links("namespace_scope_gpu"),
|
||||
description="Shares are based on measured DCGM GPU utilization. If multiple namespaces share one physical GPU and DCGM cannot attribute work safely, activity is grouped as shared. Idle appears only when utilization is zero.",
|
||||
description="Measured GPU utilization is attributed to namespaces with current GPU claims. Idle appears only when no namespace in scope currently claims a GPU.",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
@ -5378,7 +5352,7 @@ def build_gpu_dashboard():
|
||||
namespace_gpu_share_expr(gpu_scope),
|
||||
{"h": 8, "w": 12, "x": 0, "y": 0},
|
||||
links=namespace_scope_links("namespace_scope_gpu"),
|
||||
description="Shares are based on measured DCGM GPU utilization. If multiple namespaces share one physical GPU and DCGM cannot attribute work safely, activity is grouped as shared. Idle appears only when utilization is zero.",
|
||||
description="Measured GPU utilization is attributed to namespaces with current GPU claims. Idle appears only when no namespace in scope currently claims a GPU.",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
|
||||
@ -157,9 +157,11 @@ def test_overview_uses_readable_quality_power_and_gitops_panels():
|
||||
gpu_expr = panels_by_title["Namespace GPU Utilization"]["targets"][0]["expr"]
|
||||
assert "DCGM_FI_DEV_GPU_UTIL" in gpu_expr
|
||||
assert "sum by (namespace)" in gpu_expr
|
||||
assert 'namespace", "shared"' in gpu_expr
|
||||
assert 'namespace", "shared"' not in gpu_expr
|
||||
assert "kube_node_labels" not in gpu_expr
|
||||
assert "sum(max_over_time(" in gpu_expr
|
||||
assert "avg_over_time(" in gpu_expr
|
||||
assert 'resource=~"nvidia(_com_|[.]com/)gpu.*"' in gpu_expr
|
||||
assert "and on(namespace)" in gpu_expr
|
||||
assert 'namespace", "idle"' in gpu_expr
|
||||
|
||||
|
||||
|
||||
@ -20,7 +20,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(100 * ((sum by (namespace) (avg_over_time(((max by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) == 1))[$__range:$__interval]))) or (label_replace(sum(avg_over_time(((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) > 1))[$__range:$__interval])), \"namespace\", \"shared\", \"\", \"\"))) / clamp_min((sum((sum by (namespace) (avg_over_time(((max by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) == 1))[$__range:$__interval]))) or (label_replace(sum(avg_over_time(((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) > 1))[$__range:$__interval])), \"namespace\", \"shared\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(max_over_time((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu}))[$__range:$__interval])) or on() vector(0)) == 0))",
|
||||
"expr": "(avg_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval]) and on(namespace) ((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0))))) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0))))) > 0) or on() vector(0)) == 0))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -71,7 +71,7 @@
|
||||
"targetBlank": false
|
||||
}
|
||||
],
|
||||
"description": "Shares are based on measured DCGM GPU utilization. If multiple namespaces share one physical GPU and DCGM cannot attribute work safely, activity is grouped as shared. Idle appears only when utilization is zero."
|
||||
"description": "Measured GPU utilization is attributed to namespaces with current GPU claims. Idle appears only when no namespace in scope currently claims a GPU."
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
|
||||
@ -3728,7 +3728,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(100 * ((sum by (namespace) (avg_over_time(((max by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) == 1))[$__range:$__interval]))) or (label_replace(sum(avg_over_time(((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) > 1))[$__range:$__interval])), \"namespace\", \"shared\", \"\", \"\"))) / clamp_min((sum((sum by (namespace) (avg_over_time(((max by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) == 1))[$__range:$__interval]))) or (label_replace(sum(avg_over_time(((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) > 1))[$__range:$__interval])), \"namespace\", \"shared\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(max_over_time((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu}))[$__range:$__interval])) or on() vector(0)) == 0))",
|
||||
"expr": "(avg_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval]) and on(namespace) ((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0))))) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0))))) > 0) or on() vector(0)) == 0))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -3779,7 +3779,7 @@
|
||||
"targetBlank": false
|
||||
}
|
||||
],
|
||||
"description": "Shares are based on measured DCGM GPU utilization. If multiple namespaces share one physical GPU and DCGM cannot attribute work safely, activity is grouped as shared. Idle appears only when utilization is zero."
|
||||
"description": "Measured GPU utilization is attributed to namespaces with current GPU claims. Idle appears only when no namespace in scope currently claims a GPU."
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
|
||||
@ -29,7 +29,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(100 * ((sum by (namespace) (avg_over_time(((max by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) == 1))[$__range:$__interval]))) or (label_replace(sum(avg_over_time(((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) > 1))[$__range:$__interval])), \"namespace\", \"shared\", \"\", \"\"))) / clamp_min((sum((sum by (namespace) (avg_over_time(((max by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) == 1))[$__range:$__interval]))) or (label_replace(sum(avg_over_time(((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) > 1))[$__range:$__interval])), \"namespace\", \"shared\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(max_over_time((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu}))[$__range:$__interval])) or on() vector(0)) == 0))",
|
||||
"expr": "(avg_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval]) and on(namespace) ((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0))))) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0))))) > 0) or on() vector(0)) == 0))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -80,7 +80,7 @@ data:
|
||||
"targetBlank": false
|
||||
}
|
||||
],
|
||||
"description": "Shares are based on measured DCGM GPU utilization. If multiple namespaces share one physical GPU and DCGM cannot attribute work safely, activity is grouped as shared. Idle appears only when utilization is zero."
|
||||
"description": "Measured GPU utilization is attributed to namespaces with current GPU claims. Idle appears only when no namespace in scope currently claims a GPU."
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
|
||||
@ -3737,7 +3737,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(100 * ((sum by (namespace) (avg_over_time(((max by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) == 1))[$__range:$__interval]))) or (label_replace(sum(avg_over_time(((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) > 1))[$__range:$__interval])), \"namespace\", \"shared\", \"\", \"\"))) / clamp_min((sum((sum by (namespace) (avg_over_time(((max by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) == 1))[$__range:$__interval]))) or (label_replace(sum(avg_over_time(((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) > 1))[$__range:$__interval])), \"namespace\", \"shared\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(max_over_time((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu}))[$__range:$__interval])) or on() vector(0)) == 0))",
|
||||
"expr": "(avg_over_time((sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))))[$__range:$__interval]) and on(namespace) ((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0))))) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((count((sum by (namespace) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0))))) > 0) or on() vector(0)) == 0))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -3788,7 +3788,7 @@ data:
|
||||
"targetBlank": false
|
||||
}
|
||||
],
|
||||
"description": "Shares are based on measured DCGM GPU utilization. If multiple namespaces share one physical GPU and DCGM cannot attribute work safely, activity is grouped as shared. Idle appears only when utilization is zero."
|
||||
"description": "Measured GPU utilization is attributed to namespaces with current GPU claims. Idle appears only when no namespace in scope currently claims a GPU."
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user