monitoring(gpu): count monitored GPU pool devices
This commit is contained in:
parent
b367c6dea3
commit
d21b61f6d9
@ -352,38 +352,36 @@ def nvidia_process_gpu_usage_by_namespace(scope_var):
|
|||||||
return f"(({usage}) > 0)"
|
return f"(({usage}) > 0)"
|
||||||
|
|
||||||
|
|
||||||
def nvidia_process_gpu_present():
|
def nvidia_gpu_device_utilization():
|
||||||
return "(count(nvidia_gpu_device_utilization_percent) or on() vector(0))"
|
return "max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))"
|
||||||
|
|
||||||
|
|
||||||
|
def legacy_gpu_util_without_process_exporter():
|
||||||
|
return f"(({gpu_util_by_node()}) unless on(node) ({nvidia_gpu_device_utilization()}))"
|
||||||
|
|
||||||
|
|
||||||
def gpu_capacity_percent():
|
def gpu_capacity_percent():
|
||||||
process_capacity = "100 * count(nvidia_gpu_device_utilization_percent)"
|
process_count = f"(count({nvidia_gpu_device_utilization()}) or on() vector(0))"
|
||||||
legacy_capacity = (
|
legacy_count = f"(count({legacy_gpu_util_without_process_exporter()}) or on() vector(0))"
|
||||||
"100 * count("
|
return f"(100 * ({process_count} + {legacy_count}))"
|
||||||
f"{gpu_util_by_node()}"
|
|
||||||
") unless on() nvidia_gpu_device_utilization_percent"
|
|
||||||
)
|
|
||||||
return f"(({process_capacity}) or ({legacy_capacity}) or on() vector(0))"
|
|
||||||
|
|
||||||
|
|
||||||
def gpu_active_devices_expr():
|
def gpu_active_devices_expr():
|
||||||
process_active = "sum(nvidia_gpu_device_utilization_percent > bool 0)"
|
process_active = f"(sum(({nvidia_gpu_device_utilization()}) > bool 0) or on() vector(0))"
|
||||||
legacy_active = f"sum(({gpu_util_by_node()}) > bool 0) unless on() nvidia_gpu_device_utilization_percent"
|
legacy_active = f"(sum(({legacy_gpu_util_without_process_exporter()}) > bool 0) or on() vector(0))"
|
||||||
return f"(({process_active}) or ({legacy_active}) or on() vector(0))"
|
return f"({process_active} + {legacy_active})"
|
||||||
|
|
||||||
|
|
||||||
def gpu_total_devices_expr():
|
def gpu_total_devices_expr():
|
||||||
process_total = "count(nvidia_gpu_device_utilization_percent)"
|
process_total = f"(count({nvidia_gpu_device_utilization()}) or on() vector(0))"
|
||||||
legacy_total = f"count({gpu_util_by_node()}) unless on() nvidia_gpu_device_utilization_percent"
|
legacy_total = f"(count({legacy_gpu_util_without_process_exporter()}) or on() vector(0))"
|
||||||
return f"(({process_total}) or ({legacy_total}) or on() vector(0))"
|
return f"({process_total} + {legacy_total})"
|
||||||
|
|
||||||
|
|
||||||
def unattributed_gpu_usage():
|
def unattributed_gpu_usage():
|
||||||
|
legacy_total = f"(sum({legacy_gpu_util_without_process_exporter()}) or on() vector(0))"
|
||||||
return (
|
return (
|
||||||
'label_replace((sum('
|
f'label_replace(({legacy_total} > 0), "namespace", "unattributed", "", "")'
|
||||||
f"{gpu_util_by_node()}"
|
|
||||||
') or on() vector(0)), "namespace", "unattributed", "", "") '
|
|
||||||
f"unless on() ({nvidia_process_gpu_present()} > 0)"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -1872,9 +1870,9 @@ OVERVIEW_PANEL_DESCRIPTIONS = {
|
|||||||
"Postgres Connections Used": "Current Postgres connections; lower leaves room for apps during spikes.",
|
"Postgres Connections Used": "Current Postgres connections; lower leaves room for apps during spikes.",
|
||||||
"Postgres Hottest Connections": "Database with the most active connections; high values identify the pressure source.",
|
"Postgres Hottest Connections": "Database with the most active connections; high values identify the pressure source.",
|
||||||
"Namespace CPU Share": "CPU share by namespace in the selected scope; big slices show who is using compute.",
|
"Namespace CPU Share": "CPU share by namespace in the selected scope; big slices show who is using compute.",
|
||||||
"GPU Pool Used": "Current process-level GPU utilization across the monitored NVIDIA GPU pool.",
|
"GPU Pool Used": "Current GPU utilization across the monitored GPU pool, normalized by active telemetry sources.",
|
||||||
"GPU Active Devices": "Active NVIDIA GPUs compared with total monitored NVIDIA GPUs.",
|
"GPU Active Devices": "Active GPU devices compared with total monitored GPU devices.",
|
||||||
"Namespace GPU Utilization": "Instant NVIDIA process-level GPU utilization normalized to the monitored GPU pool. Host covers non-Kubernetes processes; unused fills remaining capacity; idle appears only at zero activity.",
|
"Namespace GPU Utilization": "Instant GPU utilization normalized to the monitored GPU pool. Process-level namespace slices appear where available; unattributed covers GPU nodes without process attribution.",
|
||||||
"Namespace RAM Share": "Memory share by namespace in the selected scope; big slices show who may drive pressure.",
|
"Namespace RAM Share": "Memory share by namespace in the selected scope; big slices show who may drive pressure.",
|
||||||
"Worker Node CPU": "Worker CPU over time; lower is calmer, sustained high load may need rescheduling.",
|
"Worker Node CPU": "Worker CPU over time; lower is calmer, sustained high load may need rescheduling.",
|
||||||
"Worker Node RAM": "Worker memory over time; lower is safer, sustained high use risks evictions.",
|
"Worker Node RAM": "Worker memory over time; lower is safer, sustained high use risks evictions.",
|
||||||
@ -2924,7 +2922,7 @@ def build_overview():
|
|||||||
namespace_gpu_share_expr(gpu_scope),
|
namespace_gpu_share_expr(gpu_scope),
|
||||||
{"h": 9, "w": 8, "x": 8, "y": 23},
|
{"h": 9, "w": 8, "x": 8, "y": 23},
|
||||||
links=namespace_scope_links("namespace_scope_gpu"),
|
links=namespace_scope_links("namespace_scope_gpu"),
|
||||||
description="Instant NVIDIA process-level GPU utilization normalized to the monitored GPU pool. Host covers non-Kubernetes processes; unused fills remaining capacity; idle appears only at zero activity.",
|
description=OVERVIEW_PANEL_DESCRIPTIONS["Namespace GPU Utilization"],
|
||||||
instant=True,
|
instant=True,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@ -5446,7 +5444,7 @@ def build_gpu_dashboard():
|
|||||||
namespace_gpu_share_expr(gpu_scope),
|
namespace_gpu_share_expr(gpu_scope),
|
||||||
{"h": 8, "w": 12, "x": 0, "y": 0},
|
{"h": 8, "w": 12, "x": 0, "y": 0},
|
||||||
links=namespace_scope_links("namespace_scope_gpu"),
|
links=namespace_scope_links("namespace_scope_gpu"),
|
||||||
description="Instant NVIDIA process-level GPU utilization normalized to the monitored GPU pool. Host covers non-Kubernetes processes; unused fills remaining capacity; idle appears only at zero activity.",
|
description=OVERVIEW_PANEL_DESCRIPTIONS["Namespace GPU Utilization"],
|
||||||
instant=True,
|
instant=True,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|||||||
@ -171,7 +171,7 @@ def test_overview_uses_readable_quality_power_and_gitops_panels():
|
|||||||
assert "nvidia_gpu_device_utilization_percent" in gpu_pool_expr
|
assert "nvidia_gpu_device_utilization_percent" in gpu_pool_expr
|
||||||
assert panels_by_title["GPU Pool Used"]["targets"][0]["instant"] is True
|
assert panels_by_title["GPU Pool Used"]["targets"][0]["instant"] is True
|
||||||
active_targets = panels_by_title["GPU Active Devices"]["targets"]
|
active_targets = panels_by_title["GPU Active Devices"]["targets"]
|
||||||
assert any("nvidia_gpu_device_utilization_percent > bool 0" in target["expr"] for target in active_targets)
|
assert any("nvidia_gpu_device_utilization_percent[5m]" in target["expr"] and "> bool 0" in target["expr"] for target in active_targets)
|
||||||
assert any("count(nvidia_gpu_device_utilization_percent)" in target["expr"] for target in active_targets)
|
assert any("count(nvidia_gpu_device_utilization_percent)" in target["expr"] for target in active_targets)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -20,7 +20,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)) or (label_replace(clamp_min(100 - (sum(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) == 0))",
|
"expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or (label_replace(clamp_min(100 - (sum(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}",
|
"legendFormat": "{{namespace}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -72,7 +72,7 @@
|
|||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "Instant NVIDIA process-level GPU utilization normalized to the monitored GPU pool. Host covers non-Kubernetes processes; unused fills remaining capacity; idle appears only at zero activity."
|
"description": "Instant GPU utilization normalized to the monitored GPU pool. Process-level namespace slices appear where available; unattributed covers GPU nodes without process attribution."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 2,
|
"id": 2,
|
||||||
@ -206,7 +206,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)",
|
"expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
@ -259,7 +259,7 @@
|
|||||||
},
|
},
|
||||||
"textMode": "value"
|
"textMode": "value"
|
||||||
},
|
},
|
||||||
"description": "Current process-level GPU utilization across the monitored NVIDIA GPU pool."
|
"description": "Current GPU utilization across the monitored GPU pool, normalized by active telemetry sources."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 6,
|
"id": 6,
|
||||||
@ -277,13 +277,13 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "((sum(nvidia_gpu_device_utilization_percent > bool 0)) or (sum((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) > bool 0) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
|
"expr": "((sum((max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) > bool 0) or on() vector(0)) + (sum((((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) > bool 0) or on() vector(0)))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "active",
|
"legendFormat": "active",
|
||||||
"instant": true
|
"instant": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "((count(nvidia_gpu_device_utilization_percent)) or (count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
|
"expr": "((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))",
|
||||||
"refId": "B",
|
"refId": "B",
|
||||||
"legendFormat": "total",
|
"legendFormat": "total",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -329,7 +329,7 @@
|
|||||||
},
|
},
|
||||||
"textMode": "name_and_value"
|
"textMode": "name_and_value"
|
||||||
},
|
},
|
||||||
"description": "Active NVIDIA GPUs compared with total monitored NVIDIA GPUs."
|
"description": "Active GPU devices compared with total monitored GPU devices."
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"time": {
|
"time": {
|
||||||
|
|||||||
@ -3659,7 +3659,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)",
|
"expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
@ -3719,7 +3719,7 @@
|
|||||||
"targetBlank": true
|
"targetBlank": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "Current process-level GPU utilization across the monitored NVIDIA GPU pool."
|
"description": "Current GPU utilization across the monitored GPU pool, normalized by active telemetry sources."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 49,
|
"id": 49,
|
||||||
@ -3737,13 +3737,13 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "((sum(nvidia_gpu_device_utilization_percent > bool 0)) or (sum((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) > bool 0) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
|
"expr": "((sum((max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) > bool 0) or on() vector(0)) + (sum((((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) > bool 0) or on() vector(0)))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "active",
|
"legendFormat": "active",
|
||||||
"instant": true
|
"instant": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "((count(nvidia_gpu_device_utilization_percent)) or (count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
|
"expr": "((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))",
|
||||||
"refId": "B",
|
"refId": "B",
|
||||||
"legendFormat": "total",
|
"legendFormat": "total",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -3796,7 +3796,7 @@
|
|||||||
"targetBlank": true
|
"targetBlank": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "Active NVIDIA GPUs compared with total monitored NVIDIA GPUs."
|
"description": "Active GPU devices compared with total monitored GPU devices."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 11,
|
"id": 11,
|
||||||
@ -3883,7 +3883,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)) or (label_replace(clamp_min(100 - (sum(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) == 0))",
|
"expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or (label_replace(clamp_min(100 - (sum(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}",
|
"legendFormat": "{{namespace}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -3935,7 +3935,7 @@
|
|||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "Instant NVIDIA process-level GPU utilization normalized to the monitored GPU pool. Host covers non-Kubernetes processes; unused fills remaining capacity; idle appears only at zero activity."
|
"description": "Instant GPU utilization normalized to the monitored GPU pool. Process-level namespace slices appear where available; unattributed covers GPU nodes without process attribution."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 13,
|
"id": 13,
|
||||||
|
|||||||
@ -29,7 +29,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)) or (label_replace(clamp_min(100 - (sum(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) == 0))",
|
"expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or (label_replace(clamp_min(100 - (sum(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}",
|
"legendFormat": "{{namespace}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -81,7 +81,7 @@ data:
|
|||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "Instant NVIDIA process-level GPU utilization normalized to the monitored GPU pool. Host covers non-Kubernetes processes; unused fills remaining capacity; idle appears only at zero activity."
|
"description": "Instant GPU utilization normalized to the monitored GPU pool. Process-level namespace slices appear where available; unattributed covers GPU nodes without process attribution."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 2,
|
"id": 2,
|
||||||
@ -215,7 +215,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)",
|
"expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
@ -268,7 +268,7 @@ data:
|
|||||||
},
|
},
|
||||||
"textMode": "value"
|
"textMode": "value"
|
||||||
},
|
},
|
||||||
"description": "Current process-level GPU utilization across the monitored NVIDIA GPU pool."
|
"description": "Current GPU utilization across the monitored GPU pool, normalized by active telemetry sources."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 6,
|
"id": 6,
|
||||||
@ -286,13 +286,13 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "((sum(nvidia_gpu_device_utilization_percent > bool 0)) or (sum((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) > bool 0) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
|
"expr": "((sum((max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) > bool 0) or on() vector(0)) + (sum((((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) > bool 0) or on() vector(0)))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "active",
|
"legendFormat": "active",
|
||||||
"instant": true
|
"instant": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "((count(nvidia_gpu_device_utilization_percent)) or (count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
|
"expr": "((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))",
|
||||||
"refId": "B",
|
"refId": "B",
|
||||||
"legendFormat": "total",
|
"legendFormat": "total",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -338,7 +338,7 @@ data:
|
|||||||
},
|
},
|
||||||
"textMode": "name_and_value"
|
"textMode": "name_and_value"
|
||||||
},
|
},
|
||||||
"description": "Active NVIDIA GPUs compared with total monitored NVIDIA GPUs."
|
"description": "Active GPU devices compared with total monitored GPU devices."
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"time": {
|
"time": {
|
||||||
|
|||||||
@ -3668,7 +3668,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)",
|
"expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
@ -3728,7 +3728,7 @@ data:
|
|||||||
"targetBlank": true
|
"targetBlank": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "Current process-level GPU utilization across the monitored NVIDIA GPU pool."
|
"description": "Current GPU utilization across the monitored GPU pool, normalized by active telemetry sources."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 49,
|
"id": 49,
|
||||||
@ -3746,13 +3746,13 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "((sum(nvidia_gpu_device_utilization_percent > bool 0)) or (sum((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) > bool 0) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
|
"expr": "((sum((max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) > bool 0) or on() vector(0)) + (sum((((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) > bool 0) or on() vector(0)))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "active",
|
"legendFormat": "active",
|
||||||
"instant": true
|
"instant": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "((count(nvidia_gpu_device_utilization_percent)) or (count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
|
"expr": "((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))",
|
||||||
"refId": "B",
|
"refId": "B",
|
||||||
"legendFormat": "total",
|
"legendFormat": "total",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -3805,7 +3805,7 @@ data:
|
|||||||
"targetBlank": true
|
"targetBlank": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "Active NVIDIA GPUs compared with total monitored NVIDIA GPUs."
|
"description": "Active GPU devices compared with total monitored GPU devices."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 11,
|
"id": 11,
|
||||||
@ -3892,7 +3892,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)) or (label_replace(clamp_min(100 - (sum(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) == 0))",
|
"expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or (label_replace(clamp_min(100 - (sum(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}",
|
"legendFormat": "{{namespace}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -3944,7 +3944,7 @@ data:
|
|||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "Instant NVIDIA process-level GPU utilization normalized to the monitored GPU pool. Host covers non-Kubernetes processes; unused fills remaining capacity; idle appears only at zero activity."
|
"description": "Instant GPU utilization normalized to the monitored GPU pool. Process-level namespace slices appear where available; unattributed covers GPU nodes without process attribution."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 13,
|
"id": 13,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user