monitoring(gpu): show activity share by namespace
This commit is contained in:
parent
ec972a52f1
commit
5e27384ea2
@ -395,22 +395,14 @@ def gpu_pool_used_expr(scope_var):
|
|||||||
|
|
||||||
|
|
||||||
def namespace_gpu_share_expr(scope_var):
|
def namespace_gpu_share_expr(scope_var):
|
||||||
utilization_raw = gpu_utilization_raw(scope_var)
|
activity = gpu_utilization_raw(scope_var)
|
||||||
total_raw = f"(sum({utilization_raw}) or on() vector(0))"
|
total = f"(sum({activity}) or on() vector(0))"
|
||||||
capacity = gpu_capacity_percent()
|
share = f"100 * ({activity}) / clamp_min({total}, 1)"
|
||||||
utilization = f"100 * ({utilization_raw}) / clamp_min({capacity}, 1)"
|
|
||||||
total = f"(sum({utilization}) or on() vector(0))"
|
|
||||||
unused = (
|
|
||||||
'label_replace(clamp_min('
|
|
||||||
f"100 - {total}"
|
|
||||||
', 0), "namespace", "unused", "", "") '
|
|
||||||
f"and on() ({total_raw} > 0)"
|
|
||||||
)
|
|
||||||
idle = (
|
idle = (
|
||||||
'label_replace(vector(100), "namespace", "idle", "", "") '
|
'label_replace(vector(100), "namespace", "idle", "", "") '
|
||||||
f"and on() ({total_raw} == 0)"
|
f"and on() ({total} == 0)"
|
||||||
)
|
)
|
||||||
return f"({utilization}) or ({unused}) or ({idle})"
|
return f"({share}) or ({idle})"
|
||||||
|
|
||||||
|
|
||||||
PROBLEM_PODS_EXPR = (
|
PROBLEM_PODS_EXPR = (
|
||||||
@ -1870,9 +1862,7 @@ OVERVIEW_PANEL_DESCRIPTIONS = {
|
|||||||
"Postgres Connections Used": "Current Postgres connections; lower leaves room for apps during spikes.",
|
"Postgres Connections Used": "Current Postgres connections; lower leaves room for apps during spikes.",
|
||||||
"Postgres Hottest Connections": "Database with the most active connections; high values identify the pressure source.",
|
"Postgres Hottest Connections": "Database with the most active connections; high values identify the pressure source.",
|
||||||
"Namespace CPU Share": "CPU share by namespace in the selected scope; big slices show who is using compute.",
|
"Namespace CPU Share": "CPU share by namespace in the selected scope; big slices show who is using compute.",
|
||||||
"GPU Pool Used": "Current GPU utilization across the monitored GPU pool, normalized by active telemetry sources.",
|
"Namespace GPU Utilization": "Instant share of observed GPU compute activity by namespace. Host covers GPU work outside Kubernetes pods; idle appears only when observed GPU activity is zero.",
|
||||||
"GPU Active Devices": "Active GPU devices compared with total monitored GPU devices.",
|
|
||||||
"Namespace GPU Utilization": "Instant GPU utilization normalized to the monitored GPU pool. Process-level namespace slices appear where available; unattributed covers GPU nodes without process attribution.",
|
|
||||||
"Namespace RAM Share": "Memory share by namespace in the selected scope; big slices show who may drive pressure.",
|
"Namespace RAM Share": "Memory share by namespace in the selected scope; big slices show who may drive pressure.",
|
||||||
"Worker Node CPU": "Worker CPU over time; lower is calmer, sustained high load may need rescheduling.",
|
"Worker Node CPU": "Worker CPU over time; lower is calmer, sustained high load may need rescheduling.",
|
||||||
"Worker Node RAM": "Worker memory over time; lower is safer, sustained high use risks evictions.",
|
"Worker Node RAM": "Worker memory over time; lower is safer, sustained high use risks evictions.",
|
||||||
@ -2872,39 +2862,6 @@ def build_overview():
|
|||||||
gpu_scope = "$namespace_scope_gpu"
|
gpu_scope = "$namespace_scope_gpu"
|
||||||
ram_scope = "$namespace_scope_ram"
|
ram_scope = "$namespace_scope_ram"
|
||||||
|
|
||||||
panels.append(
|
|
||||||
stat_panel(
|
|
||||||
48,
|
|
||||||
"GPU Pool Used",
|
|
||||||
gpu_pool_used_expr(gpu_scope),
|
|
||||||
{"h": 2, "w": 4, "x": 8, "y": 21},
|
|
||||||
unit="percent",
|
|
||||||
decimals=1,
|
|
||||||
instant=True,
|
|
||||||
thresholds=PERCENT_THRESHOLDS,
|
|
||||||
links=overview_link("atlas-gpu"),
|
|
||||||
description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Pool Used"],
|
|
||||||
)
|
|
||||||
)
|
|
||||||
panels.append(
|
|
||||||
stat_panel(
|
|
||||||
49,
|
|
||||||
"GPU Active Devices",
|
|
||||||
"",
|
|
||||||
{"h": 2, "w": 4, "x": 12, "y": 21},
|
|
||||||
unit="none",
|
|
||||||
decimals=0,
|
|
||||||
text_mode="name_and_value",
|
|
||||||
instant=True,
|
|
||||||
targets=[
|
|
||||||
{"expr": gpu_active_devices_expr(), "refId": "A", "legendFormat": "active"},
|
|
||||||
{"expr": gpu_total_devices_expr(), "refId": "B", "legendFormat": "total"},
|
|
||||||
],
|
|
||||||
links=overview_link("atlas-gpu"),
|
|
||||||
description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Active Devices"],
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
panels.append(
|
panels.append(
|
||||||
pie_panel(
|
pie_panel(
|
||||||
11,
|
11,
|
||||||
@ -5484,36 +5441,6 @@ def build_gpu_dashboard():
|
|||||||
description="DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value.",
|
description="DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value.",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
panels.append(
|
|
||||||
stat_panel(
|
|
||||||
5,
|
|
||||||
"GPU Pool Used",
|
|
||||||
gpu_pool_used_expr(gpu_scope),
|
|
||||||
{"h": 3, "w": 6, "x": 0, "y": 16},
|
|
||||||
unit="percent",
|
|
||||||
decimals=1,
|
|
||||||
instant=True,
|
|
||||||
thresholds=PERCENT_THRESHOLDS,
|
|
||||||
description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Pool Used"],
|
|
||||||
)
|
|
||||||
)
|
|
||||||
panels.append(
|
|
||||||
stat_panel(
|
|
||||||
6,
|
|
||||||
"GPU Active Devices",
|
|
||||||
"",
|
|
||||||
{"h": 3, "w": 6, "x": 6, "y": 16},
|
|
||||||
unit="none",
|
|
||||||
decimals=0,
|
|
||||||
text_mode="name_and_value",
|
|
||||||
instant=True,
|
|
||||||
targets=[
|
|
||||||
{"expr": gpu_active_devices_expr(), "refId": "A", "legendFormat": "active"},
|
|
||||||
{"expr": gpu_total_devices_expr(), "refId": "B", "legendFormat": "total"},
|
|
||||||
],
|
|
||||||
description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Active Devices"],
|
|
||||||
)
|
|
||||||
)
|
|
||||||
return {
|
return {
|
||||||
"uid": "atlas-gpu",
|
"uid": "atlas-gpu",
|
||||||
"title": "Atlas GPU",
|
"title": "Atlas GPU",
|
||||||
|
|||||||
@ -20,7 +20,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or (label_replace(clamp_min(100 - (sum(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))",
|
"expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}",
|
"legendFormat": "{{namespace}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -72,7 +72,7 @@
|
|||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "Instant GPU utilization normalized to the monitored GPU pool. Process-level namespace slices appear where available; unattributed covers GPU nodes without process attribution."
|
"description": "Instant share of observed GPU compute activity by namespace. Host covers GPU work outside Kubernetes pods; idle appears only when observed GPU activity is zero."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 2,
|
"id": 2,
|
||||||
@ -189,147 +189,6 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value."
|
"description": "DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value."
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 5,
|
|
||||||
"type": "stat",
|
|
||||||
"title": "GPU Pool Used",
|
|
||||||
"datasource": {
|
|
||||||
"type": "prometheus",
|
|
||||||
"uid": "atlas-vm"
|
|
||||||
},
|
|
||||||
"gridPos": {
|
|
||||||
"h": 3,
|
|
||||||
"w": 6,
|
|
||||||
"x": 0,
|
|
||||||
"y": 16
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)",
|
|
||||||
"refId": "A",
|
|
||||||
"instant": true
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {
|
|
||||||
"mode": "thresholds"
|
|
||||||
},
|
|
||||||
"mappings": [],
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{
|
|
||||||
"color": "dark-green",
|
|
||||||
"value": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "dark-yellow",
|
|
||||||
"value": 50
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "dark-orange",
|
|
||||||
"value": 75
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "dark-red",
|
|
||||||
"value": 91.5
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"unit": "percent",
|
|
||||||
"custom": {
|
|
||||||
"displayMode": "auto"
|
|
||||||
},
|
|
||||||
"decimals": 1
|
|
||||||
},
|
|
||||||
"overrides": []
|
|
||||||
},
|
|
||||||
"options": {
|
|
||||||
"colorMode": "value",
|
|
||||||
"graphMode": "area",
|
|
||||||
"justifyMode": "center",
|
|
||||||
"reduceOptions": {
|
|
||||||
"calcs": [
|
|
||||||
"lastNotNull"
|
|
||||||
],
|
|
||||||
"fields": "",
|
|
||||||
"values": false
|
|
||||||
},
|
|
||||||
"textMode": "value"
|
|
||||||
},
|
|
||||||
"description": "Current GPU utilization across the monitored GPU pool, normalized by active telemetry sources."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 6,
|
|
||||||
"type": "stat",
|
|
||||||
"title": "GPU Active Devices",
|
|
||||||
"datasource": {
|
|
||||||
"type": "prometheus",
|
|
||||||
"uid": "atlas-vm"
|
|
||||||
},
|
|
||||||
"gridPos": {
|
|
||||||
"h": 3,
|
|
||||||
"w": 6,
|
|
||||||
"x": 6,
|
|
||||||
"y": 16
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "((sum((max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) > bool 0) or on() vector(0)) + (sum((((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) > bool 0) or on() vector(0)))",
|
|
||||||
"refId": "A",
|
|
||||||
"legendFormat": "active",
|
|
||||||
"instant": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"expr": "((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))",
|
|
||||||
"refId": "B",
|
|
||||||
"legendFormat": "total",
|
|
||||||
"instant": true
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {
|
|
||||||
"mode": "thresholds"
|
|
||||||
},
|
|
||||||
"mappings": [],
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{
|
|
||||||
"color": "rgba(115, 115, 115, 1)",
|
|
||||||
"value": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "dark-green",
|
|
||||||
"value": 1
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"unit": "none",
|
|
||||||
"custom": {
|
|
||||||
"displayMode": "auto"
|
|
||||||
},
|
|
||||||
"decimals": 0
|
|
||||||
},
|
|
||||||
"overrides": []
|
|
||||||
},
|
|
||||||
"options": {
|
|
||||||
"colorMode": "value",
|
|
||||||
"graphMode": "area",
|
|
||||||
"justifyMode": "center",
|
|
||||||
"reduceOptions": {
|
|
||||||
"calcs": [
|
|
||||||
"lastNotNull"
|
|
||||||
],
|
|
||||||
"fields": "",
|
|
||||||
"values": false
|
|
||||||
},
|
|
||||||
"textMode": "name_and_value"
|
|
||||||
},
|
|
||||||
"description": "Active GPU devices compared with total monitored GPU devices."
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"time": {
|
"time": {
|
||||||
|
|||||||
@ -3643,161 +3643,6 @@
|
|||||||
},
|
},
|
||||||
"description": "Database with the most active connections; high values identify the pressure source."
|
"description": "Database with the most active connections; high values identify the pressure source."
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"id": 48,
|
|
||||||
"type": "stat",
|
|
||||||
"title": "GPU Pool Used",
|
|
||||||
"datasource": {
|
|
||||||
"type": "prometheus",
|
|
||||||
"uid": "atlas-vm"
|
|
||||||
},
|
|
||||||
"gridPos": {
|
|
||||||
"h": 2,
|
|
||||||
"w": 4,
|
|
||||||
"x": 8,
|
|
||||||
"y": 21
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)",
|
|
||||||
"refId": "A",
|
|
||||||
"instant": true
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {
|
|
||||||
"mode": "thresholds"
|
|
||||||
},
|
|
||||||
"mappings": [],
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{
|
|
||||||
"color": "dark-green",
|
|
||||||
"value": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "dark-yellow",
|
|
||||||
"value": 50
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "dark-orange",
|
|
||||||
"value": 75
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "dark-red",
|
|
||||||
"value": 91.5
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"unit": "percent",
|
|
||||||
"custom": {
|
|
||||||
"displayMode": "auto"
|
|
||||||
},
|
|
||||||
"decimals": 1
|
|
||||||
},
|
|
||||||
"overrides": []
|
|
||||||
},
|
|
||||||
"options": {
|
|
||||||
"colorMode": "value",
|
|
||||||
"graphMode": "area",
|
|
||||||
"justifyMode": "center",
|
|
||||||
"reduceOptions": {
|
|
||||||
"calcs": [
|
|
||||||
"lastNotNull"
|
|
||||||
],
|
|
||||||
"fields": "",
|
|
||||||
"values": false
|
|
||||||
},
|
|
||||||
"textMode": "value"
|
|
||||||
},
|
|
||||||
"links": [
|
|
||||||
{
|
|
||||||
"title": "Open atlas-gpu dashboard",
|
|
||||||
"url": "/d/atlas-gpu",
|
|
||||||
"targetBlank": true
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"description": "Current GPU utilization across the monitored GPU pool, normalized by active telemetry sources."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 49,
|
|
||||||
"type": "stat",
|
|
||||||
"title": "GPU Active Devices",
|
|
||||||
"datasource": {
|
|
||||||
"type": "prometheus",
|
|
||||||
"uid": "atlas-vm"
|
|
||||||
},
|
|
||||||
"gridPos": {
|
|
||||||
"h": 2,
|
|
||||||
"w": 4,
|
|
||||||
"x": 12,
|
|
||||||
"y": 21
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "((sum((max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) > bool 0) or on() vector(0)) + (sum((((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) > bool 0) or on() vector(0)))",
|
|
||||||
"refId": "A",
|
|
||||||
"legendFormat": "active",
|
|
||||||
"instant": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"expr": "((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))",
|
|
||||||
"refId": "B",
|
|
||||||
"legendFormat": "total",
|
|
||||||
"instant": true
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {
|
|
||||||
"mode": "thresholds"
|
|
||||||
},
|
|
||||||
"mappings": [],
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{
|
|
||||||
"color": "rgba(115, 115, 115, 1)",
|
|
||||||
"value": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "dark-green",
|
|
||||||
"value": 1
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"unit": "none",
|
|
||||||
"custom": {
|
|
||||||
"displayMode": "auto"
|
|
||||||
},
|
|
||||||
"decimals": 0
|
|
||||||
},
|
|
||||||
"overrides": []
|
|
||||||
},
|
|
||||||
"options": {
|
|
||||||
"colorMode": "value",
|
|
||||||
"graphMode": "area",
|
|
||||||
"justifyMode": "center",
|
|
||||||
"reduceOptions": {
|
|
||||||
"calcs": [
|
|
||||||
"lastNotNull"
|
|
||||||
],
|
|
||||||
"fields": "",
|
|
||||||
"values": false
|
|
||||||
},
|
|
||||||
"textMode": "name_and_value"
|
|
||||||
},
|
|
||||||
"links": [
|
|
||||||
{
|
|
||||||
"title": "Open atlas-gpu dashboard",
|
|
||||||
"url": "/d/atlas-gpu",
|
|
||||||
"targetBlank": true
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"description": "Active GPU devices compared with total monitored GPU devices."
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"id": 11,
|
"id": 11,
|
||||||
"type": "piechart",
|
"type": "piechart",
|
||||||
@ -3883,7 +3728,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or (label_replace(clamp_min(100 - (sum(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))",
|
"expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}",
|
"legendFormat": "{{namespace}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -3935,7 +3780,7 @@
|
|||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "Instant GPU utilization normalized to the monitored GPU pool. Process-level namespace slices appear where available; unattributed covers GPU nodes without process attribution."
|
"description": "Instant share of observed GPU compute activity by namespace. Host covers GPU work outside Kubernetes pods; idle appears only when observed GPU activity is zero."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 13,
|
"id": 13,
|
||||||
|
|||||||
@ -29,7 +29,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or (label_replace(clamp_min(100 - (sum(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))",
|
"expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}",
|
"legendFormat": "{{namespace}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -81,7 +81,7 @@ data:
|
|||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "Instant GPU utilization normalized to the monitored GPU pool. Process-level namespace slices appear where available; unattributed covers GPU nodes without process attribution."
|
"description": "Instant share of observed GPU compute activity by namespace. Host covers GPU work outside Kubernetes pods; idle appears only when observed GPU activity is zero."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 2,
|
"id": 2,
|
||||||
@ -198,147 +198,6 @@ data:
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value."
|
"description": "DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value."
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 5,
|
|
||||||
"type": "stat",
|
|
||||||
"title": "GPU Pool Used",
|
|
||||||
"datasource": {
|
|
||||||
"type": "prometheus",
|
|
||||||
"uid": "atlas-vm"
|
|
||||||
},
|
|
||||||
"gridPos": {
|
|
||||||
"h": 3,
|
|
||||||
"w": 6,
|
|
||||||
"x": 0,
|
|
||||||
"y": 16
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)",
|
|
||||||
"refId": "A",
|
|
||||||
"instant": true
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {
|
|
||||||
"mode": "thresholds"
|
|
||||||
},
|
|
||||||
"mappings": [],
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{
|
|
||||||
"color": "dark-green",
|
|
||||||
"value": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "dark-yellow",
|
|
||||||
"value": 50
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "dark-orange",
|
|
||||||
"value": 75
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "dark-red",
|
|
||||||
"value": 91.5
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"unit": "percent",
|
|
||||||
"custom": {
|
|
||||||
"displayMode": "auto"
|
|
||||||
},
|
|
||||||
"decimals": 1
|
|
||||||
},
|
|
||||||
"overrides": []
|
|
||||||
},
|
|
||||||
"options": {
|
|
||||||
"colorMode": "value",
|
|
||||||
"graphMode": "area",
|
|
||||||
"justifyMode": "center",
|
|
||||||
"reduceOptions": {
|
|
||||||
"calcs": [
|
|
||||||
"lastNotNull"
|
|
||||||
],
|
|
||||||
"fields": "",
|
|
||||||
"values": false
|
|
||||||
},
|
|
||||||
"textMode": "value"
|
|
||||||
},
|
|
||||||
"description": "Current GPU utilization across the monitored GPU pool, normalized by active telemetry sources."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 6,
|
|
||||||
"type": "stat",
|
|
||||||
"title": "GPU Active Devices",
|
|
||||||
"datasource": {
|
|
||||||
"type": "prometheus",
|
|
||||||
"uid": "atlas-vm"
|
|
||||||
},
|
|
||||||
"gridPos": {
|
|
||||||
"h": 3,
|
|
||||||
"w": 6,
|
|
||||||
"x": 6,
|
|
||||||
"y": 16
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "((sum((max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) > bool 0) or on() vector(0)) + (sum((((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) > bool 0) or on() vector(0)))",
|
|
||||||
"refId": "A",
|
|
||||||
"legendFormat": "active",
|
|
||||||
"instant": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"expr": "((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))",
|
|
||||||
"refId": "B",
|
|
||||||
"legendFormat": "total",
|
|
||||||
"instant": true
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {
|
|
||||||
"mode": "thresholds"
|
|
||||||
},
|
|
||||||
"mappings": [],
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{
|
|
||||||
"color": "rgba(115, 115, 115, 1)",
|
|
||||||
"value": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "dark-green",
|
|
||||||
"value": 1
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"unit": "none",
|
|
||||||
"custom": {
|
|
||||||
"displayMode": "auto"
|
|
||||||
},
|
|
||||||
"decimals": 0
|
|
||||||
},
|
|
||||||
"overrides": []
|
|
||||||
},
|
|
||||||
"options": {
|
|
||||||
"colorMode": "value",
|
|
||||||
"graphMode": "area",
|
|
||||||
"justifyMode": "center",
|
|
||||||
"reduceOptions": {
|
|
||||||
"calcs": [
|
|
||||||
"lastNotNull"
|
|
||||||
],
|
|
||||||
"fields": "",
|
|
||||||
"values": false
|
|
||||||
},
|
|
||||||
"textMode": "name_and_value"
|
|
||||||
},
|
|
||||||
"description": "Active GPU devices compared with total monitored GPU devices."
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"time": {
|
"time": {
|
||||||
|
|||||||
@ -3652,161 +3652,6 @@ data:
|
|||||||
},
|
},
|
||||||
"description": "Database with the most active connections; high values identify the pressure source."
|
"description": "Database with the most active connections; high values identify the pressure source."
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"id": 48,
|
|
||||||
"type": "stat",
|
|
||||||
"title": "GPU Pool Used",
|
|
||||||
"datasource": {
|
|
||||||
"type": "prometheus",
|
|
||||||
"uid": "atlas-vm"
|
|
||||||
},
|
|
||||||
"gridPos": {
|
|
||||||
"h": 2,
|
|
||||||
"w": 4,
|
|
||||||
"x": 8,
|
|
||||||
"y": 21
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)",
|
|
||||||
"refId": "A",
|
|
||||||
"instant": true
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {
|
|
||||||
"mode": "thresholds"
|
|
||||||
},
|
|
||||||
"mappings": [],
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{
|
|
||||||
"color": "dark-green",
|
|
||||||
"value": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "dark-yellow",
|
|
||||||
"value": 50
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "dark-orange",
|
|
||||||
"value": 75
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "dark-red",
|
|
||||||
"value": 91.5
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"unit": "percent",
|
|
||||||
"custom": {
|
|
||||||
"displayMode": "auto"
|
|
||||||
},
|
|
||||||
"decimals": 1
|
|
||||||
},
|
|
||||||
"overrides": []
|
|
||||||
},
|
|
||||||
"options": {
|
|
||||||
"colorMode": "value",
|
|
||||||
"graphMode": "area",
|
|
||||||
"justifyMode": "center",
|
|
||||||
"reduceOptions": {
|
|
||||||
"calcs": [
|
|
||||||
"lastNotNull"
|
|
||||||
],
|
|
||||||
"fields": "",
|
|
||||||
"values": false
|
|
||||||
},
|
|
||||||
"textMode": "value"
|
|
||||||
},
|
|
||||||
"links": [
|
|
||||||
{
|
|
||||||
"title": "Open atlas-gpu dashboard",
|
|
||||||
"url": "/d/atlas-gpu",
|
|
||||||
"targetBlank": true
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"description": "Current GPU utilization across the monitored GPU pool, normalized by active telemetry sources."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 49,
|
|
||||||
"type": "stat",
|
|
||||||
"title": "GPU Active Devices",
|
|
||||||
"datasource": {
|
|
||||||
"type": "prometheus",
|
|
||||||
"uid": "atlas-vm"
|
|
||||||
},
|
|
||||||
"gridPos": {
|
|
||||||
"h": 2,
|
|
||||||
"w": 4,
|
|
||||||
"x": 12,
|
|
||||||
"y": 21
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "((sum((max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) > bool 0) or on() vector(0)) + (sum((((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) > bool 0) or on() vector(0)))",
|
|
||||||
"refId": "A",
|
|
||||||
"legendFormat": "active",
|
|
||||||
"instant": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"expr": "((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))",
|
|
||||||
"refId": "B",
|
|
||||||
"legendFormat": "total",
|
|
||||||
"instant": true
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {
|
|
||||||
"mode": "thresholds"
|
|
||||||
},
|
|
||||||
"mappings": [],
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{
|
|
||||||
"color": "rgba(115, 115, 115, 1)",
|
|
||||||
"value": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"color": "dark-green",
|
|
||||||
"value": 1
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"unit": "none",
|
|
||||||
"custom": {
|
|
||||||
"displayMode": "auto"
|
|
||||||
},
|
|
||||||
"decimals": 0
|
|
||||||
},
|
|
||||||
"overrides": []
|
|
||||||
},
|
|
||||||
"options": {
|
|
||||||
"colorMode": "value",
|
|
||||||
"graphMode": "area",
|
|
||||||
"justifyMode": "center",
|
|
||||||
"reduceOptions": {
|
|
||||||
"calcs": [
|
|
||||||
"lastNotNull"
|
|
||||||
],
|
|
||||||
"fields": "",
|
|
||||||
"values": false
|
|
||||||
},
|
|
||||||
"textMode": "name_and_value"
|
|
||||||
},
|
|
||||||
"links": [
|
|
||||||
{
|
|
||||||
"title": "Open atlas-gpu dashboard",
|
|
||||||
"url": "/d/atlas-gpu",
|
|
||||||
"targetBlank": true
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"description": "Active GPU devices compared with total monitored GPU devices."
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"id": 11,
|
"id": 11,
|
||||||
"type": "piechart",
|
"type": "piechart",
|
||||||
@ -3892,7 +3737,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or (label_replace(clamp_min(100 - (sum(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))",
|
"expr": "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}",
|
"legendFormat": "{{namespace}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -3944,7 +3789,7 @@ data:
|
|||||||
"targetBlank": false
|
"targetBlank": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "Instant GPU utilization normalized to the monitored GPU pool. Process-level namespace slices appear where available; unattributed covers GPU nodes without process attribution."
|
"description": "Instant share of observed GPU compute activity by namespace. Host covers GPU work outside Kubernetes pods; idle appears only when observed GPU activity is zero."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 13,
|
"id": 13,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user