monitoring(gpu): add pool utilization counters
This commit is contained in:
parent
4ce5a67b94
commit
6388ef5c6d
@ -366,6 +366,18 @@ def gpu_capacity_percent():
|
||||
return f"(({process_capacity}) or ({legacy_capacity}) or on() vector(0))"
|
||||
|
||||
|
||||
def gpu_active_devices_expr():
    """Return a PromQL expression for the number of GPUs with non-zero utilization.

    The expression is an ``or`` chain of three branches, tried in order:

    1. ``nvidia_gpu_device_utilization_percent`` series whose value is above 0.
    2. The series produced by ``gpu_util_by_node()`` whose value is above 0,
       used only when the device-level metric has no series at all.
    3. A constant ``vector(0)`` so the query always yields a sample.
    """
    metric = "nvidia_gpu_device_utilization_percent"
    # `> bool 0` maps each series to 1 or 0, so summing counts the busy ones.
    branches = (
        f"(sum({metric} > bool 0))",
        # `unless on() <metric>` drops this branch whenever the device metric exists.
        f"(sum(({gpu_util_by_node()}) > bool 0) unless on() {metric})",
        "on() vector(0)",
    )
    return "(" + " or ".join(branches) + ")"
|
||||
|
||||
|
||||
def gpu_total_devices_expr():
    """Return a PromQL expression for the total number of monitored GPUs.

    The expression is an ``or`` chain of three branches, tried in order:

    1. The count of ``nvidia_gpu_device_utilization_percent`` series.
    2. The count of series produced by ``gpu_util_by_node()``, used only
       when the device-level metric has no series at all.
    3. A constant ``vector(0)`` so the query always yields a sample.
    """
    metric = "nvidia_gpu_device_utilization_percent"
    branches = (
        f"(count({metric}))",
        # `unless on() <metric>` drops this branch whenever the device metric exists.
        f"(count({gpu_util_by_node()}) unless on() {metric})",
        "on() vector(0)",
    )
    return "(" + " or ".join(branches) + ")"
|
||||
|
||||
|
||||
def unattributed_gpu_usage():
|
||||
return (
|
||||
'label_replace((sum('
|
||||
@ -375,8 +387,17 @@ def unattributed_gpu_usage():
|
||||
)
|
||||
|
||||
|
||||
def gpu_utilization_raw(scope_var):
    """Return the un-normalized GPU utilization PromQL expression.

    Combines the expression from ``nvidia_process_gpu_usage_by_namespace``
    (built for ``scope_var``) with the expression from
    ``unattributed_gpu_usage`` using PromQL ``or``, so the second operand
    only contributes series the first one does not already provide.

    The result is NOT wrapped in outer parentheses; callers that embed it
    next to other operators must add their own.
    """
    by_namespace = nvidia_process_gpu_usage_by_namespace(scope_var)
    unattributed = unattributed_gpu_usage()
    return "({}) or ({})".format(by_namespace, unattributed)
|
||||
|
||||
|
||||
def gpu_pool_used_expr(scope_var):
    """Return a PromQL expression for the percentage of the GPU pool in use.

    The numerator is the sum of ``gpu_utilization_raw(scope_var)``, falling
    back to ``vector(0)`` when there are no series. The denominator is
    ``gpu_capacity_percent()`` clamped to a minimum of 1 so the division
    never hits zero. The ratio is scaled by 100.
    """
    # Numerator: summed raw utilization, defaulting to 0 when nothing reports.
    usage_sum = "(sum({}) or on() vector(0))".format(gpu_utilization_raw(scope_var))
    # Denominator: pool capacity, floored at 1 to avoid dividing by zero.
    capacity_floor = "clamp_min({}, 1)".format(gpu_capacity_percent())
    return " ".join(("100 *", usage_sum, "/", capacity_floor))
|
||||
|
||||
|
||||
def namespace_gpu_share_expr(scope_var):
|
||||
utilization_raw = f"({nvidia_process_gpu_usage_by_namespace(scope_var)}) or ({unattributed_gpu_usage()})"
|
||||
utilization_raw = gpu_utilization_raw(scope_var)
|
||||
total_raw = f"(sum({utilization_raw}) or on() vector(0))"
|
||||
capacity = gpu_capacity_percent()
|
||||
utilization = f"100 * ({utilization_raw}) / clamp_min({capacity}, 1)"
|
||||
@ -1851,6 +1872,8 @@ OVERVIEW_PANEL_DESCRIPTIONS = {
|
||||
"Postgres Connections Used": "Current Postgres connections; lower leaves room for apps during spikes.",
|
||||
"Postgres Hottest Connections": "Database with the most active connections; high values identify the pressure source.",
|
||||
"Namespace CPU Share": "CPU share by namespace in the selected scope; big slices show who is using compute.",
|
||||
"GPU Pool Used": "Current process-level GPU utilization across the monitored NVIDIA GPU pool.",
|
||||
"GPU Active Devices": "Active NVIDIA GPUs compared with total monitored NVIDIA GPUs.",
|
||||
"Namespace GPU Utilization": "Instant NVIDIA process-level GPU utilization normalized to the monitored GPU pool. Host covers non-Kubernetes processes; unused fills remaining capacity; idle appears only at zero activity.",
|
||||
"Namespace RAM Share": "Memory share by namespace in the selected scope; big slices show who may drive pressure.",
|
||||
"Worker Node CPU": "Worker CPU over time; lower is calmer, sustained high load may need rescheduling.",
|
||||
@ -2851,6 +2874,39 @@ def build_overview():
|
||||
gpu_scope = "$namespace_scope_gpu"
|
||||
ram_scope = "$namespace_scope_ram"
|
||||
|
||||
panels.append(
|
||||
stat_panel(
|
||||
48,
|
||||
"GPU Pool Used",
|
||||
gpu_pool_used_expr(gpu_scope),
|
||||
{"h": 2, "w": 4, "x": 8, "y": 21},
|
||||
unit="percent",
|
||||
decimals=1,
|
||||
instant=True,
|
||||
thresholds=PERCENT_THRESHOLDS,
|
||||
links=overview_link("atlas-gpu"),
|
||||
description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Pool Used"],
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
49,
|
||||
"GPU Active Devices",
|
||||
"",
|
||||
{"h": 2, "w": 4, "x": 12, "y": 21},
|
||||
unit="none",
|
||||
decimals=0,
|
||||
text_mode="name_and_value",
|
||||
instant=True,
|
||||
targets=[
|
||||
{"expr": gpu_active_devices_expr(), "refId": "A", "legendFormat": "active"},
|
||||
{"expr": gpu_total_devices_expr(), "refId": "B", "legendFormat": "total"},
|
||||
],
|
||||
links=overview_link("atlas-gpu"),
|
||||
description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Active Devices"],
|
||||
)
|
||||
)
|
||||
|
||||
panels.append(
|
||||
pie_panel(
|
||||
11,
|
||||
@ -5430,6 +5486,36 @@ def build_gpu_dashboard():
|
||||
description="DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value.",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
5,
|
||||
"GPU Pool Used",
|
||||
gpu_pool_used_expr(gpu_scope),
|
||||
{"h": 3, "w": 6, "x": 0, "y": 16},
|
||||
unit="percent",
|
||||
decimals=1,
|
||||
instant=True,
|
||||
thresholds=PERCENT_THRESHOLDS,
|
||||
description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Pool Used"],
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
6,
|
||||
"GPU Active Devices",
|
||||
"",
|
||||
{"h": 3, "w": 6, "x": 6, "y": 16},
|
||||
unit="none",
|
||||
decimals=0,
|
||||
text_mode="name_and_value",
|
||||
instant=True,
|
||||
targets=[
|
||||
{"expr": gpu_active_devices_expr(), "refId": "A", "legendFormat": "active"},
|
||||
{"expr": gpu_total_devices_expr(), "refId": "B", "legendFormat": "total"},
|
||||
],
|
||||
description=OVERVIEW_PANEL_DESCRIPTIONS["GPU Active Devices"],
|
||||
)
|
||||
)
|
||||
return {
|
||||
"uid": "atlas-gpu",
|
||||
"title": "Atlas GPU",
|
||||
|
||||
@ -166,6 +166,14 @@ def test_overview_uses_readable_quality_power_and_gitops_panels():
|
||||
assert 'namespace", "idle"' in gpu_expr
|
||||
assert panels_by_title["Namespace GPU Utilization"]["targets"][0]["instant"] is True
|
||||
|
||||
gpu_pool_expr = panels_by_title["GPU Pool Used"]["targets"][0]["expr"]
|
||||
assert "nvidia_namespace_gpu_sm_util_percent" in gpu_pool_expr
|
||||
assert "nvidia_gpu_device_utilization_percent" in gpu_pool_expr
|
||||
assert panels_by_title["GPU Pool Used"]["targets"][0]["instant"] is True
|
||||
active_targets = panels_by_title["GPU Active Devices"]["targets"]
|
||||
assert any("nvidia_gpu_device_utilization_percent > bool 0" in target["expr"] for target in active_targets)
|
||||
assert any("count(nvidia_gpu_device_utilization_percent)" in target["expr"] for target in active_targets)
|
||||
|
||||
|
||||
def test_overview_and_testing_panels_all_have_concise_descriptions():
|
||||
mod = load_module()
|
||||
|
||||
@ -189,6 +189,147 @@
|
||||
}
|
||||
],
|
||||
"description": "DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value."
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "stat",
|
||||
"title": "GPU Pool Used",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 16
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "dark-green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "dark-yellow",
|
||||
"value": 50
|
||||
},
|
||||
{
|
||||
"color": "dark-orange",
|
||||
"value": 75
|
||||
},
|
||||
{
|
||||
"color": "dark-red",
|
||||
"value": 91.5
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
},
|
||||
"decimals": 1
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
},
|
||||
"description": "Current process-level GPU utilization across the monitored NVIDIA GPU pool."
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "stat",
|
||||
"title": "GPU Active Devices",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 16
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "((sum(nvidia_gpu_device_utilization_percent > bool 0)) or (sum((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) > bool 0) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
|
||||
"refId": "A",
|
||||
"legendFormat": "active",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "((count(nvidia_gpu_device_utilization_percent)) or (count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
|
||||
"refId": "B",
|
||||
"legendFormat": "total",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "rgba(115, 115, 115, 1)",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "dark-green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "none",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
},
|
||||
"decimals": 0
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "name_and_value"
|
||||
},
|
||||
"description": "Active NVIDIA GPUs compared with total monitored NVIDIA GPUs."
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
|
||||
@ -3643,6 +3643,161 @@
|
||||
},
|
||||
"description": "Database with the most active connections; high values identify the pressure source."
|
||||
},
|
||||
{
|
||||
"id": 48,
|
||||
"type": "stat",
|
||||
"title": "GPU Pool Used",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 2,
|
||||
"w": 4,
|
||||
"x": 8,
|
||||
"y": 21
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "dark-green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "dark-yellow",
|
||||
"value": 50
|
||||
},
|
||||
{
|
||||
"color": "dark-orange",
|
||||
"value": 75
|
||||
},
|
||||
{
|
||||
"color": "dark-red",
|
||||
"value": 91.5
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
},
|
||||
"decimals": 1
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
},
|
||||
"links": [
|
||||
{
|
||||
"title": "Open atlas-gpu dashboard",
|
||||
"url": "/d/atlas-gpu",
|
||||
"targetBlank": true
|
||||
}
|
||||
],
|
||||
"description": "Current process-level GPU utilization across the monitored NVIDIA GPU pool."
|
||||
},
|
||||
{
|
||||
"id": 49,
|
||||
"type": "stat",
|
||||
"title": "GPU Active Devices",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 2,
|
||||
"w": 4,
|
||||
"x": 12,
|
||||
"y": 21
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "((sum(nvidia_gpu_device_utilization_percent > bool 0)) or (sum((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) > bool 0) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
|
||||
"refId": "A",
|
||||
"legendFormat": "active",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "((count(nvidia_gpu_device_utilization_percent)) or (count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
|
||||
"refId": "B",
|
||||
"legendFormat": "total",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "rgba(115, 115, 115, 1)",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "dark-green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "none",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
},
|
||||
"decimals": 0
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "name_and_value"
|
||||
},
|
||||
"links": [
|
||||
{
|
||||
"title": "Open atlas-gpu dashboard",
|
||||
"url": "/d/atlas-gpu",
|
||||
"targetBlank": true
|
||||
}
|
||||
],
|
||||
"description": "Active NVIDIA GPUs compared with total monitored NVIDIA GPUs."
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "piechart",
|
||||
|
||||
@ -198,6 +198,147 @@ data:
|
||||
}
|
||||
],
|
||||
"description": "DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value."
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "stat",
|
||||
"title": "GPU Pool Used",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 16
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "dark-green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "dark-yellow",
|
||||
"value": 50
|
||||
},
|
||||
{
|
||||
"color": "dark-orange",
|
||||
"value": 75
|
||||
},
|
||||
{
|
||||
"color": "dark-red",
|
||||
"value": 91.5
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
},
|
||||
"decimals": 1
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
},
|
||||
"description": "Current process-level GPU utilization across the monitored NVIDIA GPU pool."
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "stat",
|
||||
"title": "GPU Active Devices",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 16
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "((sum(nvidia_gpu_device_utilization_percent > bool 0)) or (sum((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) > bool 0) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
|
||||
"refId": "A",
|
||||
"legendFormat": "active",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "((count(nvidia_gpu_device_utilization_percent)) or (count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
|
||||
"refId": "B",
|
||||
"legendFormat": "total",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "rgba(115, 115, 115, 1)",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "dark-green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "none",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
},
|
||||
"decimals": 0
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "name_and_value"
|
||||
},
|
||||
"description": "Active NVIDIA GPUs compared with total monitored NVIDIA GPUs."
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
|
||||
@ -3652,6 +3652,161 @@ data:
|
||||
},
|
||||
"description": "Database with the most active connections; high values identify the pressure source."
|
||||
},
|
||||
{
|
||||
"id": 48,
|
||||
"type": "stat",
|
||||
"title": "GPU Pool Used",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 2,
|
||||
"w": 4,
|
||||
"x": 8,
|
||||
"y": 21
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) / clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)), 1)",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "dark-green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "dark-yellow",
|
||||
"value": 50
|
||||
},
|
||||
{
|
||||
"color": "dark-orange",
|
||||
"value": 75
|
||||
},
|
||||
{
|
||||
"color": "dark-red",
|
||||
"value": 91.5
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
},
|
||||
"decimals": 1
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
},
|
||||
"links": [
|
||||
{
|
||||
"title": "Open atlas-gpu dashboard",
|
||||
"url": "/d/atlas-gpu",
|
||||
"targetBlank": true
|
||||
}
|
||||
],
|
||||
"description": "Current process-level GPU utilization across the monitored NVIDIA GPU pool."
|
||||
},
|
||||
{
|
||||
"id": 49,
|
||||
"type": "stat",
|
||||
"title": "GPU Active Devices",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 2,
|
||||
"w": 4,
|
||||
"x": 12,
|
||||
"y": 21
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "((sum(nvidia_gpu_device_utilization_percent > bool 0)) or (sum((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) > bool 0) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
|
||||
"refId": "A",
|
||||
"legendFormat": "active",
|
||||
"instant": true
|
||||
},
|
||||
{
|
||||
"expr": "((count(nvidia_gpu_device_utilization_percent)) or (count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0))",
|
||||
"refId": "B",
|
||||
"legendFormat": "total",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "rgba(115, 115, 115, 1)",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "dark-green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "none",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
},
|
||||
"decimals": 0
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "name_and_value"
|
||||
},
|
||||
"links": [
|
||||
{
|
||||
"title": "Open atlas-gpu dashboard",
|
||||
"url": "/d/atlas-gpu",
|
||||
"targetBlank": true
|
||||
}
|
||||
],
|
||||
"description": "Active NVIDIA GPUs compared with total monitored NVIDIA GPUs."
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "piechart",
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user