---
# services/monitoring/grafana-dashboard-gpu.yaml
#
# ConfigMap carrying the "Atlas GPU" Grafana dashboard (uid: atlas-gpu) as a
# single JSON document under the data key "atlas-gpu.json".
#
# Panels, all reading from the prometheus-type datasource with uid "atlas-vm":
#   1. piechart   - Namespace GPU Utilization
#   2. timeseries - GPU Process Util by Namespace
#   3. timeseries - GPU Util by Node
#   4. table      - GPU Pods Reporting Device Util
#
# Templating defines three custom variables (namespace_scope_cpu,
# namespace_scope_gpu, namespace_scope_ram) whose values are PromQL label
# matchers. Panels 1 and 2 splice $namespace_scope_gpu directly into their
# selectors, and the links on panel 1 set it through the dashboard URL.
#
# NOTE: the payload below is strict JSON. It cannot carry comments, and every
# string literal (including the very long PromQL "expr" of panel 1) must stay
# on one physical line: a raw newline inside a JSON string is a parse error.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-gpu
  labels:
    # Quoted so the value stays a string rather than an integer.
    # Presumably the label a Grafana dashboard sidecar selects on - confirm
    # against the Grafana deployment.
    grafana_dashboard: "1"
data:
  atlas-gpu.json: |
    {
      "uid": "atlas-gpu",
      "title": "Atlas GPU",
      "folderUid": "atlas-internal",
      "editable": true,
      "panels": [
        {
          "id": 1,
          "type": "piechart",
          "title": "Namespace GPU Utilization",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
          },
          "gridPos": {
            "h": 8,
            "w": 12,
            "x": 0,
            "y": 0
          },
          "targets": [
            {
              "expr": "((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or (label_replace(clamp_min(((100 * count(nvidia_gpu_device_utilization_percent)) or (100 * count(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on() nvidia_gpu_device_utilization_percent) or on() vector(0)) - (sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) or (label_replace((sum(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) or on() vector(0)), \"namespace\", \"unattributed\", \"\", \"\") unless on() ((count(nvidia_gpu_device_utilization_percent) or on() vector(0)) > 0))) or on() vector(0)) == 0))",
              "refId": "A",
              "legendFormat": "{{namespace}}"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "percent",
              "color": {
                "mode": "palette-classic"
              }
            },
            "overrides": []
          },
          "options": {
            "legend": {
              "displayMode": "list",
              "placement": "right"
            },
            "pieType": "pie",
            "displayLabels": [],
            "tooltip": {
              "mode": "single"
            },
            "colorScheme": "interpolateSpectral",
            "colorBy": "value",
            "reduceOptions": {
              "calcs": [
                "lastNotNull"
              ],
              "fields": "",
              "values": false
            }
          },
          "links": [
            {
              "title": "Workload namespaces only",
              "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
              "targetBlank": false
            },
            {
              "title": "All namespaces",
              "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22.%2A%22&var-namespace_scope_ram=${namespace_scope_ram}",
              "targetBlank": false
            },
            {
              "title": "Infrastructure namespaces only",
              "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
              "targetBlank": false
            }
          ],
          "description": "Current NVIDIA process-level GPU utilization by namespace. Host covers non-Kubernetes processes; unused fills remaining capacity while active; idle appears only at zero activity."
        },
        {
          "id": 2,
          "type": "timeseries",
          "title": "GPU Process Util by Namespace",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
          },
          "gridPos": {
            "h": 8,
            "w": 12,
            "x": 12,
            "y": 0
          },
          "targets": [
            {
              "expr": "sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})",
              "refId": "A",
              "legendFormat": "{{namespace}}"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "percent"
            },
            "overrides": []
          },
          "options": {
            "legend": {
              "displayMode": "table",
              "placement": "right"
            },
            "tooltip": {
              "mode": "multi"
            }
          },
          "description": "NVML process-level SM utilization by namespace. Host covers GPU work outside Kubernetes pods."
        },
        {
          "id": 3,
          "type": "timeseries",
          "title": "GPU Util by Node",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
          },
          "gridPos": {
            "h": 8,
            "w": 12,
            "x": 0,
            "y": 8
          },
          "targets": [
            {
              "expr": "label_replace(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\")",
              "refId": "A",
              "legendFormat": "{{Hostname}}"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "percent"
            },
            "overrides": []
          },
          "options": {
            "legend": {
              "displayMode": "table",
              "placement": "right"
            },
            "tooltip": {
              "mode": "multi"
            }
          }
        },
        {
          "id": 4,
          "type": "table",
          "title": "GPU Pods Reporting Device Util",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
          },
          "gridPos": {
            "h": 8,
            "w": 12,
            "x": 12,
            "y": 8
          },
          "targets": [
            {
              "expr": "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "percent",
              "custom": {
                "filterable": true
              }
            },
            "overrides": []
          },
          "options": {
            "showHeader": true,
            "columnFilters": false
          },
          "transformations": [
            {
              "id": "labelsToFields",
              "options": {}
            }
          ],
          "description": "DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value."
        }
      ],
      "time": {
        "from": "now-12h",
        "to": "now"
      },
      "annotations": {
        "list": []
      },
      "schemaVersion": 39,
      "style": "dark",
      "tags": [
        "atlas",
        "gpu"
      ],
      "templating": {
        "list": [
          {
            "name": "namespace_scope_cpu",
            "label": "CPU namespace filter",
            "type": "custom",
            "query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
            "current": {
              "text": "workload namespaces only",
              "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
              "selected": true
            },
            "options": [
              {
                "text": "workload namespaces only",
                "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
                "selected": true
              },
              {
                "text": "all namespaces",
                "value": "namespace=~\".*\"",
                "selected": false
              },
              {
                "text": "infrastructure namespaces only",
                "value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
                "selected": false
              }
            ],
            "hide": 2,
            "multi": false,
            "includeAll": false,
            "refresh": 1,
            "sort": 0,
            "skipUrlSync": false
          },
          {
            "name": "namespace_scope_gpu",
            "label": "GPU namespace filter",
            "type": "custom",
            "query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
            "current": {
              "text": "workload namespaces only",
              "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
              "selected": true
            },
            "options": [
              {
                "text": "workload namespaces only",
                "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
                "selected": true
              },
              {
                "text": "all namespaces",
                "value": "namespace=~\".*\"",
                "selected": false
              },
              {
                "text": "infrastructure namespaces only",
                "value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
                "selected": false
              }
            ],
            "hide": 2,
            "multi": false,
            "includeAll": false,
            "refresh": 1,
            "sort": 0,
            "skipUrlSync": false
          },
          {
            "name": "namespace_scope_ram",
            "label": "RAM namespace filter",
            "type": "custom",
            "query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
            "current": {
              "text": "workload namespaces only",
              "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
              "selected": true
            },
            "options": [
              {
                "text": "workload namespaces only",
                "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
                "selected": true
              },
              {
                "text": "all namespaces",
                "value": "namespace=~\".*\"",
                "selected": false
              },
              {
                "text": "infrastructure namespaces only",
                "value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
                "selected": false
              }
            ],
            "hide": 2,
            "multi": false,
            "includeAll": false,
            "refresh": 1,
            "sort": 0,
            "skipUrlSync": false
          }
        ]
      }
    }