2025-12-02 13:16:00 -03:00
# services/monitoring/grafana-dashboard-gpu.yaml
apiVersion : v1
kind : ConfigMap
metadata :
name : grafana-dashboard-gpu
labels :
grafana_dashboard : "1"
data :
atlas-gpu.json : |
{
"uid": "atlas-gpu" ,
"title": "Atlas GPU" ,
"folderUid": "atlas-internal" ,
"editable": true ,
"panels": [
{
"id": 1 ,
"type": "piechart" ,
2025-12-02 14:41:39 -03:00
"title": "Namespace GPU Share" ,
2025-12-02 13:16:00 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
"gridPos": {
"h": 8 ,
"w": 12 ,
"x": 0 ,
"y": 0
},
"targets": [
{
2026-01-01 14:44:33 -03:00
"expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" ,
2025-12-02 13:16:00 -03:00
"refId": "A" ,
"legendFormat": "{{namespace}}"
}
] ,
"fieldConfig": {
"defaults": {
"unit": "percent" ,
"color": {
"mode": "palette-classic"
}
},
"overrides": [ ]
},
"options": {
"legend": {
"displayMode": "list" ,
"placement": "right"
},
"pieType": "pie" ,
2025-12-12 20:40:32 -03:00
"displayLabels": [ ] ,
2025-12-02 13:16:00 -03:00
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral" ,
"colorBy": "value" ,
"reduceOptions": {
"calcs": [
"lastNotNull"
] ,
"fields": "" ,
"values": false
}
2026-01-01 14:44:33 -03:00
},
"links": [
{
"title": "Workload namespaces only" ,
2026-01-18 02:50:07 -03:00
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}" ,
2026-01-01 14:44:33 -03:00
"targetBlank": false
},
{
"title": "All namespaces" ,
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22.%2A%22&var-namespace_scope_ram=${namespace_scope_ram}" ,
"targetBlank": false
},
{
"title": "Infrastructure namespaces only" ,
2026-01-18 02:50:07 -03:00
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}" ,
2026-01-01 14:44:33 -03:00
"targetBlank": false
}
] ,
2026-01-18 02:50:07 -03:00
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator."
2025-12-02 13:16:00 -03:00
},
{
"id": 2 ,
"type": "timeseries" ,
2025-12-02 14:41:39 -03:00
"title": "GPU Util by Namespace" ,
2025-12-02 13:16:00 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
"gridPos": {
"h": 8 ,
"w": 12 ,
"x": 12 ,
"y": 0
},
"targets": [
{
2026-01-01 14:44:33 -03:00
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)" ,
2025-12-02 13:16:00 -03:00
"refId": "A" ,
"legendFormat": "{{namespace}}"
}
] ,
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": [ ]
},
"options": {
"legend": {
"displayMode": "table" ,
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 3 ,
"type": "timeseries" ,
2025-12-02 14:41:39 -03:00
"title": "GPU Util by Node" ,
2025-12-02 13:16:00 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
"gridPos": {
"h": 8 ,
"w": 12 ,
"x": 0 ,
"y": 8
},
"targets": [
{
"expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})" ,
"refId": "A" ,
"legendFormat": "{{Hostname}}"
}
] ,
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": [ ]
},
"options": {
"legend": {
"displayMode": "table" ,
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 4 ,
"type": "table" ,
2025-12-02 14:41:39 -03:00
"title": "Top Pods by GPU Util" ,
2025-12-02 13:16:00 -03:00
"datasource": {
"type": "prometheus" ,
"uid": "atlas-vm"
},
"gridPos": {
"h": 8 ,
"w": 12 ,
"x": 12 ,
"y": 8
},
"targets": [
{
"expr": "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))" ,
"refId": "A"
}
] ,
"fieldConfig": {
"defaults": {
2025-12-13 18:23:19 -03:00
"unit": "percent" ,
"custom": {
"filterable": true
}
2025-12-02 13:16:00 -03:00
},
"overrides": [ ]
},
"options": {
2025-12-13 18:23:19 -03:00
"showHeader": true ,
"columnFilters": false
2025-12-02 13:16:00 -03:00
},
"transformations": [
{
"id": "labelsToFields" ,
"options": {}
}
]
}
] ,
"time": {
"from": "now-12h" ,
"to": "now"
},
"annotations": {
"list": [ ]
},
"schemaVersion": 39 ,
"style": "dark" ,
"tags": [
"atlas" ,
"gpu"
2026-01-01 14:16:08 -03:00
] ,
"templating": {
"list": [
{
2026-01-01 14:44:33 -03:00
"name": "namespace_scope_cpu" ,
"label": "CPU namespace filter" ,
"type": "custom" ,
2026-01-18 02:50:07 -03:00
"query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"current": {
"text": "workload namespaces only" ,
2026-01-18 02:50:07 -03:00
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"selected": true
},
"options": [
{
"text": "workload namespaces only" ,
2026-01-18 02:50:07 -03:00
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"selected": true
},
{
"text": "all namespaces" ,
"value": "namespace=~\".*\"" ,
"selected": false
},
{
"text": "infrastructure namespaces only" ,
2026-01-18 02:50:07 -03:00
"value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"selected": false
}
] ,
"hide": 2 ,
"multi": false ,
"includeAll": false ,
"refresh": 1 ,
"sort": 0 ,
"skipUrlSync": false
},
{
"name": "namespace_scope_gpu" ,
"label": "GPU namespace filter" ,
"type": "custom" ,
2026-01-18 02:50:07 -03:00
"query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"current": {
"text": "workload namespaces only" ,
2026-01-18 02:50:07 -03:00
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"selected": true
},
"options": [
{
"text": "workload namespaces only" ,
2026-01-18 02:50:07 -03:00
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"selected": true
},
{
"text": "all namespaces" ,
"value": "namespace=~\".*\"" ,
"selected": false
},
{
"text": "infrastructure namespaces only" ,
2026-01-18 02:50:07 -03:00
"value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"selected": false
}
] ,
"hide": 2 ,
"multi": false ,
"includeAll": false ,
"refresh": 1 ,
"sort": 0 ,
"skipUrlSync": false
},
{
"name": "namespace_scope_ram" ,
"label": "RAM namespace filter" ,
2026-01-01 14:16:08 -03:00
"type": "custom" ,
2026-01-18 02:50:07 -03:00
"query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:16:08 -03:00
"current": {
"text": "workload namespaces only" ,
2026-01-18 02:50:07 -03:00
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:16:08 -03:00
"selected": true
},
"options": [
{
"text": "workload namespaces only" ,
2026-01-18 02:50:07 -03:00
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:16:08 -03:00
"selected": true
},
{
"text": "all namespaces" ,
"value": "namespace=~\".*\"" ,
"selected": false
},
{
"text": "infrastructure namespaces only" ,
2026-01-18 02:50:07 -03:00
"value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:16:08 -03:00
"selected": false
}
] ,
2026-01-01 14:44:33 -03:00
"hide": 2 ,
2026-01-01 14:16:08 -03:00
"multi": false ,
"includeAll": false ,
"refresh": 1 ,
"sort": 0 ,
"skipUrlSync": false
}
]
}
2025-12-02 13:16:00 -03:00
}