# services/monitoring/grafana-dashboard-gpu.yaml
#
# ConfigMap carrying the "Atlas GPU" Grafana dashboard (uid: atlas-gpu) as a
# JSON document under the key `atlas-gpu.json`.
# NOTE(review): the `grafana_dashboard: "1"` label is presumably what a Grafana
# dashboard sidecar selects on -- confirm against the Grafana deployment.
#
# The JSON lives in a literal block scalar, so it cannot contain comments;
# everything worth knowing about it is documented here instead.
#
# Panels (all read DCGM_FI_DEV_GPU_UTIL from the datasource with uid atlas-vm):
#   1. Namespace GPU Share    (piechart)   - per-namespace share of summed GPU util, in percent
#   2. GPU Util by Namespace  (timeseries) - summed GPU util per namespace
#   3. GPU Util by Node       (timeseries) - summed GPU util per Hostname (not namespace-scoped)
#   4. Top Pods by GPU Util   (table)      - topk(10) by namespace/pod/Hostname (not namespace-scoped)
#
# Namespace scoping:
#   * `$namespace_scope_gpu` is a custom variable whose VALUE is a complete
#     label matcher (e.g. namespace=~".*"), spliced directly into the selector
#     of panels 1 and 2.
#   * The three namespace_scope_* variables are declared with `hide: 2`; the
#     scope is switched through the panel links on panel 1, which rewrite
#     var-namespace_scope_gpu and pass the cpu/ram values through unchanged.
#
# Panel 1 query:
#   share = 100 * per-namespace sum / total sum
#   * `clamp_min(..., 1)` keeps the divisor >= 1 so an all-idle cluster does
#     not divide by zero.
#   * `or on() vector(0)` supplies a total of 0 when the selector matches nothing.
#   * When the total is 0 a synthetic namespace="idle" series with value 100 is
#     emitted so the pie chart is never empty.
#   * The division uses `on() group_left()` because the left side carries a
#     `namespace` label while the total has no labels. Standard PromQL
#     one-to-one matching would pair nothing and return an empty result;
#     the explicit many-to-one match does not depend on the query engine
#     treating a label-less series as a scalar.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-gpu
  labels:
    grafana_dashboard: "1"
data:
  atlas-gpu.json: |
    {
      "uid": "atlas-gpu",
      "title": "Atlas GPU",
      "folderUid": "atlas-internal",
      "editable": true,
      "panels": [
        {
          "id": 1,
          "type": "piechart",
          "title": "Namespace GPU Share",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
          },
          "gridPos": {
            "h": 8,
            "w": 12,
            "x": 0,
            "y": 0
          },
          "targets": [
            {
              "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / on() group_left() clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))",
              "refId": "A",
              "legendFormat": "{{namespace}}"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "percent",
              "color": {
                "mode": "palette-classic"
              }
            },
            "overrides": []
          },
          "options": {
            "legend": {
              "displayMode": "list",
              "placement": "right"
            },
            "pieType": "pie",
            "displayLabels": [],
            "tooltip": {
              "mode": "single"
            },
            "colorScheme": "interpolateSpectral",
            "colorBy": "value",
            "reduceOptions": {
              "calcs": [
                "lastNotNull"
              ],
              "fields": "",
              "values": false
            }
          },
          "links": [
            {
              "title": "Workload namespaces only",
              "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
              "targetBlank": false
            },
            {
              "title": "All namespaces",
              "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22.%2A%22&var-namespace_scope_ram=${namespace_scope_ram}",
              "targetBlank": false
            },
            {
              "title": "Infrastructure namespaces only",
              "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
              "targetBlank": false
            }
          ],
          "description": "Values are normalized within the selected scope; use panel links to switch scope."
        },
        {
          "id": 2,
          "type": "timeseries",
          "title": "GPU Util by Namespace",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
          },
          "gridPos": {
            "h": 8,
            "w": 12,
            "x": 12,
            "y": 0
          },
          "targets": [
            {
              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)",
              "refId": "A",
              "legendFormat": "{{namespace}}"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "percent"
            },
            "overrides": []
          },
          "options": {
            "legend": {
              "displayMode": "table",
              "placement": "right"
            },
            "tooltip": {
              "mode": "multi"
            }
          }
        },
        {
          "id": 3,
          "type": "timeseries",
          "title": "GPU Util by Node",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
          },
          "gridPos": {
            "h": 8,
            "w": 12,
            "x": 0,
            "y": 8
          },
          "targets": [
            {
              "expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})",
              "refId": "A",
              "legendFormat": "{{Hostname}}"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "percent"
            },
            "overrides": []
          },
          "options": {
            "legend": {
              "displayMode": "table",
              "placement": "right"
            },
            "tooltip": {
              "mode": "multi"
            }
          }
        },
        {
          "id": 4,
          "type": "table",
          "title": "Top Pods by GPU Util",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
          },
          "gridPos": {
            "h": 8,
            "w": 12,
            "x": 12,
            "y": 8
          },
          "targets": [
            {
              "expr": "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "percent",
              "custom": {
                "filterable": true
              }
            },
            "overrides": []
          },
          "options": {
            "showHeader": true,
            "columnFilters": false
          },
          "transformations": [
            {
              "id": "labelsToFields",
              "options": {}
            }
          ]
        }
      ],
      "time": {
        "from": "now-12h",
        "to": "now"
      },
      "annotations": {
        "list": []
      },
      "schemaVersion": 39,
      "style": "dark",
      "tags": [
        "atlas",
        "gpu"
      ],
      "templating": {
        "list": [
          {
            "name": "namespace_scope_cpu",
            "label": "CPU namespace filter",
            "type": "custom",
            "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
            "current": {
              "text": "workload namespaces only",
              "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
              "selected": true
            },
            "options": [
              {
                "text": "workload namespaces only",
                "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
                "selected": true
              },
              {
                "text": "all namespaces",
                "value": "namespace=~\".*\"",
                "selected": false
              },
              {
                "text": "infrastructure namespaces only",
                "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
                "selected": false
              }
            ],
            "hide": 2,
            "multi": false,
            "includeAll": false,
            "refresh": 1,
            "sort": 0,
            "skipUrlSync": false
          },
          {
            "name": "namespace_scope_gpu",
            "label": "GPU namespace filter",
            "type": "custom",
            "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
            "current": {
              "text": "workload namespaces only",
              "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
              "selected": true
            },
            "options": [
              {
                "text": "workload namespaces only",
                "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
                "selected": true
              },
              {
                "text": "all namespaces",
                "value": "namespace=~\".*\"",
                "selected": false
              },
              {
                "text": "infrastructure namespaces only",
                "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
                "selected": false
              }
            ],
            "hide": 2,
            "multi": false,
            "includeAll": false,
            "refresh": 1,
            "sort": 0,
            "skipUrlSync": false
          },
          {
            "name": "namespace_scope_ram",
            "label": "RAM namespace filter",
            "type": "custom",
            "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
            "current": {
              "text": "workload namespaces only",
              "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
              "selected": true
            },
            "options": [
              {
                "text": "workload namespaces only",
                "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
                "selected": true
              },
              {
                "text": "all namespaces",
                "value": "namespace=~\".*\"",
                "selected": false
              },
              {
                "text": "infrastructure namespaces only",
                "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
                "selected": false
              }
            ],
            "hide": 2,
            "multi": false,
            "includeAll": false,
            "refresh": 1,
            "sort": 0,
            "skipUrlSync": false
          }
        ]
      }
    }