# services/monitoring/grafana-dashboard-gpu.yaml
#
# ConfigMap that carries the "Atlas GPU" Grafana dashboard (uid: atlas-gpu) as a
# JSON document under data["atlas-gpu.json"].
# NOTE(review): the grafana_dashboard: "1" label is presumably what the Grafana
# dashboard sidecar selects on - confirm against the Grafana deployment.
#
# Panels (all query the datasource with uid "atlas-vm"):
#   1. Namespace GPU Share (piechart) - each namespace's GPU usage as a
#      percentage of the total across the selected scope; when that total is 0
#      a single "idle" slice with value 100 is emitted instead.
#   2. GPU Util by Namespace (timeseries) - per node, GPU utilisation is split
#      between namespaces in proportion to their nvidia.com/gpu* requests on
#      that node, then summed per namespace.
#   3. GPU Util by Node (timeseries) - DCGM utilisation averaged per node, or
#      jetson_gr3d_freq_percent for nodes that only expose that metric.
#   4. Top Pods by GPU Util (table) - topk(10) of DCGM utilisation per pod.
#
# Query notes:
#   * The per-node request fraction divides a (namespace,node) vector by a
#     (node) vector, so the division carries "on(node) group_left()". Without
#     that modifier the label sets never match and the result is empty.
#   * NOTE(review): the pie normalisation divides per-namespace series by a
#     label-less sum(...) vector without a matching modifier. Stock Prometheus
#     would not match those; confirm the atlas-vm datasource accepts it.
#
# Variables: namespace_scope_cpu / _gpu / _ram are custom variables with
# hide: 2 whose values are raw label matchers spliced into selectors. Only
# $namespace_scope_gpu is used by the queries here; the pie chart's panel links
# switch it through the URL while passing the cpu/ram values through unchanged.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-gpu
  labels:
    grafana_dashboard: "1"
data:
  atlas-gpu.json: |
    {
      "uid": "atlas-gpu",
      "title": "Atlas GPU",
      "folderUid": "atlas-internal",
      "editable": true,
      "panels": [
        {
          "id": 1,
          "type": "piechart",
          "title": "Namespace GPU Share",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
          },
          "gridPos": {
            "h": 8,
            "w": 12,
            "x": 0,
            "y": 0
          },
          "targets": [
            {
              "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
              "refId": "A",
              "legendFormat": "{{namespace}}"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "percent",
              "color": {
                "mode": "palette-classic"
              }
            },
            "overrides": []
          },
          "options": {
            "legend": {
              "displayMode": "list",
              "placement": "right"
            },
            "pieType": "pie",
            "displayLabels": [],
            "tooltip": {
              "mode": "single"
            },
            "colorScheme": "interpolateSpectral",
            "colorBy": "value",
            "reduceOptions": {
              "calcs": [
                "lastNotNull"
              ],
              "fields": "",
              "values": false
            }
          },
          "links": [
            {
              "title": "Workload namespaces only",
              "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
              "targetBlank": false
            },
            {
              "title": "All namespaces",
              "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22.%2A%22&var-namespace_scope_ram=${namespace_scope_ram}",
              "targetBlank": false
            },
            {
              "title": "Infrastructure namespaces only",
              "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
              "targetBlank": false
            }
          ],
          "description": "Shares are normalized within the selected filter. Switching scope changes the denominator."
        },
        {
          "id": 2,
          "type": "timeseries",
          "title": "GPU Util by Namespace",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
          },
          "gridPos": {
            "h": 8,
            "w": 12,
            "x": 12,
            "y": 0
          },
          "targets": [
            {
              "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
              "refId": "A",
              "legendFormat": "{{namespace}}"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "percent"
            },
            "overrides": []
          },
          "options": {
            "legend": {
              "displayMode": "table",
              "placement": "right"
            },
            "tooltip": {
              "mode": "multi"
            }
          }
        },
        {
          "id": 3,
          "type": "timeseries",
          "title": "GPU Util by Node",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
          },
          "gridPos": {
            "h": 8,
            "w": 12,
            "x": 0,
            "y": 8
          },
          "targets": [
            {
              "expr": "label_replace(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\")",
              "refId": "A",
              "legendFormat": "{{Hostname}}"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "percent"
            },
            "overrides": []
          },
          "options": {
            "legend": {
              "displayMode": "table",
              "placement": "right"
            },
            "tooltip": {
              "mode": "multi"
            }
          }
        },
        {
          "id": 4,
          "type": "table",
          "title": "Top Pods by GPU Util",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
          },
          "gridPos": {
            "h": 8,
            "w": 12,
            "x": 12,
            "y": 8
          },
          "targets": [
            {
              "expr": "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "percent",
              "custom": {
                "filterable": true
              }
            },
            "overrides": []
          },
          "options": {
            "showHeader": true,
            "columnFilters": false
          },
          "transformations": [
            {
              "id": "labelsToFields",
              "options": {}
            }
          ]
        }
      ],
      "time": {
        "from": "now-12h",
        "to": "now"
      },
      "annotations": {
        "list": []
      },
      "schemaVersion": 39,
      "style": "dark",
      "tags": [
        "atlas",
        "gpu"
      ],
      "templating": {
        "list": [
          {
            "name": "namespace_scope_cpu",
            "label": "CPU namespace filter",
            "type": "custom",
            "query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
            "current": {
              "text": "workload namespaces only",
              "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
              "selected": true
            },
            "options": [
              {
                "text": "workload namespaces only",
                "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
                "selected": true
              },
              {
                "text": "all namespaces",
                "value": "namespace=~\".*\"",
                "selected": false
              },
              {
                "text": "infrastructure namespaces only",
                "value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
                "selected": false
              }
            ],
            "hide": 2,
            "multi": false,
            "includeAll": false,
            "refresh": 1,
            "sort": 0,
            "skipUrlSync": false
          },
          {
            "name": "namespace_scope_gpu",
            "label": "GPU namespace filter",
            "type": "custom",
            "query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
            "current": {
              "text": "workload namespaces only",
              "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
              "selected": true
            },
            "options": [
              {
                "text": "workload namespaces only",
                "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
                "selected": true
              },
              {
                "text": "all namespaces",
                "value": "namespace=~\".*\"",
                "selected": false
              },
              {
                "text": "infrastructure namespaces only",
                "value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
                "selected": false
              }
            ],
            "hide": 2,
            "multi": false,
            "includeAll": false,
            "refresh": 1,
            "sort": 0,
            "skipUrlSync": false
          },
          {
            "name": "namespace_scope_ram",
            "label": "RAM namespace filter",
            "type": "custom",
            "query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
            "current": {
              "text": "workload namespaces only",
              "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
              "selected": true
            },
            "options": [
              {
                "text": "workload namespaces only",
                "value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
                "selected": true
              },
              {
                "text": "all namespaces",
                "value": "namespace=~\".*\"",
                "selected": false
              },
              {
                "text": "infrastructure namespaces only",
                "value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
                "selected": false
              }
            ],
            "hide": 2,
            "multi": false,
            "includeAll": false,
            "refresh": 1,
            "sort": 0,
            "skipUrlSync": false
          }
        ]
      }
    }