{
  "uid": "atlas-gpu",
  "title": "Atlas GPU",
  "folderUid": "atlas-internal",
  "editable": true,
  "panels": [
    {
      "id": 1,
      "type": "piechart",
      "title": "Namespace GPU Share",
      "description": "Each namespace's percentage of the total of per-namespace peak DCGM_FI_DEV_GPU_UTIL over the selected time range. Only the top 10 namespaces by a combined score (CPU usage rate + working-set memory in GB + GPU requests/limits) are included; namespaces with CPU usage but no GPU samples count as 0. The total is clamped to a minimum of 1 to avoid division by zero and wrapped in scalar() so the division does not depend on label matching.",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 0
      },
      "targets": [
        {
          "expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / scalar(clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1))",
          "refId": "A",
          "instant": true,
          "range": false,
          "legendFormat": "{{namespace}}"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "percent",
          "color": {
            "mode": "palette-classic"
          }
        },
        "overrides": []
      },
      "options": {
        "legend": {
          "displayMode": "list",
          "placement": "right"
        },
        "pieType": "pie",
        "displayLabels": [
          "percent"
        ],
        "tooltip": {
          "mode": "single"
        },
        "colorScheme": "interpolateSpectral",
        "colorBy": "value",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        }
      }
    },
    {
      "id": 2,
      "type": "timeseries",
      "title": "GPU Util by Namespace",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 12,
        "y": 0
      },
      "targets": [
        {
          "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)",
          "refId": "A",
          "legendFormat": "{{namespace}}"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "percent"
        },
        "overrides": []
      },
      "options": {
        "legend": {
          "displayMode": "table",
          "placement": "right"
        },
        "tooltip": {
          "mode": "multi"
        }
      }
    },
    {
      "id": 3,
      "type": "timeseries",
      "title": "GPU Util by Node",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 8
      },
      "targets": [
        {
          "expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})",
          "refId": "A",
          "legendFormat": "{{Hostname}}"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "percent"
        },
        "overrides": []
      },
      "options": {
        "legend": {
          "displayMode": "table",
          "placement": "right"
        },
        "tooltip": {
          "mode": "multi"
        }
      }
    },
    {
      "id": 4,
      "type": "table",
      "title": "Top Pods by GPU Util",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 12,
        "y": 8
      },
      "targets": [
        {
          "expr": "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "percent"
        },
        "overrides": []
      },
      "options": {
        "showHeader": true
      },
      "transformations": [
        {
          "id": "labelsToFields",
          "options": {}
        }
      ]
    }
  ],
  "time": {
    "from": "now-12h",
    "to": "now"
  },
  "annotations": {
    "list": []
  },
  "schemaVersion": 39,
  "style": "dark",
  "tags": [
    "atlas",
    "gpu"
  ]
}