185 lines
5.2 KiB
JSON
Raw Normal View History

{
"uid": "atlas-gpu",
"title": "Atlas GPU",
"folderUid": "atlas-internal",
"editable": true,
"panels": [
{
"id": 1,
"type": "piechart",
"title": "Namespace GPU Share",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"displayLabels": [
"percent"
],
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
}
},
{
"id": 2,
"type": "timeseries",
"title": "GPU Util by Namespace",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"targets": [
{
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 3,
"type": "timeseries",
"title": "GPU Util by Node",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"targets": [
{
"expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})",
"refId": "A",
"legendFormat": "{{Hostname}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 4,
"type": "table",
"title": "Top Pods by GPU Util",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
},
"targets": [
{
"expr": "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"showHeader": true
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
}
]
}
],
"time": {
"from": "now-12h",
"to": "now"
},
"annotations": {
"list": []
},
"schemaVersion": 39,
"style": "dark",
"tags": [
"atlas",
"gpu"
]
}