{
|
|
"uid": "atlas-gpu",
|
|
"title": "Atlas GPU",
|
|
"folderUid": "atlas-internal",
|
|
"editable": true,
|
|
"panels": [
|
|
{
|
|
"id": 1,
|
|
"type": "piechart",
|
|
"title": "Namespace GPU Share",
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "atlas-vm"
|
|
},
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 0
|
|
},
|
|
"targets": [
|
|
{
|
|
"expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))",
|
|
"refId": "A",
|
|
"legendFormat": "{{namespace}}"
|
|
}
|
|
],
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "percent",
|
|
"color": {
|
|
"mode": "palette-classic"
|
|
}
|
|
},
|
|
"overrides": []
|
|
},
|
|
"options": {
|
|
"legend": {
|
|
"displayMode": "list",
|
|
"placement": "right"
|
|
},
|
|
"pieType": "pie",
|
|
"displayLabels": [],
|
|
"tooltip": {
|
|
"mode": "single"
|
|
},
|
|
"colorScheme": "interpolateSpectral",
|
|
"colorBy": "value",
|
|
"reduceOptions": {
|
|
"calcs": [
|
|
"lastNotNull"
|
|
],
|
|
"fields": "",
|
|
"values": false
|
|
}
|
|
},
|
|
"links": [
|
|
{
|
|
"title": "Workload namespaces only",
|
|
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu:percentencode}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram:percentencode}",
|
|
"targetBlank": false
|
|
},
|
|
{
|
|
"title": "All namespaces",
|
|
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu:percentencode}&var-namespace_scope_gpu=namespace%3D~%22.%2A%22&var-namespace_scope_ram=${namespace_scope_ram:percentencode}",
|
|
"targetBlank": false
|
|
},
|
|
{
|
|
"title": "Infrastructure namespaces only",
|
|
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu:percentencode}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram:percentencode}",
|
|
"targetBlank": false
|
|
}
|
|
],
|
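|
"description": "Each slice is a namespace's share of the total GPU utilization across the namespaces matched by the selected scope filter, so switching scope changes the denominator. When that total is zero, a 100% slice labelled idle is shown."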
|
|
},
|
|
{
|
|
"id": 2,
|
|
"type": "timeseries",
|
|
"title": "GPU Util by Namespace",
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "atlas-vm"
|
|
},
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 12,
|
|
"y": 0
|
|
},
|
|
"targets": [
|
|
{
|
|
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)",
|
|
"refId": "A",
|
|
"legendFormat": "{{namespace}}"
|
|
}
|
|
],
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "percent"
|
|
},
|
|
"overrides": []
|
|
},
|
|
"options": {
|
|
"legend": {
|
|
"displayMode": "table",
|
|
"placement": "right"
|
|
},
|
|
"tooltip": {
|
|
"mode": "multi"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"id": 3,
|
|
"type": "timeseries",
|
|
"title": "GPU Util by Node",
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "atlas-vm"
|
|
},
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 8
|
|
},
|
|
"targets": [
|
|
{
|
|
"expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})",
|
|
"refId": "A",
|
|
"legendFormat": "{{Hostname}}"
|
|
}
|
|
],
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "percent"
|
|
},
|
|
"overrides": []
|
|
},
|
|
"options": {
|
|
"legend": {
|
|
"displayMode": "table",
|
|
"placement": "right"
|
|
},
|
|
"tooltip": {
|
|
"mode": "multi"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"id": 4,
|
|
"type": "table",
|
|
"title": "Top Pods by GPU Util",
|
|
"datasource": {
|
|
"type": "prometheus",
|
|
"uid": "atlas-vm"
|
|
},
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 12,
|
|
"y": 8
|
|
},
|
|
"targets": [
|
|
{
|
|
"expr": "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "percent",
|
|
"custom": {
|
|
"filterable": true
|
|
}
|
|
},
|
|
"overrides": []
|
|
},
|
|
"options": {
|
|
"showHeader": true,
|
|
"columnFilters": false
|
|
},
|
|
"transformations": [
|
|
{
|
|
"id": "labelsToFields",
|
|
"options": {}
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"time": {
|
|
"from": "now-12h",
|
|
"to": "now"
|
|
},
|
|
"annotations": {
|
|
"list": []
|
|
},
|
|
"schemaVersion": 39,
|
|
"style": "dark",
|
|
"tags": [
|
|
"atlas",
|
|
"gpu"
|
|
],
|
|
"templating": {
|
|
"list": [
|
|
{
|
|
"name": "namespace_scope_cpu",
|
|
"label": "CPU namespace filter",
|
|
"type": "custom",
|
|
"query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
|
|
"current": {
|
|
"text": "workload namespaces only",
|
|
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
|
|
"selected": true
|
|
},
|
|
"options": [
|
|
{
|
|
"text": "workload namespaces only",
|
|
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
|
|
"selected": true
|
|
},
|
|
{
|
|
"text": "all namespaces",
|
|
"value": "namespace=~\".*\"",
|
|
"selected": false
|
|
},
|
|
{
|
|
"text": "infrastructure namespaces only",
|
|
"value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
|
|
"selected": false
|
|
}
|
|
],
|
|
"hide": 2,
|
|
"multi": false,
|
|
"includeAll": false,
|
|
"refresh": 1,
|
|
"sort": 0,
|
|
"skipUrlSync": false
|
|
},
|
|
{
|
|
"name": "namespace_scope_gpu",
|
|
"label": "GPU namespace filter",
|
|
"type": "custom",
|
|
"query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
|
|
"current": {
|
|
"text": "workload namespaces only",
|
|
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
|
|
"selected": true
|
|
},
|
|
"options": [
|
|
{
|
|
"text": "workload namespaces only",
|
|
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
|
|
"selected": true
|
|
},
|
|
{
|
|
"text": "all namespaces",
|
|
"value": "namespace=~\".*\"",
|
|
"selected": false
|
|
},
|
|
{
|
|
"text": "infrastructure namespaces only",
|
|
"value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
|
|
"selected": false
|
|
}
|
|
],
|
|
"hide": 2,
|
|
"multi": false,
|
|
"includeAll": false,
|
|
"refresh": 1,
|
|
"sort": 0,
|
|
"skipUrlSync": false
|
|
},
|
|
{
|
|
"name": "namespace_scope_ram",
|
|
"label": "RAM namespace filter",
|
|
"type": "custom",
|
|
"query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
|
|
"current": {
|
|
"text": "workload namespaces only",
|
|
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
|
|
"selected": true
|
|
},
|
|
"options": [
|
|
{
|
|
"text": "workload namespaces only",
|
|
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
|
|
"selected": true
|
|
},
|
|
{
|
|
"text": "all namespaces",
|
|
"value": "namespace=~\".*\"",
|
|
"selected": false
|
|
},
|
|
{
|
|
"text": "infrastructure namespaces only",
|
|
"value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
|
|
"selected": false
|
|
}
|
|
],
|
|
"hide": 2,
|
|
"multi": false,
|
|
"includeAll": false,
|
|
"refresh": 1,
|
|
"sort": 0,
|
|
"skipUrlSync": false
|
|
}
|
|
]
|
|
}
|
|
}
|