titan-iac/services/monitoring/grafana-dashboard-gpu.yaml

320 lines
14 KiB
YAML

# services/monitoring/grafana-dashboard-gpu.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-gpu
labels:
grafana_dashboard: "1"
data:
atlas-gpu.json: |
{
"uid": "atlas-gpu",
"title": "Atlas GPU",
"folderUid": "atlas-internal",
"editable": true,
"panels": [
{
"id": 1,
"type": "piechart",
"title": "Namespace GPU Share",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == 0))",
"refId": "A",
"legendFormat": "{{namespace}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"displayLabels": [],
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"links": [
{
"title": "Workload namespaces only",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
},
{
"title": "All namespaces",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22.%2A%22&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
},
{
"title": "Infrastructure namespaces only",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
}
],
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator."
},
{
"id": 2,
"type": "timeseries",
"title": "GPU Util by Namespace",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"targets": [
{
"expr": "sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))",
"refId": "A",
"legendFormat": "{{namespace}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 3,
"type": "timeseries",
"title": "GPU Util by Node",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"targets": [
{
"expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})",
"refId": "A",
"legendFormat": "{{Hostname}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 4,
"type": "table",
"title": "Top Pods by GPU Util",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
},
"targets": [
{
"expr": "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"custom": {
"filterable": true
}
},
"overrides": []
},
"options": {
"showHeader": true,
"columnFilters": false
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
}
]
}
],
"time": {
"from": "now-12h",
"to": "now"
},
"annotations": {
"list": []
},
"schemaVersion": 39,
"style": "dark",
"tags": [
"atlas",
"gpu"
],
"templating": {
"list": [
{
"name": "namespace_scope_cpu",
"label": "CPU namespace filter",
"type": "custom",
"query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"current": {
"text": "workload namespaces only",
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": true
},
"options": [
{
"text": "workload namespaces only",
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": true
},
{
"text": "all namespaces",
"value": "namespace=~\".*\"",
"selected": false
},
{
"text": "infrastructure namespaces only",
"value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": false
}
],
"hide": 2,
"multi": false,
"includeAll": false,
"refresh": 1,
"sort": 0,
"skipUrlSync": false
},
{
"name": "namespace_scope_gpu",
"label": "GPU namespace filter",
"type": "custom",
"query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"current": {
"text": "workload namespaces only",
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": true
},
"options": [
{
"text": "workload namespaces only",
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": true
},
{
"text": "all namespaces",
"value": "namespace=~\".*\"",
"selected": false
},
{
"text": "infrastructure namespaces only",
"value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": false
}
],
"hide": 2,
"multi": false,
"includeAll": false,
"refresh": 1,
"sort": 0,
"skipUrlSync": false
},
{
"name": "namespace_scope_ram",
"label": "RAM namespace filter",
"type": "custom",
"query": "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"current": {
"text": "workload namespaces only",
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": true
},
"options": [
{
"text": "workload namespaces only",
"value": "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": true
},
{
"text": "all namespaces",
"value": "namespace=~\".*\"",
"selected": false
},
{
"text": "infrastructure namespaces only",
"value": "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"",
"selected": false
}
],
"hide": 2,
"multi": false,
"includeAll": false,
"refresh": 1,
"sort": 0,
"skipUrlSync": false
}
]
}
}