2025-12-02 13:16:00 -03:00
{
"uid" : "atlas-gpu" ,
"title" : "Atlas GPU" ,
"folderUid" : "atlas-internal" ,
"editable" : true ,
"panels" : [
{
"id" : 1 ,
"type" : "piechart" ,
2026-05-21 15:26:02 -03:00
"title" : "Namespace GPU Utilization" ,
2025-12-02 13:16:00 -03:00
"datasource" : {
"type" : "prometheus" ,
"uid" : "atlas-vm"
} ,
"gridPos" : {
"h" : 8 ,
"w" : 12 ,
"x" : 0 ,
"y" : 0
} ,
"targets" : [
{
2026-05-22 03:22:57 -03:00
"expr" : "(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or (label_replace(clamp_min(100 - (sum(100 * ((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg 
by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)) or on() vector(0)), 0), \"namespace\", \"unused\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) > 0)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) == 0))" ,
2025-12-02 13:16:00 -03:00
"refId" : "A" ,
2026-05-22 02:55:24 -03:00
"legendFormat" : "{{namespace}}" ,
"instant" : true
2025-12-02 13:16:00 -03:00
}
] ,
"fieldConfig" : {
"defaults" : {
"unit" : "percent" ,
"color" : {
"mode" : "palette-classic"
}
} ,
"overrides" : [ ]
} ,
"options" : {
"legend" : {
"displayMode" : "list" ,
"placement" : "right"
} ,
"pieType" : "pie" ,
2025-12-12 20:40:32 -03:00
"displayLabels" : [ ] ,
2025-12-02 13:16:00 -03:00
"tooltip" : {
"mode" : "single"
} ,
"colorScheme" : "interpolateSpectral" ,
"colorBy" : "value" ,
"reduceOptions" : {
"calcs" : [
"lastNotNull"
] ,
"fields" : "" ,
"values" : false
}
2026-01-01 14:44:33 -03:00
} ,
"links" : [
{
"title" : "Workload namespaces only" ,
2026-01-18 02:50:07 -03:00
"url" : "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}" ,
2026-01-01 14:44:33 -03:00
"targetBlank" : false
} ,
{
"title" : "All namespaces" ,
"url" : "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22.%2A%22&var-namespace_scope_ram=${namespace_scope_ram}" ,
"targetBlank" : false
} ,
{
"title" : "Infrastructure namespaces only" ,
2026-01-18 02:50:07 -03:00
"url" : "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}" ,
2026-01-01 14:44:33 -03:00
"targetBlank" : false
}
] ,
2026-05-22 03:22:57 -03:00
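"description" : "Instant GPU utilization normalized to the monitored GPU pool. Process-level namespace slices appear where available; unattributed covers GPU nodes without process attribution; unused is the share of the pool not covered by the other slices; idle replaces all slices when no utilization is reported."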
2025-12-02 13:16:00 -03:00
} ,
{
"id" : 2 ,
"type" : "timeseries" ,
2026-05-22 02:28:08 -03:00
"title" : "GPU Process Util by Namespace" ,
2025-12-02 13:16:00 -03:00
"datasource" : {
"type" : "prometheus" ,
"uid" : "atlas-vm"
} ,
"gridPos" : {
"h" : 8 ,
"w" : 12 ,
"x" : 12 ,
"y" : 0
} ,
"targets" : [
{
2026-05-22 02:35:08 -03:00
"expr" : "((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)" ,
2025-12-02 13:16:00 -03:00
"refId" : "A" ,
"legendFormat" : "{{namespace}}"
}
] ,
"fieldConfig" : {
"defaults" : {
"unit" : "percent"
} ,
"overrides" : [ ]
} ,
"options" : {
"legend" : {
"displayMode" : "table" ,
"placement" : "right"
} ,
"tooltip" : {
"mode" : "multi"
}
2026-05-21 13:04:26 -03:00
} ,
2026-05-22 02:28:08 -03:00
"description" : "NVML process-level SM utilization by namespace. Host covers GPU work outside Kubernetes pods."
2025-12-02 13:16:00 -03:00
} ,
{
"id" : 3 ,
"type" : "timeseries" ,
2025-12-02 14:41:39 -03:00
"title" : "GPU Util by Node" ,
2025-12-02 13:16:00 -03:00
"datasource" : {
"type" : "prometheus" ,
"uid" : "atlas-vm"
} ,
"gridPos" : {
"h" : 8 ,
"w" : 12 ,
"x" : 0 ,
"y" : 8
} ,
"targets" : [
{
2026-01-27 21:43:37 -03:00
"expr" : "label_replace(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\")" ,
2025-12-02 13:16:00 -03:00
"refId" : "A" ,
"legendFormat" : "{{Hostname}}"
}
] ,
"fieldConfig" : {
"defaults" : {
"unit" : "percent"
} ,
"overrides" : [ ]
} ,
"options" : {
"legend" : {
"displayMode" : "table" ,
"placement" : "right"
} ,
"tooltip" : {
"mode" : "multi"
}
}
} ,
{
"id" : 4 ,
"type" : "table" ,
2026-05-21 13:04:26 -03:00
"title" : "GPU Pods Reporting Device Util" ,
2025-12-02 13:16:00 -03:00
"datasource" : {
"type" : "prometheus" ,
"uid" : "atlas-vm"
} ,
"gridPos" : {
"h" : 8 ,
"w" : 12 ,
"x" : 12 ,
"y" : 8
} ,
"targets" : [
{
"expr" : "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))" ,
"refId" : "A"
}
] ,
"fieldConfig" : {
"defaults" : {
2025-12-13 18:23:19 -03:00
"unit" : "percent" ,
"custom" : {
"filterable" : true
}
2025-12-02 13:16:00 -03:00
} ,
"overrides" : [ ]
} ,
"options" : {
2025-12-13 18:23:19 -03:00
"showHeader" : true ,
"columnFilters" : false
2025-12-02 13:16:00 -03:00
} ,
"transformations" : [
{
"id" : "labelsToFields" ,
"options" : { }
}
2026-05-21 13:04:26 -03:00
] ,
2026-05-22 01:55:25 -03:00
"description" : "DCGM labels the device utilization sample with GPU-consuming pods; multiple pods on one device can report the same value."
2026-05-22 03:08:27 -03:00
} ,
{
"id" : 5 ,
"type" : "stat" ,
"title" : "GPU Pool Used" ,
"datasource" : {
"type" : "prometheus" ,
"uid" : "atlas-vm"
} ,
"gridPos" : {
"h" : 3 ,
"w" : 6 ,
"x" : 0 ,
"y" : 16
} ,
"targets" : [
{
2026-05-22 03:22:57 -03:00
"expr" : "100 * (sum((((sum by (namespace) (nvidia_namespace_gpu_sm_util_percent{namespace!=\"\",pod!=\"\",$namespace_scope_gpu})) > 0)) or (label_replace(((sum(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)) > 0), \"namespace\", \"unattributed\", \"\", \"\"))) or on() vector(0)) / clamp_min((100 * ((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))), 1)" ,
2026-05-22 03:08:27 -03:00
"refId" : "A" ,
"instant" : true
}
] ,
"fieldConfig" : {
"defaults" : {
"color" : {
"mode" : "thresholds"
} ,
"mappings" : [ ] ,
"thresholds" : {
"mode" : "absolute" ,
"steps" : [
{
"color" : "dark-green" ,
"value" : null
} ,
{
"color" : "dark-yellow" ,
"value" : 50
} ,
{
"color" : "dark-orange" ,
"value" : 75
} ,
{
"color" : "dark-red" ,
"value" : 91.5
}
]
} ,
"unit" : "percent" ,
"custom" : {
"displayMode" : "auto"
} ,
"decimals" : 1
} ,
"overrides" : [ ]
} ,
"options" : {
"colorMode" : "value" ,
"graphMode" : "area" ,
"justifyMode" : "center" ,
"reduceOptions" : {
"calcs" : [
"lastNotNull"
] ,
"fields" : "" ,
"values" : false
} ,
"textMode" : "value"
} ,
2026-05-22 03:22:57 -03:00
"description" : "Current GPU utilization across the monitored GPU pool, averaged over the GPU devices (or GPU nodes without per-device metrics) that are currently reporting telemetry."
2026-05-22 03:08:27 -03:00
} ,
{
"id" : 6 ,
"type" : "stat" ,
"title" : "GPU Active Devices" ,
"datasource" : {
"type" : "prometheus" ,
"uid" : "atlas-vm"
} ,
"gridPos" : {
"h" : 3 ,
"w" : 6 ,
"x" : 6 ,
"y" : 16
} ,
"targets" : [
{
2026-05-22 03:22:57 -03:00
"expr" : "((sum((max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) > bool 0) or on() vector(0)) + (sum((((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) > bool 0) or on() vector(0)))" ,
2026-05-22 03:08:27 -03:00
"refId" : "A" ,
"legendFormat" : "active" ,
"instant" : true
} ,
{
2026-05-22 03:22:57 -03:00
"expr" : "((count(max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))) or on() vector(0)) + (count(((avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})) unless on(node) (max by (node,gpu,uuid,model) (last_over_time(nvidia_gpu_device_utilization_percent[5m]))))) or on() vector(0)))" ,
2026-05-22 03:08:27 -03:00
"refId" : "B" ,
"legendFormat" : "total" ,
"instant" : true
}
] ,
"fieldConfig" : {
"defaults" : {
"color" : {
"mode" : "thresholds"
} ,
"mappings" : [ ] ,
"thresholds" : {
"mode" : "absolute" ,
"steps" : [
{
"color" : "rgba(115, 115, 115, 1)" ,
"value" : null
} ,
{
"color" : "dark-green" ,
"value" : 1
}
]
} ,
"unit" : "none" ,
"custom" : {
"displayMode" : "auto"
} ,
"decimals" : 0
} ,
"overrides" : [ ]
} ,
"options" : {
"colorMode" : "value" ,
"graphMode" : "area" ,
"justifyMode" : "center" ,
"reduceOptions" : {
"calcs" : [
"lastNotNull"
] ,
"fields" : "" ,
"values" : false
} ,
"textMode" : "name_and_value"
} ,
2026-05-22 03:22:57 -03:00
"description" : "Active GPU devices compared with total monitored GPU devices."
2025-12-02 13:16:00 -03:00
}
] ,
"time" : {
"from" : "now-12h" ,
"to" : "now"
} ,
"annotations" : {
"list" : [ ]
} ,
"schemaVersion" : 39 ,
"style" : "dark" ,
"tags" : [
"atlas" ,
"gpu"
2026-01-01 14:16:08 -03:00
] ,
"templating" : {
"list" : [
{
2026-01-01 14:44:33 -03:00
"name" : "namespace_scope_cpu" ,
"label" : "CPU namespace filter" ,
"type" : "custom" ,
2026-01-18 02:50:07 -03:00
"query" : "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"current" : {
"text" : "workload namespaces only" ,
2026-01-18 02:50:07 -03:00
"value" : "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"selected" : true
} ,
"options" : [
{
"text" : "workload namespaces only" ,
2026-01-18 02:50:07 -03:00
"value" : "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"selected" : true
} ,
{
"text" : "all namespaces" ,
"value" : "namespace=~\".*\"" ,
"selected" : false
} ,
{
"text" : "infrastructure namespaces only" ,
2026-01-18 02:50:07 -03:00
"value" : "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"selected" : false
}
] ,
"hide" : 2 ,
"multi" : false ,
"includeAll" : false ,
"refresh" : 1 ,
"sort" : 0 ,
"skipUrlSync" : false
} ,
{
"name" : "namespace_scope_gpu" ,
"label" : "GPU namespace filter" ,
"type" : "custom" ,
2026-01-18 02:50:07 -03:00
"query" : "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"current" : {
"text" : "workload namespaces only" ,
2026-01-18 02:50:07 -03:00
"value" : "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"selected" : true
} ,
"options" : [
{
"text" : "workload namespaces only" ,
2026-01-18 02:50:07 -03:00
"value" : "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"selected" : true
} ,
{
"text" : "all namespaces" ,
"value" : "namespace=~\".*\"" ,
"selected" : false
} ,
{
"text" : "infrastructure namespaces only" ,
2026-01-18 02:50:07 -03:00
"value" : "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"selected" : false
}
] ,
"hide" : 2 ,
"multi" : false ,
"includeAll" : false ,
"refresh" : 1 ,
"sort" : 0 ,
"skipUrlSync" : false
} ,
{
"name" : "namespace_scope_ram" ,
"label" : "RAM namespace filter" ,
2026-01-01 14:16:08 -03:00
"type" : "custom" ,
2026-01-18 02:50:07 -03:00
"query" : "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:16:08 -03:00
"current" : {
"text" : "workload namespaces only" ,
2026-01-18 02:50:07 -03:00
"value" : "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:16:08 -03:00
"selected" : true
} ,
"options" : [
{
"text" : "workload namespaces only" ,
2026-01-18 02:50:07 -03:00
"value" : "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:16:08 -03:00
"selected" : true
} ,
{
"text" : "all namespaces" ,
"value" : "namespace=~\".*\"" ,
"selected" : false
} ,
{
"text" : "infrastructure namespaces only" ,
2026-01-18 02:50:07 -03:00
"value" : "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:16:08 -03:00
"selected" : false
}
] ,
2026-01-01 14:44:33 -03:00
"hide" : 2 ,
2026-01-01 14:16:08 -03:00
"multi" : false ,
"includeAll" : false ,
"refresh" : 1 ,
"sort" : 0 ,
"skipUrlSync" : false
}
]
}
2025-12-02 13:16:00 -03:00
}