2025-12-02 13:16:00 -03:00
{
"uid" : "atlas-gpu" ,
"title" : "Atlas GPU" ,
"folderUid" : "atlas-internal" ,
"editable" : true ,
"panels" : [
{
"id" : 1 ,
"type" : "piechart" ,
2026-05-21 15:26:02 -03:00
"title" : "Namespace GPU Utilization" ,
2025-12-02 13:16:00 -03:00
"datasource" : {
"type" : "prometheus" ,
"uid" : "atlas-vm"
} ,
"gridPos" : {
"h" : 8 ,
"w" : 12 ,
"x" : 0 ,
"y" : 0
} ,
"targets" : [
{
2026-05-21 15:26:02 -03:00
"expr" : "(100 * ((sum by (namespace) (avg_over_time(((max by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) == 1))[$__range:$__interval]))) or (label_replace(sum(avg_over_time(((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) > 1))[$__range:$__interval])), \"namespace\", \"shared\", \"\", \"\"))) / clamp_min((sum((sum by (namespace) (avg_over_time(((max by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) == 1))[$__range:$__interval]))) or (label_replace(sum(avg_over_time(((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) and on(UUID) (count by (UUID) (count by (UUID,namespace) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu})) > 1))[$__range:$__interval])), \"namespace\", \"shared\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(max_over_time((max by (UUID) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\",namespace!=\"\",$namespace_scope_gpu}))[$__range:$__interval])) or on() vector(0)) == 0))" ,
2025-12-02 13:16:00 -03:00
"refId" : "A" ,
"legendFormat" : "{{namespace}}"
}
] ,
"fieldConfig" : {
"defaults" : {
"unit" : "percent" ,
"color" : {
"mode" : "palette-classic"
}
} ,
"overrides" : [ ]
} ,
"options" : {
"legend" : {
"displayMode" : "list" ,
"placement" : "right"
} ,
"pieType" : "pie" ,
2025-12-12 20:40:32 -03:00
"displayLabels" : [ ] ,
2025-12-02 13:16:00 -03:00
"tooltip" : {
"mode" : "single"
} ,
"colorScheme" : "interpolateSpectral" ,
"colorBy" : "value" ,
"reduceOptions" : {
"calcs" : [
"lastNotNull"
] ,
"fields" : "" ,
"values" : false
}
2026-01-01 14:44:33 -03:00
} ,
"links" : [
{
"title" : "Workload namespaces only" ,
2026-01-18 02:50:07 -03:00
"url" : "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}" ,
2026-01-01 14:44:33 -03:00
"targetBlank" : false
} ,
{
"title" : "All namespaces" ,
"url" : "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22.%2A%22&var-namespace_scope_ram=${namespace_scope_ram}" ,
"targetBlank" : false
} ,
{
"title" : "Infrastructure namespaces only" ,
2026-01-18 02:50:07 -03:00
"url" : "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-.%2A%7C.%2A-system%7Ctraefik%7Cmonitoring%7Clogging%7Ccert-manager%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}" ,
2026-01-01 14:44:33 -03:00
"targetBlank" : false
}
] ,
2026-05-21 15:26:02 -03:00
"description" : "Shares are based on measured DCGM GPU utilization. If multiple namespaces share one physical GPU and DCGM cannot attribute work safely, activity is grouped as shared. Idle appears only when utilization is zero."
2025-12-02 13:16:00 -03:00
} ,
{
"id" : 2 ,
"type" : "timeseries" ,
2026-05-21 13:04:26 -03:00
"title" : "GPU Activity by Reservation" ,
2025-12-02 13:16:00 -03:00
"datasource" : {
"type" : "prometheus" ,
"uid" : "atlas-vm"
} ,
"gridPos" : {
"h" : 8 ,
"w" : 12 ,
"x" : 12 ,
"y" : 0
} ,
"targets" : [
{
2026-05-16 05:58:59 -03:00
"expr" : "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia(_com_|[.]com/)gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (max by (node) (kube_node_status_allocatable{resource=~\"nvidia(_com_|[.]com/)gpu.*\"} > bool 0)))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))" ,
2025-12-02 13:16:00 -03:00
"refId" : "A" ,
"legendFormat" : "{{namespace}}"
}
] ,
"fieldConfig" : {
"defaults" : {
"unit" : "percent"
} ,
"overrides" : [ ]
} ,
"options" : {
"legend" : {
"displayMode" : "table" ,
"placement" : "right"
} ,
"tooltip" : {
"mode" : "multi"
}
2026-05-21 13:04:26 -03:00
} ,
"description" : "Node/device GPU activity attributed by each namespace's GPU reservation on that node."
2025-12-02 13:16:00 -03:00
} ,
{
"id" : 3 ,
"type" : "timeseries" ,
2025-12-02 14:41:39 -03:00
"title" : "GPU Util by Node" ,
2025-12-02 13:16:00 -03:00
"datasource" : {
"type" : "prometheus" ,
"uid" : "atlas-vm"
} ,
"gridPos" : {
"h" : 8 ,
"w" : 12 ,
"x" : 0 ,
"y" : 8
} ,
"targets" : [
{
2026-01-27 21:43:37 -03:00
"expr" : "label_replace(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\")" ,
2025-12-02 13:16:00 -03:00
"refId" : "A" ,
"legendFormat" : "{{Hostname}}"
}
] ,
"fieldConfig" : {
"defaults" : {
"unit" : "percent"
} ,
"overrides" : [ ]
} ,
"options" : {
"legend" : {
"displayMode" : "table" ,
"placement" : "right"
} ,
"tooltip" : {
"mode" : "multi"
}
}
} ,
{
"id" : 4 ,
"type" : "table" ,
2026-05-21 13:04:26 -03:00
"title" : "GPU Pods Reporting Device Util" ,
2025-12-02 13:16:00 -03:00
"datasource" : {
"type" : "prometheus" ,
"uid" : "atlas-vm"
} ,
"gridPos" : {
"h" : 8 ,
"w" : 12 ,
"x" : 12 ,
"y" : 8
} ,
"targets" : [
{
"expr" : "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))" ,
"refId" : "A"
}
] ,
"fieldConfig" : {
"defaults" : {
2025-12-13 18:23:19 -03:00
"unit" : "percent" ,
"custom" : {
"filterable" : true
}
2025-12-02 13:16:00 -03:00
} ,
"overrides" : [ ]
} ,
"options" : {
2025-12-13 18:23:19 -03:00
"showHeader" : true ,
"columnFilters" : false
2025-12-02 13:16:00 -03:00
} ,
"transformations" : [
{
"id" : "labelsToFields" ,
"options" : { }
}
2026-05-21 13:04:26 -03:00
] ,
"description" : "DCGM labels the device utilization sample with GPU-consuming pods; shared-GPU pods can report the same device value."
2025-12-02 13:16:00 -03:00
}
] ,
"time" : {
"from" : "now-12h" ,
"to" : "now"
} ,
"annotations" : {
"list" : [ ]
} ,
"schemaVersion" : 39 ,
"style" : "dark" ,
"tags" : [
"atlas" ,
"gpu"
2026-01-01 14:16:08 -03:00
] ,
"templating" : {
"list" : [
{
2026-01-01 14:44:33 -03:00
"name" : "namespace_scope_cpu" ,
"label" : "CPU namespace filter" ,
"type" : "custom" ,
2026-01-18 02:50:07 -03:00
"query" : "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"current" : {
"text" : "workload namespaces only" ,
2026-01-18 02:50:07 -03:00
"value" : "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"selected" : true
} ,
"options" : [
{
"text" : "workload namespaces only" ,
2026-01-18 02:50:07 -03:00
"value" : "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"selected" : true
} ,
{
"text" : "all namespaces" ,
"value" : "namespace=~\".*\"" ,
"selected" : false
} ,
{
"text" : "infrastructure namespaces only" ,
2026-01-18 02:50:07 -03:00
"value" : "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"selected" : false
}
] ,
"hide" : 2 ,
"multi" : false ,
"includeAll" : false ,
"refresh" : 1 ,
"sort" : 0 ,
"skipUrlSync" : false
} ,
{
"name" : "namespace_scope_gpu" ,
"label" : "GPU namespace filter" ,
"type" : "custom" ,
2026-01-18 02:50:07 -03:00
"query" : "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"current" : {
"text" : "workload namespaces only" ,
2026-01-18 02:50:07 -03:00
"value" : "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"selected" : true
} ,
"options" : [
{
"text" : "workload namespaces only" ,
2026-01-18 02:50:07 -03:00
"value" : "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"selected" : true
} ,
{
"text" : "all namespaces" ,
"value" : "namespace=~\".*\"" ,
"selected" : false
} ,
{
"text" : "infrastructure namespaces only" ,
2026-01-18 02:50:07 -03:00
"value" : "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"selected" : false
}
] ,
"hide" : 2 ,
"multi" : false ,
"includeAll" : false ,
"refresh" : 1 ,
"sort" : 0 ,
"skipUrlSync" : false
} ,
{
"name" : "namespace_scope_ram" ,
"label" : "RAM namespace filter" ,
2026-01-01 14:16:08 -03:00
"type" : "custom" ,
2026-01-18 02:50:07 -03:00
"query" : "workload namespaces only : namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:16:08 -03:00
"current" : {
"text" : "workload namespaces only" ,
2026-01-18 02:50:07 -03:00
"value" : "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:16:08 -03:00
"selected" : true
} ,
"options" : [
{
"text" : "workload namespaces only" ,
2026-01-18 02:50:07 -03:00
"value" : "namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:16:08 -03:00
"selected" : true
} ,
{
"text" : "all namespaces" ,
"value" : "namespace=~\".*\"" ,
"selected" : false
} ,
{
"text" : "infrastructure namespaces only" ,
2026-01-18 02:50:07 -03:00
"value" : "namespace=~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"" ,
2026-01-01 14:16:08 -03:00
"selected" : false
}
] ,
2026-01-01 14:44:33 -03:00
"hide" : 2 ,
2026-01-01 14:16:08 -03:00
"multi" : false ,
"includeAll" : false ,
"refresh" : 1 ,
"sort" : 0 ,
"skipUrlSync" : false
}
]
}
2025-12-02 13:16:00 -03:00
}