2025-12-02 13:16:00 -03:00
{
"uid" : "atlas-gpu" ,
"title" : "Atlas GPU" ,
"folderUid" : "atlas-internal" ,
"editable" : true ,
"panels" : [
{
"id" : 1 ,
"type" : "piechart" ,
2025-12-02 14:41:39 -03:00
"title" : "Namespace GPU Share" ,
2025-12-02 13:16:00 -03:00
"datasource" : {
"type" : "prometheus" ,
"uid" : "atlas-vm"
} ,
"gridPos" : {
"h" : 8 ,
"w" : 12 ,
"x" : 0 ,
"y" : 0
} ,
"targets" : [
{
2026-01-01 14:44:33 -03:00
"expr" : "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" ,
2025-12-02 13:16:00 -03:00
"refId" : "A" ,
"legendFormat" : "{{namespace}}"
}
] ,
"fieldConfig" : {
"defaults" : {
"unit" : "percent" ,
"color" : {
"mode" : "palette-classic"
}
} ,
"overrides" : [ ]
} ,
"options" : {
"legend" : {
"displayMode" : "list" ,
"placement" : "right"
} ,
"pieType" : "pie" ,
2025-12-12 20:40:32 -03:00
"displayLabels" : [ ] ,
2025-12-02 13:16:00 -03:00
"tooltip" : {
"mode" : "single"
} ,
"colorScheme" : "interpolateSpectral" ,
"colorBy" : "value" ,
"reduceOptions" : {
"calcs" : [
"lastNotNull"
] ,
"fields" : "" ,
"values" : false
}
2026-01-01 14:44:33 -03:00
} ,
"links" : [
{
"title" : "Workload namespaces only" ,
2026-01-11 23:55:43 -03:00
"url" : "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}" ,
2026-01-01 14:44:33 -03:00
"targetBlank" : false
} ,
{
"title" : "All namespaces" ,
"url" : "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22.%2A%22&var-namespace_scope_ram=${namespace_scope_ram}" ,
"targetBlank" : false
} ,
{
"title" : "Infrastructure namespaces only" ,
2026-01-11 23:55:43 -03:00
"url" : "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%5E%28kube-system%7Clonghorn-system%7Cmetallb-system%7Cmonitoring%7Clogging%7Cflux-system%7Ctraefik%7Cmaintenance%7Cpostgres%29%24%22&var-namespace_scope_ram=${namespace_scope_ram}" ,
2026-01-01 14:44:33 -03:00
"targetBlank" : false
}
] ,
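2026-01-05 13:30:33 -03:00
"description" : "Each slice is a namespace's share (percent) of the summed GPU utilization within the selected namespace scope; when that total is zero, a single \"idle\" slice at 100% is shown. Use the panel links to switch scope."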
2025-12-02 13:16:00 -03:00
} ,
{
"id" : 2 ,
"type" : "timeseries" ,
2025-12-02 14:41:39 -03:00
"title" : "GPU Util by Namespace" ,
2025-12-02 13:16:00 -03:00
"datasource" : {
"type" : "prometheus" ,
"uid" : "atlas-vm"
} ,
"gridPos" : {
"h" : 8 ,
"w" : 12 ,
"x" : 12 ,
"y" : 0
} ,
"targets" : [
{
2026-01-01 14:44:33 -03:00
"expr" : "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)" ,
2025-12-02 13:16:00 -03:00
"refId" : "A" ,
"legendFormat" : "{{namespace}}"
}
] ,
"fieldConfig" : {
"defaults" : {
"unit" : "percent"
} ,
"overrides" : [ ]
} ,
"options" : {
"legend" : {
"displayMode" : "table" ,
"placement" : "right"
} ,
"tooltip" : {
"mode" : "multi"
}
}
} ,
{
"id" : 3 ,
"type" : "timeseries" ,
2025-12-02 14:41:39 -03:00
"title" : "GPU Util by Node" ,
2025-12-02 13:16:00 -03:00
"datasource" : {
"type" : "prometheus" ,
"uid" : "atlas-vm"
} ,
"gridPos" : {
"h" : 8 ,
"w" : 12 ,
"x" : 0 ,
"y" : 8
} ,
"targets" : [
{
"expr" : "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})" ,
"refId" : "A" ,
"legendFormat" : "{{Hostname}}"
}
] ,
"fieldConfig" : {
"defaults" : {
"unit" : "percent"
} ,
"overrides" : [ ]
} ,
"options" : {
"legend" : {
"displayMode" : "table" ,
"placement" : "right"
} ,
"tooltip" : {
"mode" : "multi"
}
}
} ,
{
"id" : 4 ,
"type" : "table" ,
2025-12-02 14:41:39 -03:00
"title" : "Top Pods by GPU Util" ,
2025-12-02 13:16:00 -03:00
"datasource" : {
"type" : "prometheus" ,
"uid" : "atlas-vm"
} ,
"gridPos" : {
"h" : 8 ,
"w" : 12 ,
"x" : 12 ,
"y" : 8
} ,
"targets" : [
{
"expr" : "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))" ,
"refId" : "A"
}
] ,
"fieldConfig" : {
"defaults" : {
2025-12-13 18:23:19 -03:00
"unit" : "percent" ,
"custom" : {
"filterable" : true
}
2025-12-02 13:16:00 -03:00
} ,
"overrides" : [ ]
} ,
"options" : {
2025-12-13 18:23:19 -03:00
"showHeader" : true ,
"columnFilters" : false
2025-12-02 13:16:00 -03:00
} ,
"transformations" : [
{
"id" : "labelsToFields" ,
"options" : { }
}
]
}
] ,
"time" : {
"from" : "now-12h" ,
"to" : "now"
} ,
"annotations" : {
"list" : [ ]
} ,
"schemaVersion" : 39 ,
"style" : "dark" ,
"tags" : [
"atlas" ,
"gpu"
2026-01-01 14:16:08 -03:00
] ,
"templating" : {
"list" : [
{
2026-01-01 14:44:33 -03:00
"name" : "namespace_scope_cpu" ,
"label" : "CPU namespace filter" ,
"type" : "custom" ,
2026-01-11 23:55:43 -03:00
"query" : "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|flux-system|traefik|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"current" : {
"text" : "workload namespaces only" ,
2026-01-11 23:55:43 -03:00
"value" : "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|flux-system|traefik|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"selected" : true
} ,
"options" : [
{
"text" : "workload namespaces only" ,
2026-01-11 23:55:43 -03:00
"value" : "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|flux-system|traefik|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"selected" : true
} ,
{
"text" : "all namespaces" ,
"value" : "namespace=~\".*\"" ,
"selected" : false
} ,
{
"text" : "infrastructure namespaces only" ,
2026-01-11 23:55:43 -03:00
"value" : "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|flux-system|traefik|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"selected" : false
}
] ,
"hide" : 2 ,
"multi" : false ,
"includeAll" : false ,
"refresh" : 1 ,
"sort" : 0 ,
"skipUrlSync" : false
} ,
{
"name" : "namespace_scope_gpu" ,
"label" : "GPU namespace filter" ,
"type" : "custom" ,
2026-01-11 23:55:43 -03:00
"query" : "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|flux-system|traefik|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"current" : {
"text" : "workload namespaces only" ,
2026-01-11 23:55:43 -03:00
"value" : "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|flux-system|traefik|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"selected" : true
} ,
"options" : [
{
"text" : "workload namespaces only" ,
2026-01-11 23:55:43 -03:00
"value" : "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|flux-system|traefik|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"selected" : true
} ,
{
"text" : "all namespaces" ,
"value" : "namespace=~\".*\"" ,
"selected" : false
} ,
{
"text" : "infrastructure namespaces only" ,
2026-01-11 23:55:43 -03:00
"value" : "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|flux-system|traefik|maintenance|postgres)$\"" ,
2026-01-01 14:44:33 -03:00
"selected" : false
}
] ,
"hide" : 2 ,
"multi" : false ,
"includeAll" : false ,
"refresh" : 1 ,
"sort" : 0 ,
"skipUrlSync" : false
} ,
{
"name" : "namespace_scope_ram" ,
"label" : "RAM namespace filter" ,
2026-01-01 14:16:08 -03:00
"type" : "custom" ,
2026-01-11 23:55:43 -03:00
"query" : "workload namespaces only : namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|flux-system|traefik|maintenance|postgres)$\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|flux-system|traefik|maintenance|postgres)$\"" ,
2026-01-01 14:16:08 -03:00
"current" : {
"text" : "workload namespaces only" ,
2026-01-11 23:55:43 -03:00
"value" : "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|flux-system|traefik|maintenance|postgres)$\"" ,
2026-01-01 14:16:08 -03:00
"selected" : true
} ,
"options" : [
{
"text" : "workload namespaces only" ,
2026-01-11 23:55:43 -03:00
"value" : "namespace!~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|flux-system|traefik|maintenance|postgres)$\"" ,
2026-01-01 14:16:08 -03:00
"selected" : true
} ,
{
"text" : "all namespaces" ,
"value" : "namespace=~\".*\"" ,
"selected" : false
} ,
{
"text" : "infrastructure namespaces only" ,
2026-01-11 23:55:43 -03:00
"value" : "namespace=~\"^(kube-system|longhorn-system|metallb-system|monitoring|logging|flux-system|traefik|maintenance|postgres)$\"" ,
2026-01-01 14:16:08 -03:00
"selected" : false
}
] ,
2026-01-01 14:44:33 -03:00
"hide" : 2 ,
2026-01-01 14:16:08 -03:00
"multi" : false ,
"includeAll" : false ,
"refresh" : 1 ,
"sort" : 0 ,
"skipUrlSync" : false
}
]
}
2025-12-02 13:16:00 -03:00
}