# services/monitoring/grafana-dashboard-pods.yaml
#
# ConfigMap that carries the "Atlas Pods" Grafana dashboard as an embedded
# JSON document stored under the `atlas-pods.json` data key.
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-pods
  labels:
    # Quoted so the value stays the string "1" rather than an integer.
    # NOTE(review): presumably the label a Grafana dashboard sidecar selects
    # on; confirm against the sidecar configuration.
    grafana_dashboard: "1"
data:
  # Literal block scalar: the JSON document that follows must be indented
  # deeper than this key to remain part of the value.
  atlas-pods.json: |
{
  "uid": "atlas-pods",
  "title": "Atlas Pods",
  "folderUid": "atlas-internal",
  "editable": true,
  "panels": [
{
  "id": 1,
  "type": "stat",
  "title": "Problem Pods",
  "datasource": { "type": "prometheus", "uid": "atlas-vm" },
  "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
  "targets": [
    {
      "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
      "refId": "A"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "color": { "mode": "thresholds" },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "red", "value": 1 }
        ]
      },
      "unit": "none",
      "custom": { "displayMode": "auto" }
    },
    "overrides": []
  },
  "options": {
    "colorMode": "value",
    "graphMode": "area",
    "justifyMode": "center",
    "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
    "textMode": "value"
  }
},
{
  "id": 2,
  "type": "stat",
  "title": "CrashLoop / ImagePull",
  "datasource": { "type": "prometheus", "uid": "atlas-vm" },
  "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
  "targets": [
    {
      "expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
      "refId": "A"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "color": { "mode": "thresholds" },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "red", "value": 1 }
        ]
      },
      "unit": "none",
      "custom": { "displayMode": "auto" }
    },
    "overrides": []
  },
  "options": {
    "colorMode": "value",
    "graphMode": "area",
    "justifyMode": "center",
    "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
    "textMode": "value"
  }
},
{
  "id": 3,
  "type": "stat",
  "title": "Stuck Terminating (>10m)",
  "datasource": { "type": "prometheus", "uid": "atlas-vm" },
  "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
  "targets": [
    {
      "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)",
      "refId": "A"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "color": { "mode": "thresholds" },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "red", "value": 1 }
        ]
      },
      "unit": "none",
      "custom": { "displayMode": "auto" }
    },
    "overrides": []
  },
  "options": {
    "colorMode": "value",
    "graphMode": "area",
    "justifyMode": "center",
    "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
    "textMode": "value"
  }
},
{
  "id": 4,
  "type": "stat",
  "title": "Control Plane Workloads",
  "datasource": { "type": "prometheus", "uid": "atlas-vm" },
  "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
  "targets": [
    {
      "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"}) or on() vector(0)",
      "refId": "A"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "color": { "mode": "thresholds" },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "red", "value": 1 }
        ]
      },
      "unit": "none",
      "custom": { "displayMode": "auto" }
    },
    "overrides": []
  },
  "options": {
    "colorMode": "value",
    "graphMode": "area",
    "justifyMode": "center",
    "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
    "textMode": "value"
  }
},
{
  "id": 5,
  "type": "table",
  "title": "Pods Not Running",
  "datasource": { "type": "prometheus", "uid": "atlas-vm" },
  "gridPos": { "h": 10, "w": 24, "x": 0, "y": 4 },
  "targets": [
    {
      "expr": "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"} > 0)",
      "refId": "A"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "s",
      "custom": { "filterable": true }
    },
    "overrides": []
  },
  "options": {
    "showHeader": true,
    "columnFilters": false
  },
  "transformations": [
    { "id": "labelsToFields", "options": {} }
  ]
},
{
  "id": 6,
  "type": "table",
  "title": "CrashLoop / ImagePull",
  "datasource": { "type": "prometheus", "uid": "atlas-vm" },
  "gridPos": { "h": 10, "w": 24, "x": 0, "y": 14 },
  "targets": [
    {
      "expr": "((time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info) * on(namespace,pod) group_right(node) max by (namespace,pod,container,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"} > 0)",
      "refId": "A"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "s",
      "custom": { "filterable": true }
    },
    "overrides": []
  },
  "options": {
    "showHeader": true,
    "columnFilters": false
  },
  "transformations": [
    { "id": "labelsToFields", "options": {} }
  ]
},
{
  "id": 7,
  "type": "table",
  "title": "Terminating >10m",
  "datasource": { "type": "prometheus", "uid": "atlas-vm" },
  "gridPos": { "h": 10, "w": 24, "x": 0, "y": 24 },
  "targets": [
    {
      "expr": "((((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) * on(namespace,pod) group_left(node) kube_pod_info)",
      "refId": "A"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "s",
      "custom": { "filterable": true }
    },
    "overrides": []
  },
  "options": {
    "showHeader": true,
    "columnFilters": false
  },
  "transformations": [
    { "id": "labelsToFields", "options": {} }
  ]
},
{
  "id": 8,
  "type": "piechart",
  "title": "Node Pod Share",
  "datasource": { "type": "prometheus", "uid": "atlas-vm" },
  "gridPos": { "h": 8, "w": 12, "x": 12, "y": 34 },
  "targets": [
    {
      "expr": "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / on() group_left() clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100",
      "refId": "A",
      "legendFormat": "{{node}}"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "percent",
      "color": { "mode": "palette-classic" }
    },
    "overrides": []
  },
  "options": {
    "legend": { "displayMode": "list", "placement": "right" },
    "pieType": "pie",
    "displayLabels": [],
    "tooltip": { "mode": "single" },
    "colorScheme": "interpolateSpectral",
    "colorBy": "value",
    "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
  }
},
{
  "id": 9,
  "type": "bargauge",
  "title": "Top Nodes by Pod Count",
  "datasource": { "type": "prometheus", "uid": "atlas-vm" },
  "gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 },
  "targets": [
    {
      "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
      "refId": "A",
      "legendFormat": "{{node}}",
      "instant": true
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "none",
      "min": 0,
      "max": null,
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "yellow", "value": 50 },
          { "color": "orange", "value": 75 },
          { "color": "red", "value": 100 }
        ]
      },
      "decimals": 0
    },
    "overrides": []
  },
  "options": {
    "displayMode": "gradient",
    "orientation": "horizontal",
    "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }
  },
  "transformations": [
    {
      "id": "sortBy",
      "options": {
        "fields": {},
        "sort": [
          { "field": "Value", "desc": true }
        ]
      }
    },
    {
      "id": "limit",
      "options": { "limitField": 12 }
    }
  ]
},
{
  "id": 10,
  "type": "table",
  "title": "Namespace Plurality by Node v27",
  "datasource": { "type": "prometheus", "uid": "atlas-vm" },
  "gridPos": { "h": 8, "w": 24, "x": 0, "y": 42 },
  "targets": [
    {
      "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.022)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.022)))))",
      "refId": "A",
      "instant": true,
      "format": "table"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "percent",
      "custom": { "filterable": false }
    },
    "overrides": []
  },
  "options": {
    "showHeader": true,
    "columnFilters": false,
    "showColumnFilters": false,
    "footer": { "show": false, "fields": "", "calcs": [] }
  },
  "transformations": [
    { "id": "labelsToFields", "options": {} },
    {
      "id": "organize",
      "options": {
        "excludeByName": { "Time": true }
      }
    },
    {
      "id": "filterByValue",
      "options": {
        "filters": [
          {
            "fieldName": "Value",
            "config": { "id": "greater", "options": { "value": 0 } }
          }
        ],
        "type": "include",
        "match": "all"
      }
    },
    {
      "id": "sortBy",
      "options": {
        "fields": {},
        "sort": [
          { "field": "Value", "desc": true }
        ]
      }
    },
    {
      "id": "groupBy",
      "options": {
        "fields": {
          "namespace": { "aggregations": [], "operation": "groupby" },
          "Value": { "aggregations": ["max"], "operation": "aggregate" },
          "node": { "aggregations": ["first"], "operation": "aggregate" }
        }
      }
    }
  ]
}
],
  "time": { "from": "now-12h", "to": "now" },
  "annotations": { "list": [] },
  "schemaVersion": 39,
  "style": "dark",
  "tags": ["atlas", "pods"]
}