{
  "uid": "atlas-pods",
  "title": "Atlas Pods",
  "folderUid": "atlas-internal",
  "editable": true,
  "panels": [
    {
      "id": 1,
      "type": "stat",
      "title": "Problem pods",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
      },
      "gridPos": {
        "h": 4,
        "w": 6,
        "x": 0,
        "y": 0
      },
      "targets": [
        {
          "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 1
              }
            ]
          },
          "unit": "none",
          "custom": {
            "displayMode": "auto"
          }
        },
        "overrides": []
      },
      "options": {
        "colorMode": "value",
        "graphMode": "area",
        "justifyMode": "center",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "textMode": "value"
      }
    },
    {
      "id": 2,
      "type": "stat",
      "title": "CrashLoop / ImagePull",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
      },
      "gridPos": {
        "h": 4,
        "w": 6,
        "x": 6,
        "y": 0
      },
      "targets": [
        {
          "expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 1
              }
            ]
          },
          "unit": "none",
          "custom": {
            "displayMode": "auto"
          }
        },
        "overrides": []
      },
      "options": {
        "colorMode": "value",
        "graphMode": "area",
        "justifyMode": "center",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "textMode": "value"
      }
    },
    {
      "id": 3,
      "type": "stat",
      "title": "Stuck terminating (>10m)",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
      },
      "gridPos": {
        "h": 4,
        "w": 6,
        "x": 12,
        "y": 0
      },
      "targets": [
        {
          "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > 0))) or on() vector(0)",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 1
              }
            ]
          },
          "unit": "none",
          "custom": {
            "displayMode": "auto"
          }
        },
        "overrides": []
      },
      "options": {
        "colorMode": "value",
        "graphMode": "area",
        "justifyMode": "center",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "textMode": "value"
      }
    },
    {
      "id": 4,
      "type": "stat",
      "title": "Control plane workloads",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
      },
      "gridPos": {
        "h": 4,
        "w": 6,
        "x": 18,
        "y": 0
      },
      "targets": [
        {
          "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"}) or on() vector(0)",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 1
              }
            ]
          },
          "unit": "none",
          "custom": {
            "displayMode": "auto"
          }
        },
        "overrides": []
      },
      "options": {
        "colorMode": "value",
        "graphMode": "area",
        "justifyMode": "center",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "textMode": "value"
      }
    },
    {
      "id": 5,
      "type": "table",
      "title": "Pods not running",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
      },
      "gridPos": {
        "h": 10,
        "w": 24,
        "x": 0,
        "y": 4
      },
      "targets": [
        {
          "expr": "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"} == 1)",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "s"
        },
        "overrides": []
      },
      "options": {
        "showHeader": true
      },
      "transformations": [
        {
          "id": "labelsToFields",
          "options": {}
        }
      ]
    },
    {
      "id": 6,
      "type": "table",
      "title": "CrashLoop / ImagePull",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
      },
      "gridPos": {
        "h": 10,
        "w": 24,
        "x": 0,
        "y": 14
      },
      "targets": [
        {
          "expr": "((time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info) * on(namespace,pod) group_right(node) max by (namespace,pod,container,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"} == 1)",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "s"
        },
        "overrides": []
      },
      "options": {
        "showHeader": true
      },
      "transformations": [
        {
          "id": "labelsToFields",
          "options": {}
        }
      ]
    },
    {
      "id": 7,
      "type": "table",
      "title": "Terminating >10m",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
      },
      "gridPos": {
        "h": 10,
        "w": 24,
        "x": 0,
        "y": 24
      },
      "targets": [
        {
          "expr": "((((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > 0)) * on(namespace,pod) group_left(node) kube_pod_info)",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "s"
        },
        "overrides": []
      },
      "options": {
        "showHeader": true
      },
      "transformations": [
        {
          "id": "labelsToFields",
          "options": {}
        }
      ]
    }
  ],
  "time": {
    "from": "now-12h",
    "to": "now"
  },
  "annotations": {
    "list": []
  },
  "schemaVersion": 39,
  "style": "dark",
  "tags": [
    "atlas",
    "pods"
  ]
}