titan-iac/services/monitoring/grafana-dashboard-pods.yaml

574 lines
15 KiB
YAML

# services/monitoring/grafana-dashboard-pods.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-pods
labels:
grafana_dashboard: "1"
data:
atlas-pods.json: |
{
"uid": "atlas-pods",
"title": "Atlas Pods",
"folderUid": "atlas-internal",
"editable": true,
"panels": [
{
"id": 1,
"type": "stat",
"title": "Problem Pods",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 1
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 2,
"type": "stat",
"title": "CrashLoop / ImagePull",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 6,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 1
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 3,
"type": "stat",
"title": "Stuck Terminating (>10m)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 12,
"y": 0
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 1
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "stat",
"title": "Control Plane Workloads",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 18,
"y": 0
},
"targets": [
{
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 1
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 5,
"type": "table",
"title": "Pods Not Running",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 24,
"x": 0,
"y": 4
},
"targets": [
{
"expr": "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "s"
},
"overrides": []
},
"options": {
"showHeader": true
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
}
]
},
{
"id": 6,
"type": "table",
"title": "CrashLoop / ImagePull",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 24,
"x": 0,
"y": 14
},
"targets": [
{
"expr": "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) max by (namespace,pod,container,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "s"
},
"overrides": []
},
"options": {
"showHeader": true
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
}
]
},
{
"id": 7,
"type": "table",
"title": "Terminating >10m",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 24,
"x": 0,
"y": 24
},
"targets": [
{
"expr": "(((time() - kube_pod_deletion_timestamp{pod!=\"\"}) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) * on(namespace,pod) group_left(node) kube_pod_info)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "s"
},
"overrides": []
},
"options": {
"showHeader": true
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
},
{
"id": "filterByValue",
"options": {
"match": "Value",
"operator": "gt",
"value": 600
}
}
]
},
{
"id": 8,
"type": "piechart",
"title": "Node Pod Share",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 34
},
"targets": [
{
"expr": "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100",
"refId": "A",
"legendFormat": "{{namespace}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "pie",
"displayLabels": [],
"tooltip": {
"mode": "single"
},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
}
},
{
"id": 9,
"type": "bargauge",
"title": "Top Nodes by Pod Count",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 34
},
"targets": [
{
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
"refId": "A",
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "none",
"min": 0,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "orange",
"value": 75
},
{
"color": "red",
"value": 100
}
]
},
"decimals": 0
},
"overrides": []
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
},
"transformations": [
{
"id": "sortBy",
"options": {
"fields": [
"Value"
],
"order": "desc"
}
},
{
"id": "limit",
"options": {
"limit": 12
}
}
]
},
{
"id": 10,
"type": "table",
"title": "Namespace Plurality by Node",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 42
},
"targets": [
{
"expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * ((sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100))))",
"refId": "A",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"showHeader": true
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
},
{
"id": "filterByValue",
"options": {
"match": "Value",
"operator": "gt",
"value": 0
}
},
{
"id": "sortBy",
"options": {
"fields": [
"node",
"Value"
],
"order": "asc"
}
}
]
}
],
"time": {
"from": "now-12h",
"to": "now"
},
"annotations": {
"list": []
},
"schemaVersion": 39,
"style": "dark",
"tags": [
"atlas",
"pods"
]
}