# services/monitoring/grafana-dashboard-sre.yaml
#
# ConfigMap that ships the "Atlas SRE Overview" Grafana dashboard as a JSON
# document under the key atlas-sre-overview.json.
# NOTE(review): the grafana_dashboard label is presumably what the Grafana
# dashboard sidecar selects on; confirm against the sidecar configuration.
#
# The embedded document is strict JSON and cannot carry comments, so the
# conventions it follows are recorded here:
#   - Stat panels use color mode "thresholds" so that the threshold steps
#     (not the series palette) decide the colour of the value.
#   - "Ready nodes" is already a 0-100 value, so its thresholds are absolute.
#   - "Active alerts" appends `or vector(0)` so the panel shows 0 instead of
#     "No data" when nothing is firing.
#   - The bar gauge and table panels run instant queries, so topk() is
#     evaluated once at the end of the range instead of once per step.
#   - "Deployments missing replicas" only lists deployments with > 0
#     unavailable replicas.
#   - "PVC usage" aggregates with max so that a claim reported by several
#     kubelets is not summed above 100 %.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-sre
  labels:
    grafana_dashboard: "1"
data:
  atlas-sre-overview.json: |
    {
      "annotations": {
        "list": [
          {
            "builtIn": 1,
            "datasource": { "type": "datasource", "uid": "grafana" },
            "enable": true,
            "hide": true,
            "iconColor": "rgba(0, 211, 255, 1)",
            "name": "Annotations & Alerts",
            "type": "dashboard"
          }
        ]
      },
      "editable": true,
      "folderUid": "atlas-sre",
      "graphTooltip": 0,
      "links": [],
      "panels": [
        {
          "id": 1,
          "type": "stat",
          "title": "Ready nodes",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 5, "w": 6, "x": 0, "y": 0 },
          "targets": [
            {
              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) / count(kube_node_info) * 100",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "thresholds" },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "red", "value": null },
                  { "color": "yellow", "value": 95 },
                  { "color": "green", "value": 99 }
                ]
              },
              "unit": "percent"
            },
            "overrides": []
          },
          "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false },
            "textMode": "value"
          }
        },
        {
          "id": 2,
          "type": "stat",
          "title": "Pending pods",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 5, "w": 6, "x": 6, "y": 0 },
          "targets": [
            {
              "expr": "sum(kube_pod_status_phase{phase=\"Pending\"})",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "thresholds" },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 3 },
                  { "color": "red", "value": 10 }
                ]
              },
              "unit": "none"
            },
            "overrides": []
          },
          "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false },
            "textMode": "value"
          }
        },
        {
          "id": 3,
          "type": "stat",
          "title": "Unavailable deployment replicas",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 5, "w": 6, "x": 12, "y": 0 },
          "targets": [
            {
              "expr": "sum(kube_deployment_status_replicas_unavailable)",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "thresholds" },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 1 },
                  { "color": "red", "value": 3 }
                ]
              },
              "unit": "none"
            },
            "overrides": []
          },
          "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false },
            "textMode": "value"
          }
        },
        {
          "id": 4,
          "type": "stat",
          "title": "Active alerts",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 5, "w": 6, "x": 18, "y": 0 },
          "targets": [
            {
              "expr": "sum(ALERTS{alertstate=\"firing\"}) or vector(0)",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "thresholds" },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 1 },
                  { "color": "red", "value": 3 }
                ]
              },
              "unit": "none"
            },
            "overrides": []
          },
          "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false },
            "textMode": "value"
          }
        },
        {
          "id": 5,
          "type": "timeseries",
          "title": "Node CPU usage",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 9, "w": 12, "x": 0, "y": 5 },
          "targets": [
            {
              "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))",
              "refId": "A",
              "legendFormat": "{{node}}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "percent" },
            "overrides": []
          },
          "options": {
            "legend": { "displayMode": "table", "placement": "bottom" },
            "tooltip": { "mode": "multi" }
          }
        },
        {
          "id": 6,
          "type": "timeseries",
          "title": "Node memory usage",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 9, "w": 12, "x": 12, "y": 5 },
          "targets": [
            {
              "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))",
              "refId": "A",
              "legendFormat": "{{node}}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "percent" },
            "overrides": []
          },
          "options": {
            "legend": { "displayMode": "table", "placement": "bottom" },
            "tooltip": { "mode": "multi" }
          }
        },
        {
          "id": 7,
          "type": "timeseries",
          "title": "Top pod CPU (5m avg)",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 9, "w": 12, "x": 0, "y": 14 },
          "targets": [
            {
              "expr": "topk(5, sum(rate(container_cpu_usage_seconds_total{pod!=\"\",container!=\"\"}[5m])) by (namespace,pod))",
              "refId": "A",
              "legendFormat": "{{namespace}}/{{pod}}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "cores" },
            "overrides": []
          },
          "options": {
            "legend": { "displayMode": "table", "placement": "bottom" },
            "tooltip": { "mode": "multi" }
          }
        },
        {
          "id": 8,
          "type": "timeseries",
          "title": "Top pod memory working set",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 9, "w": 12, "x": 12, "y": 14 },
          "targets": [
            {
              "expr": "topk(5, sum(container_memory_working_set_bytes{pod!=\"\",container!=\"\"}) by (namespace,pod))",
              "refId": "A",
              "legendFormat": "{{namespace}}/{{pod}}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "bytes" },
            "overrides": []
          },
          "options": {
            "legend": { "displayMode": "table", "placement": "bottom" },
            "tooltip": { "mode": "multi" }
          }
        },
        {
          "id": 9,
          "type": "bargauge",
          "title": "Namespace restart rate (6h)",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 23 },
          "targets": [
            {
              "expr": "topk(8, sum(increase(kube_pod_container_status_restarts_total{namespace!=\"\"}[6h])) by (namespace))",
              "refId": "A",
              "instant": true,
              "legendFormat": "{{namespace}}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "none" },
            "overrides": []
          },
          "options": {
            "displayMode": "gradient",
            "orientation": "horizontal",
            "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }
          }
        },
        {
          "id": 10,
          "type": "table",
          "title": "Deployments missing replicas",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 23 },
          "targets": [
            {
              "expr": "topk(10, sum by (namespace,deployment) (kube_deployment_status_replicas_unavailable) > 0)",
              "refId": "A",
              "instant": true,
              "format": "table"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "none" },
            "overrides": []
          },
          "options": { "showHeader": true }
        },
        {
          "id": 11,
          "type": "timeseries",
          "title": "Pod phase breakdown",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 9, "w": 12, "x": 0, "y": 31 },
          "targets": [
            {
              "expr": "sum(kube_pod_status_phase) by (phase)",
              "refId": "A",
              "legendFormat": "{{phase}}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "none" },
            "overrides": []
          },
          "options": {
            "legend": { "displayMode": "table", "placement": "bottom" },
            "tooltip": { "mode": "multi" }
          }
        },
        {
          "id": 12,
          "type": "timeseries",
          "title": "PVC usage (top 8)",
          "datasource": { "type": "prometheus", "uid": "atlas-vm" },
          "gridPos": { "h": 9, "w": 12, "x": 12, "y": 31 },
          "targets": [
            {
              "expr": "topk(8, max by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100))",
              "refId": "A",
              "legendFormat": "{{namespace}}/{{persistentvolumeclaim}}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "percent" },
            "overrides": []
          },
          "options": {
            "legend": { "displayMode": "table", "placement": "bottom" },
            "tooltip": { "mode": "multi" }
          }
        }
      ],
      "schemaVersion": 39,
      "style": "dark",
      "tags": [ "atlas", "sre" ],
      "templating": { "list": [] },
      "time": { "from": "now-12h", "to": "now" },
      "title": "Atlas SRE Overview",
      "uid": "atlas-sre",
      "version": 4
    }