From 0b1437b77c93c5a0ac411983d080938e3030af8c Mon Sep 17 00:00:00 2001
From: Brad Stein
Date: Sat, 15 Nov 2025 21:03:11 -0300
Subject: [PATCH] monitoring: refresh grafana dashboards

Rework the Atlas Public and Atlas SRE dashboards and add a
grafana-folders ConfigMap that is mounted into Grafana at
/etc/grafana/provisioning/folders.

Dashboard query notes:
- Per-node CPU usage is derived from the idle rate,
  (1 - avg by (instance) (idle rate)) * 100, so the value is total busy
  time per node rather than an average across the non-idle modes.
- "Key service availability" uses the current value-mapping format
  (type "value" with an options map) for the 0/1 -> Down/Up text instead
  of the legacy id/type/value/text entries.
- "Active alerts" falls back to vector(0) so the stat shows 0 instead of
  "No data" when nothing is firing.
- "Deployments missing replicas" only lists deployments that actually
  have unavailable replicas (> 0).
- "PVC usage" aggregates with max so a claim that is reported by more
  than one kubelet is not summed past 100%.
---
Review notes (this section is ignored by git am):
- NOTE(review): confirm the deployed Grafana version reads
  /etc/grafana/provisioning/folders and honours "folderUid" inside
  file-provisioned dashboard JSON; if it does not, the folder permissions
  in grafana-folders.yaml are never applied and the folder should be set
  through the dashboard provider configuration instead.
- NOTE(review): this patch was reconstructed from a whitespace-flattened
  copy. JSON/YAML indentation, the blank context lines in the helmrelease
  hunk, and the whitespace-only change to the "type": "dashboard" line in
  the SRE dashboard are best-effort; regenerate with git format-patch
  before applying.

 .../monitoring/grafana-dashboard-public.yaml  | 544 ++++++++++++++----
 .../monitoring/grafana-dashboard-sre.yaml     | 527 ++++++++++++++---
 services/monitoring/grafana-folders.yaml      |  28 +
 services/monitoring/helmrelease.yaml          |   5 +
 services/monitoring/kustomization.yaml        |   1 +
 5 files changed, 902 insertions(+), 203 deletions(-)
 create mode 100644 services/monitoring/grafana-folders.yaml

diff --git a/services/monitoring/grafana-dashboard-public.yaml b/services/monitoring/grafana-dashboard-public.yaml
index aee871f..126b1b3 100644
--- a/services/monitoring/grafana-dashboard-public.yaml
+++ b/services/monitoring/grafana-dashboard-public.yaml
@@ -25,17 +25,30 @@ data:
         ]
       },
       "editable": false,
-      "fiscalYearStartMonth": 0,
+      "folderUid": "atlas-public",
       "graphTooltip": 0,
-      "id": null,
       "links": [],
-      "liveNow": false,
       "panels": [
         {
+          "id": 1,
+          "type": "stat",
+          "title": "Running pods",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
+          "gridPos": {
+            "h": 6,
+            "w": 6,
+            "x": 0,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "sum(kube_pod_status_phase{phase=\"Running\"})",
+              "refId": "A"
+            }
+          ],
           "fieldConfig": {
             "defaults": {
               "color": {
@@ -46,8 +59,12 @@ data:
                 "mode": "absolute",
                 "steps": [
                   {
-                    "color": "green",
+                    "color": "rgba(115, 115, 115, 1)",
                     "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
                   }
                 ]
               },
@@ -55,59 +72,105 @@ data:
             },
             "overrides": []
           },
-          "gridPos": {
-            "h": 7,
-            "w": 6,
-            "x": 0,
-            "y": 0
-          },
-          "id": 1,
           "options": {
             "colorMode": "value",
             "graphMode": "area",
-            "justifyMode": "auto",
-            "orientation": "auto",
+            "justifyMode": "center",
             "reduceOptions": {
               "calcs": [
                 "lastNotNull"
               ],
               "fields": "",
               "values": false
-            },
-            "text": {},
-            "textMode": "auto"
-          },
-          "pluginVersion": "10.4.0",
-          "targets": [
-            {
-              "datasource": {
-                "type": "prometheus",
-                "uid": "atlas-vm"
-              },
-              "editorMode": "code",
-              "expr": "sum(kube_pod_status_phase{phase=\"Running\"})",
-              "legendFormat": "",
-              "range": true,
-              "refId": "A"
             }
-          ],
-          "title": "Running pods",
-          "type": "stat"
+          }
         },
         {
+          "id": 2,
+          "type": "stat",
+          "title": "Ready node percentage",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
-          "description": "Aggregated CPU usage across all schedulable nodes.",
+          "gridPos": {
+            "h": 6,
+            "w": 6,
+            "x": 6,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) / sum(kube_node_info) * 100",
+              "refId": "A"
+            }
+          ],
           "fieldConfig": {
             "defaults": {
               "color": {
-                "mode": "continuous-BlYlRd"
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "percentage",
+                "steps": [
+                  {
+                    "color": "red",
+                    "value": null
+                  },
+                  {
+                    "color": "orange",
+                    "value": 90
+                  },
+                  {
+                    "color": "green",
+                    "value": 98
+                  }
+                ]
+              },
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          }
+        },
+        {
+          "id": 3,
+          "type": "stat",
+          "title": "Cluster CPU saturation",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 6,
+            "w": 6,
+            "x": 12,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "avg((1 - rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
               },
               "mappings": [],
-              "max": 100,
-              "min": 0,
               "thresholds": {
                 "mode": "percentage",
                 "steps": [
@@ -117,7 +180,7 @@ data:
                   },
                   {
                     "color": "yellow",
-                    "value": 60
+                    "value": 65
                   },
                   {
                     "color": "red",
@@ -129,79 +192,165 @@ data:
             },
             "overrides": []
           },
-          "gridPos": {
-            "h": 7,
-            "w": 6,
-            "x": 6,
-            "y": 0
-          },
-          "id": 2,
           "options": {
             "colorMode": "value",
             "graphMode": "area",
-            "justifyMode": "auto",
-            "orientation": "auto",
+            "justifyMode": "center",
             "reduceOptions": {
               "calcs": [
                 "lastNotNull"
               ],
               "fields": "",
               "values": false
-            },
-            "text": {},
-            "textMode": "auto"
-          },
-          "targets": [
-            {
-              "datasource": {
-                "type": "prometheus",
-                "uid": "atlas-vm"
-              },
-              "expr": "avg(100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100))",
-              "legendFormat": "",
-              "refId": "A"
             }
-          ],
-          "title": "Average node CPU",
-          "type": "stat"
+          }
         },
         {
+          "id": 4,
+          "type": "stat",
+          "title": "Cluster memory usage",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 8,
-            "w": 12,
-            "x": 0,
-            "y": 7
+            "h": 6,
+            "w": 6,
+            "x": 18,
+            "y": 0
           },
-          "id": 3,
           "targets": [
             {
-              "datasource": {
-                "type": "prometheus",
-                "uid": "atlas-vm"
-              },
-              "expr": "sum(kube_pod_status_phase{phase=\"Running\"}) by (namespace)",
-              "legendFormat": "{{namespace}}",
+              "expr": "100 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes) * 100)",
               "refId": "A"
             }
           ],
-          "title": "Running pods per namespace",
-          "type": "bargauge",
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "percentage",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 70
+                  },
+                  {
+                    "color": "red",
+                    "value": 85
+                  }
+                ]
+              },
+              "unit": "percent"
+            },
+            "overrides": []
+          },
           "options": {
-            "displayMode": "gradient",
-            "orientation": "horizontal",
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
             "reduceOptions": {
-              "calcs": ["lastNotNull"],
+              "calcs": [
+                "lastNotNull"
+              ],
               "fields": "",
               "values": false
-            },
-            "showUnfilled": false
+            }
           }
         },
         {
+          "id": 5,
+          "type": "piechart",
+          "title": "Namespace CPU share",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 0,
+            "y": 6
+          },
+          "targets": [
+            {
+              "expr": "topk(8, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace))",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "cores"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "list",
+              "placement": "right"
+            },
+            "pieType": "pie",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          }
+        },
+        {
+          "id": 6,
+          "type": "piechart",
+          "title": "Namespace memory share",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 12,
+            "y": 6
+          },
+          "targets": [
+            {
+              "expr": "topk(8, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace))",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "bytes"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "list",
+              "placement": "right"
+            },
+            "pieType": "donut",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          }
+        },
+        {
+          "id": 7,
+          "type": "timeseries",
+          "title": "Node CPU usage (per node)",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -212,38 +361,70 @@ data:
             "x": 0,
             "y": 15
           },
-          "id": 4,
           "targets": [
             {
-              "datasource": {
-                "type": "prometheus",
-                "uid": "atlas-vm"
-              },
-              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
-              "legendFormat": "Ready",
-              "refId": "A"
-            },
-            {
-              "datasource": {
-                "type": "prometheus",
-                "uid": "atlas-vm"
-              },
-              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"false\"})",
-              "legendFormat": "Not Ready",
-              "refId": "B"
+              "expr": "(1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100",
+              "refId": "A",
+              "legendFormat": "{{instance}}"
             }
           ],
-          "title": "Node readiness",
-          "type": "piechart",
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
           "options": {
             "legend": {
               "displayMode": "table",
-              "placement": "right"
+              "placement": "bottom"
             },
-            "pieType": "donut"
+            "tooltip": {
+              "mode": "multi"
+            }
           }
         },
         {
+          "id": 8,
+          "type": "timeseries",
+          "title": "Node memory usage (per node)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 15
+          },
+          "targets": [
+            {
+              "expr": "avg((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100) by (instance)",
+              "refId": "A",
+              "legendFormat": "{{instance}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 9,
+          "type": "table",
+          "title": "Key service availability",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -254,29 +435,38 @@ data:
             "x": 0,
             "y": 23
           },
-          "id": 5,
           "targets": [
             {
-              "datasource": {
-                "type": "prometheus",
-                "uid": "atlas-vm"
-              },
-              "expr": "sum by (namespace) (increase(kube_pod_status_phase{phase=\"Failed\"}[1d]))",
-              "legendFormat": "{{namespace}}",
+              "expr": "max by (service) (up{service=~\"traefik|gitea|vault|victoria-metrics-single|grafana|alertmanager\"})",
               "refId": "A"
             }
           ],
-          "title": "Failed pods (24h)",
-          "type": "table",
           "fieldConfig": {
             "defaults": {
-              "unit": "none",
-              "mappings": [],
+              "mappings": [
+                {
+                  "type": "value",
+                  "options": {
+                    "0": {
+                      "text": "Down"
+                    },
+                    "1": {
+                      "text": "Up"
+                    }
+                  }
+                }
+              ],
               "thresholds": {
                 "mode": "absolute",
                 "steps": [
-                  {"color": "green", "value": null},
-                  {"color": "red", "value": 1}
+                  {
+                    "color": "red",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
                 ]
               }
             },
@@ -285,6 +475,126 @@ data:
           "options": {
             "showHeader": true
           }
+        },
+        {
+          "id": 10,
+          "type": "table",
+          "title": "Failed pods (24h trend)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 23
+          },
+          "targets": [
+            {
+              "expr": "topk(10, sum(increase(kube_pod_status_phase{phase=\"Failed\"}[24h])) by (namespace))",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none"
+            },
+            "overrides": []
+          },
+          "options": {
+            "showHeader": true
+          }
+        },
+        {
+          "id": 11,
+          "type": "timeseries",
+          "title": "Cluster network throughput",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 31
+          },
+          "targets": [
+            {
+              "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\"}[5m]))",
+              "refId": "A",
+              "legendFormat": "Receive"
+            },
+            {
+              "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]))",
+              "refId": "B",
+              "legendFormat": "Transmit"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "Bps"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 12,
+          "type": "timeseries",
+          "title": "Storage usage across nodes",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 31
+          },
+          "targets": [
+            {
+              "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}) * 100)",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "list",
+              "placement": "bottom"
+            }
+          }
+        },
+        {
+          "id": 13,
+          "type": "text",
+          "title": "About this dashboard",
+          "gridPos": {
+            "h": 6,
+            "w": 24,
+            "x": 0,
+            "y": 39
+          },
+          "options": {
+            "content": "### Atlas at a glance\n- Raspberry Pi + Jetson hybrid cluster with Flux-managed GitOps\n- Metrics powered by VictoriaMetrics, visualized by Grafana\n- Login for SRE mode with pod-level drilldowns, alert routes, and storage health",
+            "mode": "markdown"
+          }
         }
       ],
       "refresh": "30s",
@@ -301,10 +611,8 @@ data:
         "from": "now-12h",
         "to": "now"
       },
-      "timepicker": {},
-      "timezone": "",
       "title": "Atlas Public Overview",
       "uid": "atlas-public",
-      "version": 1,
-      "weekStart": ""
+      "version": 3
     }
+
diff --git a/services/monitoring/grafana-dashboard-sre.yaml b/services/monitoring/grafana-dashboard-sre.yaml
index d146275..b46c17a 100644
--- a/services/monitoring/grafana-dashboard-sre.yaml
+++ b/services/monitoring/grafana-dashboard-sre.yaml
@@ -20,29 +20,41 @@ data:
             "hide": true,
             "iconColor": "rgba(0, 211, 255, 1)",
             "name": "Annotations & Alerts",
-            "type": "dashboard"
+            "type": "dashboard"
           }
         ]
       },
       "editable": true,
-      "fiscalYearStartMonth": 0,
+      "folderUid": "atlas-sre",
       "graphTooltip": 0,
       "links": [],
       "panels": [
         {
+          "id": 1,
+          "type": "stat",
+          "title": "Ready nodes",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
-          "description": "Percentage of Ready nodes.",
+          "gridPos": {
+            "h": 5,
+            "w": 6,
+            "x": 0,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) / sum(kube_node_info) * 100",
+              "refId": "A"
+            }
+          ],
           "fieldConfig": {
             "defaults": {
               "color": {
-                "mode": "continuous-RdYlGr"
+                "mode": "palette-classic"
               },
               "mappings": [],
-              "max": 100,
-              "min": 0,
               "thresholds": {
                 "mode": "percentage",
                 "steps": [
@@ -50,9 +62,13 @@ data:
                     "color": "red",
                     "value": null
                   },
+                  {
+                    "color": "yellow",
+                    "value": 95
+                  },
                   {
                     "color": "green",
-                    "value": 90
+                    "value": 99
                   }
                 ]
               },
@@ -60,18 +76,10 @@ data:
             },
             "overrides": []
           },
-          "gridPos": {
-            "h": 7,
-            "w": 6,
-            "x": 0,
-            "y": 0
-          },
-          "id": 10,
           "options": {
             "colorMode": "value",
-            "graphMode": "none",
+            "graphMode": "area",
             "justifyMode": "center",
-            "orientation": "auto",
             "reduceOptions": {
               "calcs": [
                 "lastNotNull"
@@ -79,92 +87,192 @@ data:
               "fields": "",
               "values": false
             }
-          },
-          "targets": [
-            {
-              "datasource": {
-                "type": "prometheus",
-                "uid": "atlas-vm"
-              },
-              "expr": "avg(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) * 100",
-              "refId": "A"
-            }
-          ],
-          "title": "Ready nodes",
-          "type": "stat"
+          }
         },
         {
+          "id": 2,
+          "type": "stat",
+          "title": "Pending pods",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 7,
+            "h": 5,
             "w": 6,
             "x": 6,
             "y": 0
           },
-          "id": 11,
-          "options": {
-            "legend": {
-              "displayMode": "table",
-              "placement": "bottom"
-            },
-            "tooltip": {
-              "mode": "multi"
-            }
-          },
           "targets": [
             {
-              "datasource": {
-                "type": "prometheus",
-                "uid": "atlas-vm"
-              },
-              "expr": "sum by (node)(node_filesystem_avail_bytes{mountpoint=\"/\"})",
-              "legendFormat": "{{node}}",
+              "expr": "sum(kube_pod_status_phase{phase=\"Pending\"})",
               "refId": "A"
             }
           ],
-          "title": "Free root filesystem bytes",
-          "type": "timeseries"
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 3
+                  },
+                  {
+                    "color": "red",
+                    "value": 10
+                  }
+                ]
+              },
+              "unit": "none"
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          }
         },
         {
+          "id": 3,
+          "type": "stat",
+          "title": "Unavailable deployment replicas",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 10,
-            "w": 12,
-            "x": 0,
-            "y": 7
-          },
-          "id": 12,
-          "options": {
-            "legend": {
-              "calcs": [],
-              "displayMode": "list",
-              "placement": "bottom"
-            },
-            "tooltip": {
-              "mode": "single"
-            }
+            "h": 5,
+            "w": 6,
+            "x": 12,
+            "y": 0
           },
           "targets": [
             {
-              "datasource": {
-                "type": "prometheus",
-                "uid": "atlas-vm"
-              },
-              "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"crypto\",container!=\"\"}[5m])) by (pod)",
-              "legendFormat": "{{pod}}",
+              "expr": "sum(kube_deployment_status_replicas_unavailable)",
               "refId": "A"
             }
           ],
-          "title": "Crypto namespace CPU usage",
-          "type": "timeseries"
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 1
+                  },
+                  {
+                    "color": "red",
+                    "value": 3
+                  }
+                ]
+              },
+              "unit": "none"
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          }
         },
         {
+          "id": 4,
+          "type": "stat",
+          "title": "Active alerts",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 5,
+            "w": 6,
+            "x": 18,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "sum(ALERTS{alertstate=\"firing\"}) or vector(0)",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 1
+                  },
+                  {
+                    "color": "red",
+                    "value": 3
+                  }
+                ]
+              },
+              "unit": "none"
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          }
+        },
+        {
+          "id": 5,
+          "type": "timeseries",
+          "title": "Node CPU usage",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -173,9 +281,168 @@ data:
             "h": 9,
             "w": 12,
             "x": 0,
-            "y": 17
+            "y": 5
+          },
+          "targets": [
+            {
+              "expr": "(1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100",
+              "refId": "A",
+              "legendFormat": "{{instance}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 6,
+          "type": "timeseries",
+          "title": "Node memory usage",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 12,
+            "y": 5
+          },
+          "targets": [
+            {
+              "expr": "avg((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100) by (instance)",
+              "refId": "A",
+              "legendFormat": "{{instance}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 7,
+          "type": "timeseries",
+          "title": "Top pod CPU (5m avg)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 0,
+            "y": 14
+          },
+          "targets": [
+            {
+              "expr": "topk(5, sum(rate(container_cpu_usage_seconds_total{pod!=\"\",container!=\"\"}[5m])) by (namespace,pod))",
+              "refId": "A",
+              "legendFormat": "{{namespace}}/{{pod}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "cores"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 8,
+          "type": "timeseries",
+          "title": "Top pod memory working set",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 12,
+            "y": 14
+          },
+          "targets": [
+            {
+              "expr": "topk(5, sum(container_memory_working_set_bytes{pod!=\"\",container!=\"\"}) by (namespace,pod))",
+              "refId": "A",
+              "legendFormat": "{{namespace}}/{{pod}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "bytes"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 9,
+          "type": "bargauge",
+          "title": "Namespace restart rate (6h)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 23
+          },
+          "targets": [
+            {
+              "expr": "topk(8, sum(increase(kube_pod_container_status_restarts_total{namespace!=\"\"}[6h])) by (namespace))",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none"
+            },
+            "overrides": []
           },
-          "id": 13,
           "options": {
             "displayMode": "gradient",
             "orientation": "horizontal",
@@ -185,22 +452,112 @@ data:
               ],
               "fields": "",
               "values": false
-            },
-            "showUnfilled": false
+            }
+          }
+        },
+        {
+          "id": 10,
+          "type": "table",
+          "title": "Deployments missing replicas",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 23
           },
           "targets": [
             {
-              "datasource": {
-                "type": "prometheus",
-                "uid": "atlas-vm"
-              },
-              "expr": "count(sum(kube_pod_status_phase{phase=\"Failed\"}) by (namespace))",
-              "legendFormat": "",
+              "expr": "topk(10, sum by (namespace,deployment) (kube_deployment_status_replicas_unavailable) > 0)",
               "refId": "A"
             }
           ],
-          "title": "Namespaces with failed pods",
-          "type": "bargauge"
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none"
+            },
+            "overrides": []
+          },
+          "options": {
+            "showHeader": true
+          }
+        },
+        {
+          "id": 11,
+          "type": "timeseries",
+          "title": "Pod phase breakdown",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 0,
+            "y": 31
+          },
+          "targets": [
+            {
+              "expr": "sum(kube_pod_status_phase) by (phase)",
+              "refId": "A",
+              "legendFormat": "{{phase}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 12,
+          "type": "timeseries",
+          "title": "PVC usage (top 8)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 12,
+            "y": 31
+          },
+          "targets": [
+            {
+              "expr": "topk(8, max by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100))",
+              "refId": "A",
+              "legendFormat": "{{namespace}}/{{persistentvolumeclaim}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
         }
       ],
       "schemaVersion": 39,
@@ -216,8 +573,8 @@ data:
         "from": "now-12h",
         "to": "now"
       },
-      "timepicker": {},
       "title": "Atlas SRE Overview",
       "uid": "atlas-sre",
-      "version": 1
+      "version": 2
     }
+
diff --git a/services/monitoring/grafana-folders.yaml b/services/monitoring/grafana-folders.yaml
new file mode 100644
index 0000000..503aaee
--- /dev/null
+++ b/services/monitoring/grafana-folders.yaml
@@ -0,0 +1,28 @@
+# services/monitoring/grafana-folders.yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-folders
+  labels:
+    app.kubernetes.io/name: grafana
+    app.kubernetes.io/component: folders
+data:
+  folders.yaml: |
+    apiVersion: 1
+    folders:
+      - uid: atlas-public
+        title: Atlas Public
+        permissions:
+          - role: Viewer
+            permission: View
+          - role: Editor
+            permission: Edit
+          - role: Admin
+            permission: Admin
+      - uid: atlas-sre
+        title: Atlas SRE
+        permissions:
+          - role: Editor
+            permission: View
+          - role: Admin
+            permission: Admin
diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index 266ddcd..4efae70 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -297,6 +297,11 @@ spec:
     dashboardsConfigMaps:
       public: grafana-dashboard-public
       sre: grafana-dashboard-sre
+    extraConfigmapMounts:
+      - name: grafana-folders
+        mountPath: /etc/grafana/provisioning/folders
+        configMap: grafana-folders
+        readOnly: true
 
 ---
 
diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml
index bb321b5..73e7d23 100644
--- a/services/monitoring/kustomization.yaml
+++ b/services/monitoring/kustomization.yaml
@@ -7,4 +7,5 @@ resources:
   - rbac.yaml
   - grafana-dashboard-public.yaml
   - grafana-dashboard-sre.yaml
+  - grafana-folders.yaml
   - helmrelease.yaml