diff --git a/services/monitoring/grafana-dashboard-public.yaml b/services/monitoring/grafana-dashboard-public.yaml index 126b1b3..35fa124 100644 --- a/services/monitoring/grafana-dashboard-public.yaml +++ b/services/monitoring/grafana-dashboard-public.yaml @@ -38,8 +38,8 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, - "w": 6, + "h": 5, + "w": 4, "x": 0, "y": 0 }, @@ -82,26 +82,27 @@ data: ], "fields": "", "values": false - } + }, + "textMode": "value" } }, { "id": 2, "type": "stat", - "title": "Ready node percentage", + "title": "Ready nodes", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 6, - "w": 6, - "x": 6, + "h": 5, + "w": 4, + "x": 4, "y": 0 }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) / sum(kube_node_info) * 100", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})", "refId": "A" } ], @@ -112,23 +113,19 @@ data: }, "mappings": [], "thresholds": { - "mode": "percentage", + "mode": "absolute", "steps": [ { - "color": "red", + "color": "rgba(115, 115, 115, 1)", "value": null }, - { - "color": "orange", - "value": 90 - }, { "color": "green", - "value": 98 + "value": 1 } ] }, - "unit": "percent" + "unit": "none" }, "overrides": [] }, @@ -142,26 +139,27 @@ data: ], "fields": "", "values": false - } + }, + "textMode": "value" } }, { "id": 3, "type": "stat", - "title": "Cluster CPU saturation", + "title": "Cluster nodes", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 6, - "w": 6, - "x": 12, + "h": 5, + "w": 4, + "x": 8, "y": 0 }, "targets": [ { - "expr": "avg((1 - rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", + "expr": "count(kube_node_info)", "refId": "A" } ], @@ -172,23 +170,19 @@ data: }, "mappings": [], "thresholds": { - "mode": "percentage", + "mode": "absolute", "steps": [ { - "color": "green", + "color": "rgba(115, 115, 115, 1)", "value": null }, { - "color": "yellow", - "value": 65 - }, - { - "color": "red", - "value": 85 + "color": "green", + "value": 1 } ] }, - "unit": "percent" + "unit": "none" }, "overrides": [] }, @@ -202,26 +196,27 @@ data: ], "fields": "", "values": false - } + }, + "textMode": "value" } }, { "id": 4, "type": "stat", - "title": "Cluster memory usage", + "title": "Hottest node CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 6, - "w": 6, - "x": 18, + "h": 5, + "w": 4, + "x": 12, "y": 0 }, "targets": [ { - "expr": "100 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes) * 100)", + "expr": "topk(1, avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)))", "refId": "A" } ], @@ -262,11 +257,134 @@ data: ], "fields": "", "values": false - } + }, + "textMode": "value_and_name" } }, { "id": 5, + "type": "stat", + "title": "Hottest node memory", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 16, + "y": 0 + }, + "targets": [ + { + "expr": "topk(1, avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 75 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + } + }, + { + "id": 6, + "type": "stat", + "title": "Failed pods (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 20, + "y": 0 + }, + "targets": [ + { + "expr": "sum(increase(kube_pod_status_phase{phase=\"Failed\"}[24h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 7, "type": "piechart", "title": "Namespace CPU share", "datasource": { @@ -277,11 +395,11 @@ data: "h": 9, "w": 12, "x": 0, - "y": 6 + "y": 5 }, "targets": [ { - "expr": "topk(8, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace))", + "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace))", "refId": "A" } ], @@ -307,7 +425,7 @@ data: } }, { - "id": 6, + "id": 8, "type": "piechart", "title": "Namespace memory share", "datasource": { @@ -318,11 +436,11 @@ data: "h": 9, "w": 12, "x": 12, - "y": 6 + "y": 5 }, "targets": [ { - "expr": "topk(8, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace))", + "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace))", "refId": "A" } ], @@ -348,7 +466,7 @@ data: } }, { - "id": 7, + "id": 9, "type": "timeseries", "title": "Node CPU usage (per node)", "datasource": { @@ -359,13 +477,13 @@ data: "h": 8, "w": 12, "x": 0, - "y": 15 + "y": 14 }, "targets": [ { - "expr": "avg(rate(node_cpu_seconds_total{mode!=\"idle\"}[5m])) by (instance) * 100", + "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))", "refId": "A", - "legendFormat": "{{instance}}" + "legendFormat": "{{node}}" } ], "fieldConfig": { @@ -385,7 +503,7 @@ data: } }, { - "id": 8, + "id": 10, "type": "timeseries", "title": "Node memory usage (per node)", "datasource": { @@ -396,13 +514,13 @@ data: "h": 8, "w": 12, "x": 12, - "y": 15 + "y": 14 }, "targets": [ { - "expr": "avg((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100) by (instance)", + "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", "refId": "A", - "legendFormat": "{{instance}}" + "legendFormat": "{{node}}" } ], "fieldConfig": { @@ -422,7 +540,7 @@ data: } }, { - "id": 9, + "id": 11, "type": "table", "title": "Key service availability", "datasource": { @@ -430,46 +548,23 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 23 + "y": 22 }, "targets": [ { - "expr": "max by (service) (up{service=~\"traefik|gitea|vault|victoria-metrics-single|grafana|alertmanager\"})", + "expr": "label_replace((sum by (deployment,namespace) (kube_deployment_status_replicas_available{deployment=~\"traefik|gitea|grafana\",namespace=~\"traefik|gitea|monitoring\"}) / sum by (deployment,namespace) (kube_deployment_spec_replicas{deployment=~\"traefik|gitea|grafana\",namespace=~\"traefik|gitea|monitoring\"})), \"service\", \"$1\", \"deployment\", \"(.*)\") or label_replace((sum by (statefulset,namespace) (kube_statefulset_status_replicas_ready{statefulset=~\"vault|alertmanager|victoria-metrics-single-server\",namespace=~\"vault|monitoring\"}) / sum by (statefulset,namespace) (kube_statefulset_status_replicas{statefulset=~\"vault|alertmanager|victoria-metrics-single-server\",namespace=~\"vault|monitoring\"})), \"service\", \"$1\", \"statefulset\", \"(.*)\")", "refId": "A" } ], "fieldConfig": { "defaults": { - "mappings": [ - { - "id": 0, - "type": 1, - "value": "0", - "text": "Down" - }, - { - "id": 1, - "type": 1, - "value": "1", - "text": "Up" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "red", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - } + "custom": { + "align": "auto" + }, + "unit": "percent" }, "overrides": [] }, @@ -478,22 +573,22 @@ data: } }, { - "id": 10, + "id": 12, "type": "table", - "title": "Failed pods (24h trend)", + "title": "Failed pods by namespace (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 23 + "y": 22 }, "targets": [ { - "expr": "topk(10, sum(increase(kube_pod_status_phase{phase=\"Failed\"}[24h])) by (namespace))", + "expr": "topk(10, sum by (namespace) (increase(kube_pod_status_phase{phase=\"Failed\"}[24h])))", "refId": "A" } ], @@ -508,9 +603,9 @@ data: } }, { - "id": 11, + "id": 13, "type": "timeseries", - "title": "Cluster network throughput", + "title": "Root filesystem usage per node", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -519,23 +614,18 @@ data: "h": 8, "w": 12, "x": 0, - "y": 31 + "y": 29 }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\"}[5m]))", + "expr": "avg by (node) (((label_replace(1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", "refId": "A", - "legendFormat": "Receive" - }, - { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]))", - "refId": "B", - "legendFormat": "Transmit" + "legendFormat": "{{node}}" } ], "fieldConfig": { "defaults": { - "unit": "Bps" + "unit": "percent" }, "overrides": [] }, @@ -550,9 +640,9 @@ data: } }, { - "id": 12, - "type": "timeseries", - "title": "Storage usage across nodes", + "id": 14, + "type": "bargauge", + "title": "Nodes closest to full root disks", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -561,40 +651,377 @@ data: "h": 8, "w": 12, "x": 12, - "y": 31 + "y": 29 }, "targets": [ { - "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}) * 100)", + "expr": "topk(8, avg by (node) (((label_replace(1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)))", "refId": "A" } ], "fieldConfig": { "defaults": { + "unit": "percent", + "min": 0, + "max": 100 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 15, + "type": "stat", + "title": "Astreae usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 37 + }, + "targets": [ + { + "expr": "(sum(longhorn_disk_usage_bytes{disk=~\"astreae-.*\"}) / sum(longhorn_disk_capacity_bytes{disk=~\"astreae-.*\"})) * 100", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, "unit": "percent" }, "overrides": [] }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 16, + "type": "stat", + "title": "Asteria usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 37 + }, + "targets": [ + { + "expr": "(sum(longhorn_disk_usage_bytes{disk=~\"asteria-.*\"}) / sum(longhorn_disk_capacity_bytes{disk=~\"asteria-.*\"})) * 100", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 17, + "type": "stat", + "title": "Astreae schedulable", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 37 + }, + "targets": [ + { + "expr": "(sum(longhorn_disk_capacity_bytes{disk=~\"astreae-.*\"}) - sum(longhorn_disk_usage_bytes{disk=~\"astreae-.*\"}) - sum(longhorn_disk_reservation_bytes{disk=~\"astreae-.*\"}))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bytesSI" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 18, + "type": "stat", + "title": "Asteria schedulable", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 37 + }, + "targets": [ + { + "expr": "(sum(longhorn_disk_capacity_bytes{disk=~\"asteria-.*\"}) - sum(longhorn_disk_usage_bytes{disk=~\"asteria-.*\"}) - sum(longhorn_disk_reservation_bytes{disk=~\"asteria-.*\"}))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bytesSI" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 19, + "type": "piechart", + "title": "Longhorn node readiness", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 44 + }, + "targets": [ + { + "expr": "sum(longhorn_node_status{condition=\"ready\"})", + "refId": "A", + "legendFormat": "Ready" + }, + { + "expr": "(longhorn_node_count_total - sum(longhorn_node_status{condition=\"ready\"}))", + "refId": "B", + "legendFormat": "Offline" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, "options": { "legend": { "displayMode": "list", - "placement": "bottom" + "placement": "right" + }, + "pieType": "donut", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false } } }, { - "id": 13, + "id": 20, + "type": "piechart", + "title": "Longhorn disk schedulability", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 44 + }, + "targets": [ + { + "expr": "sum(sum by (node,disk) (longhorn_disk_status{condition=\"schedulable\"}))", + "refId": "A", + "legendFormat": "Schedulable" + }, + { + "expr": "(count(sum by (node,disk) (longhorn_disk_status{condition=\"ready\"})) - sum(sum by (node,disk) (longhorn_disk_status{condition=\"schedulable\"})))", + "refId": "B", + "legendFormat": "Blocked" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "donut", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 21, "type": "text", "title": "About this dashboard", "gridPos": { - "h": 6, + "h": 5, "w": 24, "x": 0, - "y": 39 + "y": 51 }, "options": { - "content": "### Atlas at a glance\n- Raspberry Pi + Jetson hybrid cluster with Flux-managed GitOps\n- Metrics powered by VictoriaMetrics, visualized by Grafana\n- Login for SRE mode with pod-level drilldowns, alert routes, and storage health", - "mode": "markdown" + "mode": "markdown", + "content": "### Atlas at a glance\n- Flux-managed Pi + Jetson cluster with 20+ active nodes\n- Longhorn tiers: Astreae (3x replicas) & Asteria (2x replicas) tracked separately\n- Login for the SRE view with alert routing, Longhorn drilldowns, and workload burn rates" } } ], @@ -614,6 +1041,5 @@ data: }, "title": "Atlas Public Overview", "uid": "atlas-public", - "version": 3 + "version": 5 } - diff --git a/services/monitoring/grafana-dashboard-sre.yaml b/services/monitoring/grafana-dashboard-sre.yaml index b46c17a..d5d8dca 100644 --- a/services/monitoring/grafana-dashboard-sre.yaml +++ b/services/monitoring/grafana-dashboard-sre.yaml @@ -45,7 +45,7 @@ data: }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) / sum(kube_node_info) * 100", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) / count(kube_node_info) * 100", "refId": "A" } ], @@ -86,7 +86,8 @@ data: ], "fields": "", "values": false - } + }, + "textMode": "value" } }, { @@ -146,7 +147,8 @@ data: ], "fields": "", "values": false - } + }, + "textMode": "value" } }, { @@ -206,7 +208,8 @@ data: ], "fields": "", "values": false - } + }, + "textMode": "value" } }, { @@ -266,7 +269,8 @@ data: ], "fields": "", "values": false - } + }, + "textMode": "value" } }, { @@ -285,9 +289,9 @@ data: }, "targets": [ { - "expr": "avg(rate(node_cpu_seconds_total{mode!=\"idle\"}[5m])) by (instance) * 100", + "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))", "refId": "A", - "legendFormat": "{{instance}}" + "legendFormat": "{{node}}" } ], "fieldConfig": { @@ -322,9 +326,9 @@ data: }, "targets": [ { - "expr": "avg((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100) by (instance)", + "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", "refId": "A", - "legendFormat": "{{instance}}" + "legendFormat": "{{node}}" } ], "fieldConfig": { @@ -575,6 +579,5 @@ data: }, "title": "Atlas SRE Overview", "uid": "atlas-sre", - "version": 2 + "version": 4 } -