# services/monitoring/grafana-dashboard-overview.yaml apiVersion: v1 kind: ConfigMap metadata: name: grafana-dashboard-overview labels: grafana_dashboard: "1" data: atlas-overview.json: | { "uid": "atlas-overview", "title": "Atlas Overview", "annotations": { "list": [ { "builtIn": 1, "datasource": { "type": "datasource", "uid": "grafana" }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": false, "folderUid": "atlas-overview", "graphTooltip": 0, "links": [ { "title": "Pods dashboard", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": false }, { "title": "Nodes dashboard", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": false }, { "title": "Storage dashboard", "type": "dashboard", "dashboardUid": "atlas-storage", "keepTime": false } ], "panels": [ { "id": 1, "type": "stat", "title": "Running pods", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, "w": 4, "x": 0, "y": 0 }, "targets": [ { "expr": "sum(kube_pod_status_phase{phase=\"Running\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "none" }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" } }, { "id": 2, "type": "stat", "title": "Ready nodes", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, "w": 4, "x": 4, "y": 0 }, "targets": [ { "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "none" }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" } }, { "id": 3, "type": "stat", "title": "Control plane ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, "w": 4, "x": 8, "y": 0 }, "targets": [ { "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "none" }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" } }, { "id": 4, "type": "stat", "title": "Control plane schedulable", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, "w": 4, "x": 12, "y": 0 }, "targets": [ { "expr": "sum(kube_node_spec_unschedulable{node=~\"titan-0a|titan-0b|titan-0c\"} == 0)", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "none" }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" } }, { "id": 5, "type": "stat", "title": "Problem pods", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, "w": 4, "x": 16, "y": 0 }, "targets": [ { "expr": "sum(kube_pod_status_phase{phase!~\"Running|Succeeded\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "none" }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" } }, { "id": 6, "type": "stat", "title": "Stuck terminating", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, "w": 4, "x": 20, "y": 0 }, "targets": [ { "expr": "sum(((time() - kube_pod_deletion_timestamp) > 600))", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "none" }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" } }, { "id": 7, "type": "stat", "title": "Hottest node: CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, "w": 4, "x": 24, "y": 0 }, "targets": [ { "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))", "refId": "A", "legendFormat": "{{node}}" } ], "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "percentage", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 } ] }, "unit": "percent" }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value_and_name" } }, { "id": 8, "type": "stat", "title": "Hottest node: RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, "w": 4, "x": 28, "y": 0 }, "targets": [ { "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", "refId": "A", "legendFormat": "{{node}}" } ], "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "percentage", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 } ] }, "unit": "percent" }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value_and_name" } }, { "id": 9, "type": "piechart", "title": "Namespace CPU share", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, "w": 12, "x": 0, "y": 5 }, "targets": [ { "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace))", "refId": "A", "legendFormat": "{{namespace}}" } ], "fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] }, "options": { "legend": { "displayMode": "list", "placement": "right" }, "pieType": "pie", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false } } }, { "id": 10, "type": "piechart", "title": "Namespace RAM share", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, "w": 12, "x": 12, "y": 5 }, "targets": [ { "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace))", "refId": "A", "legendFormat": "{{namespace}}" } ], "fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] }, "options": { "legend": { "displayMode": "list", "placement": "right" }, "pieType": "pie", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false } } }, { "id": 11, "type": "timeseries", "title": "Cluster node CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }, "targets": [ { "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))", "refId": "A", "legendFormat": "{{node}}" } ], "fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] }, "options": { "legend": { "displayMode": "table", "placement": "right", "calcs": [ "last" ] }, "tooltip": { "mode": "multi" } } }, { "id": 12, "type": "timeseries", "title": "Cluster node RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }, "targets": [ { "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", "refId": "A", "legendFormat": "{{node}}" } ], "fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] }, "options": { "legend": { "displayMode": "table", "placement": "right", "calcs": [ "last" ] }, "tooltip": { "mode": "multi" } } }, { "id": 13, "type": "table", "title": "Problem pods (details)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 }, "targets": [ { "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] }, "options": { "showHeader": true }, "transformations": [ { "id": "labelsToFields", "options": {} } ] }, { "id": 14, "type": "table", "title": "Terminating >10m", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 }, "targets": [ { "expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info", "refId": "A" } ], "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] }, "options": { "showHeader": true }, "transformations": [ { "id": "labelsToFields", "options": {} }, { "id": "filterByValue", "options": { "match": "Value", "operator": "gt", "value": 600 } } ] }, { "id": 15, "type": "timeseries", "title": "Control plane CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 30 }, "targets": [ { "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", "refId": "A", "legendFormat": "{{node}}" } ], "fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] }, "options": { "legend": { "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "multi" } } }, { "id": 16, "type": "timeseries", "title": "Control plane RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 30 }, "targets": [ { "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", "refId": "A", "legendFormat": "{{node}}" } ], "fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] }, "options": { "legend": { "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "multi" } } }, { "id": 17, "type": "timeseries", "title": "Root filesystem usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 37 }, "targets": [ { "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)", "refId": "A", "legendFormat": "{{node}}" } ], "fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] }, "options": { "legend": { "displayMode": "table", "placement": "right", "calcs": [ "last" ] }, "tooltip": { "mode": "multi" } }, "timeFrom": "7d" }, { "id": 18, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 37 }, "targets": [ { "expr": "topk(8, avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info))", "refId": "A", "legendFormat": "{{node}}" } ], "fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100, "thresholds": { "mode": "percentage", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 50 }, { "color": "orange", "value": 70 }, { "color": "red", "value": 85 } ] } }, "overrides": [] }, "options": { "displayMode": "gradient", "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false } } }, { "id": 19, "type": "stat", "title": "Astreae usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 6, "w": 6, "x": 0, "y": 45 }, "targets": [ { "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "percentage", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 } ] }, "unit": "percent" }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" } }, { "id": 20, "type": "stat", "title": "Asteria usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 6, "w": 6, "x": 6, "y": 45 }, "targets": [ { "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "percentage", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 85 } ] }, "unit": "percent" }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" } }, { "id": 21, "type": "stat", "title": "Astreae free", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 6, "w": 6, "x": 12, "y": 45 }, "targets": [ { "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "bytesSI" }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" } }, { "id": 22, "type": "stat", "title": "Asteria free", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 6, "w": 6, "x": 18, "y": 45 }, "targets": [ { "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "rgba(115, 115, 115, 1)", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "bytesSI" }, "overrides": [] }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "value" } }, { "id": 23, "type": "table", "title": "Astreae per-node usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 51 }, "targets": [ { "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)", "refId": "A" } ], "fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] }, "options": { "showHeader": true }, "transformations": [ { "id": "labelsToFields", "options": {} } ] }, { "id": 24, "type": "table", "title": "Asteria per-node usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 51 }, "targets": [ { "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)", "refId": "A" } ], "fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] }, "options": { "showHeader": true }, "transformations": [ { "id": "labelsToFields", "options": {} } ] }, { "id": 25, "type": "text", "title": "About this dashboard", "gridPos": { "h": 5, "w": 24, "x": 0, "y": 59 }, "datasource": null, "options": { "mode": "markdown", "content": "### Atlas at a glance\n- Summary metrics above pull from dedicated Pods/Nodes/Storage dashboards\n- Click the dashboard links in the header to drill into details\n- Anonymous users see this overview; login unlocks the detailed folders" } } ], "refresh": "30s", "schemaVersion": 39, "style": "dark", "tags": [ "atlas", "overview" ], "templating": { "list": [] }, "time": { "from": "now-12h", "to": "now" } }