From 06337f2b9d4c05f923169e9e232700a6baf0c35c Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Fri, 14 Nov 2025 00:02:59 -0300
Subject: [PATCH 01/71] monitoring: add grafana and alertmanager

---
 clusters/atlas/flux-system/gotk-sync.yaml     |   2 +-
 services/monitoring/README.md                 |  15 ++
 .../monitoring/grafana-dashboard-public.yaml  | 227 ++++++++++++++++++
 .../monitoring/grafana-dashboard-sre.yaml     | 223 +++++++++++++++++
 services/monitoring/helmrelease.yaml          | 131 ++++++++++
 services/monitoring/kustomization.yaml        |   2 +
 6 files changed, 599 insertions(+), 1 deletion(-)
 create mode 100644 services/monitoring/README.md
 create mode 100644 services/monitoring/grafana-dashboard-public.yaml
 create mode 100644 services/monitoring/grafana-dashboard-sre.yaml

diff --git a/clusters/atlas/flux-system/gotk-sync.yaml b/clusters/atlas/flux-system/gotk-sync.yaml
index 473ab99..46f65d3 100644
--- a/clusters/atlas/flux-system/gotk-sync.yaml
+++ b/clusters/atlas/flux-system/gotk-sync.yaml
@@ -8,7 +8,7 @@ metadata:
 spec:
   interval: 1m0s
   ref:
-    branch: main
+    branch: feature/atlas-monitoring
   secretRef:
     name: flux-system-gitea
   url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
diff --git a/services/monitoring/README.md b/services/monitoring/README.md
new file mode 100644
index 0000000..74baf08
--- /dev/null
+++ b/services/monitoring/README.md
@@ -0,0 +1,15 @@
+# services/monitoring
+
+## Grafana admin secret
+
+The Grafana Helm release expects a pre-existing secret named `grafana-admin`
+in the `monitoring` namespace. Create or rotate it with:
+
+```bash
+kubectl create secret generic grafana-admin \
+  --namespace monitoring \
+  --from-literal=admin-user=admin \
+  --from-literal=admin-password='REPLACE_ME'
+```
+
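+When rotating credentials, update this secret; `kubectl create` fails if it already exists, so delete it first or pipe `--dry-run=client -o yaml` into `kubectl apply -f -`.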
diff --git a/services/monitoring/grafana-dashboard-public.yaml b/services/monitoring/grafana-dashboard-public.yaml
new file mode 100644
index 0000000..db5d6c1
--- /dev/null
+++ b/services/monitoring/grafana-dashboard-public.yaml
@@ -0,0 +1,227 @@
+# services/monitoring/grafana-dashboard-public.yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-public
+  labels:
+    grafana_dashboard: "1"
+data:
+  atlas-public-overview.json: |
+    {
+      "annotations": {
+        "list": [
+          {
+            "builtIn": 1,
+            "datasource": {
+              "type": "datasource",
+              "uid": "grafana"
+            },
+            "enable": true,
+            "hide": true,
+            "iconColor": "rgba(0, 211, 255, 1)",
+            "name": "Annotations & Alerts",
+            "type": "dashboard"
+          }
+        ]
+      },
+      "editable": false,
+      "fiscalYearStartMonth": 0,
+      "graphTooltip": 0,
+      "id": null,
+      "links": [],
+      "liveNow": false,
+      "panels": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  }
+                ]
+              },
+              "unit": "none"
+            },
+            "overrides": []
+          },
+          "gridPos": {
+            "h": 7,
+            "w": 6,
+            "x": 0,
+            "y": 0
+          },
+          "id": 1,
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "auto",
+            "orientation": "auto",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "text": {},
+            "textMode": "auto"
+          },
+          "pluginVersion": "10.4.0",
+          "targets": [
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "atlas-vm"
+              },
+              "editorMode": "code",
+              "expr": "sum(kube_pod_status_phase{phase=\"Running\"})",
+              "legendFormat": "",
+              "range": true,
+              "refId": "A"
+            }
+          ],
+          "title": "Running pods",
+          "type": "stat"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "description": "Aggregated CPU usage across all schedulable nodes.",
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "continuous-BlYlRd"
+              },
+              "mappings": [],
+              "max": 100,
+              "min": 0,
+              "thresholds": {
+                "mode": "percentage",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 60
+                  },
+                  {
+                    "color": "red",
+                    "value": 85
+                  }
+                ]
+              },
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "gridPos": {
+            "h": 7,
+            "w": 6,
+            "x": 6,
+            "y": 0
+          },
+          "id": 2,
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "auto",
+            "orientation": "auto",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "text": {},
+            "textMode": "auto"
+          },
+          "targets": [
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "atlas-vm"
+              },
+              "expr": "avg(100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100))",
+              "legendFormat": "",
+              "refId": "A"
+            }
+          ],
+          "title": "Average node CPU",
+          "type": "stat"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 7
+          },
+          "id": 3,
+          "options": {
+            "legend": {
+              "calcs": [],
+              "displayMode": "list",
+              "placement": "bottom",
+              "showLegend": true
+            },
+            "tooltip": {
+              "mode": "single",
+              "sort": "none"
+            }
+          },
+          "targets": [
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "atlas-vm"
+              },
+              "expr": "sum(rate(container_cpu_usage_seconds_total{namespace!=\"\", container!=\"\"}[5m])) by (namespace)",
+              "legendFormat": "{{namespace}}",
+              "refId": "A"
+            }
+          ],
+          "title": "Namespace CPU (5m avg)",
+          "type": "timeseries"
+        }
+      ],
+      "refresh": "30s",
+      "schemaVersion": 39,
+      "style": "dark",
+      "tags": [
+        "atlas",
+        "public"
+      ],
+      "templating": {
+        "list": []
+      },
+      "time": {
+        "from": "now-6h",
+        "to": "now"
+      },
+      "timepicker": {},
+      "timezone": "",
+      "title": "Atlas Public Overview",
+      "uid": "atlas-public",
+      "version": 1,
+      "weekStart": ""
+    }
diff --git a/services/monitoring/grafana-dashboard-sre.yaml b/services/monitoring/grafana-dashboard-sre.yaml
new file mode 100644
index 0000000..12995af
--- /dev/null
+++ b/services/monitoring/grafana-dashboard-sre.yaml
@@ -0,0 +1,223 @@
+# services/monitoring/grafana-dashboard-sre.yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-sre
+  labels:
+    grafana_dashboard: "1"
+data:
+  atlas-sre-overview.json: |
+    {
+      "annotations": {
+        "list": [
+          {
+            "builtIn": 1,
+            "datasource": {
+              "type": "datasource",
+              "uid": "grafana"
+            },
+            "enable": true,
+            "hide": true,
+            "iconColor": "rgba(0, 211, 255, 1)",
+            "name": "Annotations & Alerts",
+            "type": "dashboard"
+          }
+        ]
+      },
+      "editable": true,
+      "fiscalYearStartMonth": 0,
+      "graphTooltip": 0,
+      "links": [],
+      "panels": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "description": "Percentage of Ready nodes.",
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "thresholds"
+              },
+              "mappings": [],
+              "max": 100,
+              "min": 0,
+              "thresholds": {
+                "mode": "percentage",
+                "steps": [
+                  {
+                    "color": "red",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 90
+                  }
+                ]
+              },
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "gridPos": {
+            "h": 7,
+            "w": 6,
+            "x": 0,
+            "y": 0
+          },
+          "id": 10,
+          "options": {
+            "colorMode": "value",
+            "graphMode": "none",
+            "justifyMode": "center",
+            "orientation": "auto",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          },
+          "targets": [
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "atlas-vm"
+              },
+              "expr": "avg(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) * 100",
+              "refId": "A"
+            }
+          ],
+          "title": "Ready nodes",
+          "type": "stat"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 7,
+            "w": 6,
+            "x": 6,
+            "y": 0
+          },
+          "id": 11,
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          },
+          "targets": [
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "atlas-vm"
+              },
+              "expr": "sum by (node)(node_filesystem_avail_bytes{mountpoint=\"/\"})",
+              "legendFormat": "{{node}}",
+              "refId": "A"
+            }
+          ],
+          "title": "Free root filesystem bytes",
+          "type": "timeseries"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 10,
+            "w": 12,
+            "x": 0,
+            "y": 7
+          },
+          "id": 12,
+          "options": {
+            "legend": {
+              "calcs": [],
+              "displayMode": "list",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "single"
+            }
+          },
+          "targets": [
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "atlas-vm"
+              },
+              "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"crypto\",container!=\"\"}[5m])) by (pod)",
+              "legendFormat": "{{pod}}",
+              "refId": "A"
+            }
+          ],
+          "title": "Crypto namespace CPU usage",
+          "type": "timeseries"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 0,
+            "y": 17
+          },
+          "id": 13,
+          "options": {
+            "displayMode": "gradient",
+            "orientation": "horizontal",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "showUnfilled": false
+          },
+          "targets": [
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "atlas-vm"
+              },
+              "expr": "count(sum(kube_pod_status_phase{phase=\"Failed\"}) by (namespace) > 0) or vector(0)",
+              "legendFormat": "",
+              "refId": "A"
+            }
+          ],
+          "title": "Namespaces with failed pods",
+          "type": "bargauge"
+        }
+      ],
+      "schemaVersion": 39,
+      "style": "dark",
+      "tags": [
+        "atlas",
+        "sre"
+      ],
+      "templating": {
+        "list": []
+      },
+      "time": {
+        "from": "now-12h",
+        "to": "now"
+      },
+      "timepicker": {},
+      "title": "Atlas SRE Overview",
+      "uid": "atlas-sre",
+      "version": 1
+    }
diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index 22bc2b1..3341e9d 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -210,3 +210,134 @@ spec:
                 - action: keep
                   source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app_kubernetes_io_part_of]
                   regex: flux-system;flux
+
+---
+
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: grafana
+  namespace: monitoring
+spec:
+  interval: 15m
+  chart:
+    spec:
+      chart: grafana
+      version: "~8.5.0"
+      sourceRef:
+        kind: HelmRepository
+        name: grafana
+        namespace: flux-system
+  values:
+    admin:
+      existingSecret: grafana-admin
+      userKey: admin-user
+      passwordKey: admin-password
+    persistence:
+      enabled: true
+      size: 20Gi
+      storageClassName: astreae
+    service:
+      type: ClusterIP
+    env:
+      - name: GF_AUTH_ANONYMOUS_ENABLED
+        value: "true"
+      - name: GF_AUTH_ANONYMOUS_ORG_ROLE
+        value: Viewer
+      - name: GF_SECURITY_ALLOW_EMBEDDING
+        value: "true"
+    grafana.ini:
+      server:
+        domain: reporting.bstein.dev
+        root_url: https://reporting.bstein.dev/
+      auth.anonymous:
+        hide_version: true
+      users:
+        default_theme: dark
+    ingress:
+      enabled: true
+      ingressClassName: traefik
+      annotations:
+        cert-manager.io/cluster-issuer: letsencrypt
+      hosts:
+        - reporting.bstein.dev
+      tls:
+        - secretName: grafana-reporting-tls
+          hosts:
+            - reporting.bstein.dev
+    datasources:
+      datasources.yaml:
+        apiVersion: 1
+        datasources:
+          - name: VictoriaMetrics
+            type: prometheus
+            access: proxy
+            url: http://victoria-metrics-single-server:8428
+            isDefault: true
+            uid: atlas-vm  # must match the datasource uid the dashboard ConfigMaps reference
+            jsonData: {timeInterval: "15s"}  # flow style keeps this hunk's line count unchanged
+    dashboardProviders:
+      dashboardproviders.yaml:
+        apiVersion: 1
+        providers:
+          - name: public
+            orgId: 1
+            folder: Atlas Public
+            type: file
+            disableDeletion: false
+            allowUiUpdates: false
+            options:
+              path: /var/lib/grafana/dashboards/public
+          - name: sre
+            orgId: 1
+            folder: Atlas SRE
+            type: file
+            disableDeletion: false
+            allowUiUpdates: true
+            options:
+              path: /var/lib/grafana/dashboards/sre
+    dashboardsConfigMaps:
+      - configMapName: grafana-dashboard-public
+        folder: public
+      - configMapName: grafana-dashboard-sre
+        folder: sre
+
+---
+
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: alertmanager
+  namespace: monitoring
+spec:
+  interval: 15m
+  chart:
+    spec:
+      chart: alertmanager
+      version: "~1.9.0"
+      sourceRef:
+        kind: HelmRepository
+        name: prometheus
+        namespace: flux-system
+  values:
+    ingress:
+      enabled: true
+      ingressClassName: traefik
+      annotations:
+        cert-manager.io/cluster-issuer: letsencrypt
+      hosts:
+        - alerts.bstein.dev
+      tls:
+        - secretName: alerts-bstein-dev-tls
+          hosts:
+            - alerts.bstein.dev
+    config:
+      global:
+        resolve_timeout: 5m
+      route:
+        receiver: default
+        group_wait: 30s
+        group_interval: 5m
+        repeat_interval: 2h
+      receivers:
+        - name: default
diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml
index 036afa3..bb321b5 100644
--- a/services/monitoring/kustomization.yaml
+++ b/services/monitoring/kustomization.yaml
@@ -5,4 +5,6 @@ namespace: monitoring
 resources:
   - namespace.yaml
   - rbac.yaml
+  - grafana-dashboard-public.yaml
+  - grafana-dashboard-sre.yaml
   - helmrelease.yaml

From c2cb9011024806df0449c3c102308ade6a6cb1b8 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Fri, 14 Nov 2025 08:29:59 -0300
Subject: [PATCH 02/71] monitoring: fix grafana values

---
 services/monitoring/helmrelease.yaml | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index 3341e9d..9cac705 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -260,7 +260,9 @@ spec:
       annotations:
         cert-manager.io/cluster-issuer: letsencrypt
       hosts:
-        - reporting.bstein.dev
+        - host: reporting.bstein.dev
+          paths:
+            - /
       tls:
         - secretName: grafana-reporting-tls
           hosts:
@@ -297,10 +299,8 @@ spec:
             options:
               path: /var/lib/grafana/dashboards/sre
     dashboardsConfigMaps:
-      - configMapName: grafana-dashboard-public
-        folder: public
-      - configMapName: grafana-dashboard-sre
-        folder: sre
+      public: grafana-dashboard-public
+      sre: grafana-dashboard-sre
 
 ---
 
@@ -326,7 +326,9 @@ spec:
       annotations:
         cert-manager.io/cluster-issuer: letsencrypt
       hosts:
-        - alerts.bstein.dev
+        - host: alerts.bstein.dev
+          paths:
+            - /
       tls:
         - secretName: alerts-bstein-dev-tls
           hosts:

From 465103a57e30d63c2e94f276762b0d8972ec80e7 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Fri, 14 Nov 2025 08:33:53 -0300
Subject: [PATCH 03/71] grafana: fix dashboard provider list

---
 services/monitoring/helmrelease.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index 9cac705..e9b6154 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -287,7 +287,7 @@ spec:
             folder: Atlas Public
             type: file
             disableDeletion: false
-            allowUiUpdates: false
+            editable: false
             options:
               path: /var/lib/grafana/dashboards/public
           - name: sre
@@ -295,7 +295,7 @@ spec:
             folder: Atlas SRE
             type: file
             disableDeletion: false
-            allowUiUpdates: true
+            editable: true
             options:
               path: /var/lib/grafana/dashboards/sre
     dashboardsConfigMaps:

From 394fcf2ee4a7131f38add30029c432ef287c7c8e Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Fri, 14 Nov 2025 08:37:46 -0300
Subject: [PATCH 04/71] grafana: use string host format

---
 services/monitoring/helmrelease.yaml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index e9b6154..91cf0ce 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -260,9 +260,8 @@ spec:
       annotations:
         cert-manager.io/cluster-issuer: letsencrypt
       hosts:
-        - host: reporting.bstein.dev
-          paths:
-            - /
+        - reporting.bstein.dev
+      path: /
       tls:
         - secretName: grafana-reporting-tls
           hosts:

From 418329e17337522bbc27fc7e1e71fb3d061f2278 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Fri, 14 Nov 2025 08:51:09 -0300
Subject: [PATCH 05/71] monitoring: fix ingress and env formats

---
 services/monitoring/helmrelease.yaml | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index 91cf0ce..b176c64 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -240,12 +240,9 @@ spec:
     service:
       type: ClusterIP
     env:
-      - name: GF_AUTH_ANONYMOUS_ENABLED
-        value: "true"
-      - name: GF_AUTH_ANONYMOUS_ORG_ROLE
-        value: Viewer
-      - name: GF_SECURITY_ALLOW_EMBEDDING
-        value: "true"
+      GF_AUTH_ANONYMOUS_ENABLED: "true"
+      GF_AUTH_ANONYMOUS_ORG_ROLE: Viewer
+      GF_SECURITY_ALLOW_EMBEDDING: "true"
     grafana.ini:
       server:
         domain: reporting.bstein.dev
@@ -327,7 +324,8 @@ spec:
       hosts:
         - host: alerts.bstein.dev
           paths:
-            - /
+            - path: /
+              pathType: Prefix
       tls:
         - secretName: alerts-bstein-dev-tls
           hosts:

From 3cfe6393872ac21a6a30163480fb8cbb19518226 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Fri, 14 Nov 2025 19:13:40 -0300
Subject: [PATCH 06/71] monitoring: fix domain

---
 scripts/styx_prep_nvme_luks.sh       | 575 +++++++++++++++++++++++++++
 services/monitoring/helmrelease.yaml |  12 +-
 2 files changed, 581 insertions(+), 6 deletions(-)
 create mode 100755 scripts/styx_prep_nvme_luks.sh

diff --git a/scripts/styx_prep_nvme_luks.sh b/scripts/styx_prep_nvme_luks.sh
new file mode 100755
index 0000000..d5ea0c5
--- /dev/null
+++ b/scripts/styx_prep_nvme_luks.sh
@@ -0,0 +1,575 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# --- CONFIG (override via environment, or edit if needed) ---
+# Leave NVME empty → script will auto-detect the SSK dock.
+NVME="${NVME:-}"
+FLAVOR="${FLAVOR:-desktop}"
+# Persistent cache so the image survives reboots.
+IMG_DIR="${IMG_DIR:-/var/cache/styx-rpi}"
+IMG_FILE="${IMG_FILE:-ubuntu-24.04.3-preinstalled-${FLAVOR}-arm64+raspi.img}"
+IMG_BOOT_MNT="${IMG_BOOT_MNT:-/mnt/img-boot}"
+IMG_ROOT_MNT="${IMG_ROOT_MNT:-/mnt/img-root}"
+TGT_ROOT="/mnt/target-root"
+TGT_BOOT="/mnt/target-boot"
+
+STYX_USER="${STYX_USER:-styx}"                 # login account name; env-overridable like the values above
+STYX_HOSTNAME="${STYX_HOSTNAME:-titan-ag}"
+STYX_PASS="${STYX_PASS:-TempPass#123}"  # export STYX_PASS to avoid this committed default; will be forced to change on first login via cloud-init
+SSH_PUBKEY="${SSH_PUBKEY:-ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOb8oMX6u0z3sH/p/WBGlvPXXdbGETCKzWYwR/dd6fZb titan-bastion}"
+
+# Video / input prefs (appended to the kernel command line)
+DSI_FLAGS="video=DSI-1:800x480@60D video=HDMI-A-1:off video=HDMI-A-2:off"
+
+# --- Helpers: error exit, tool presence check, privilege escalation ---
+fatal(){ echo "ERROR: $*" >&2; exit 1; }  # print "ERROR: <args>" to stderr and exit with status 1
+need(){ command -v "$1" >/dev/null || fatal "Missing tool: $1"; }  # abort via fatal unless command $1 is found
+
+require_root(){ [[ $EUID -eq 0 ]] || exec sudo -E "$0" "$@"; }  # not root → re-exec this script under sudo -E with the args passed to this function
+
+part() {  # print the device path of partition number $1 on $NVME
+  local n="$1"
+  if [[ "$NVME" =~ [0-9]$ ]]; then  # disk path ends in a digit → insert "p" (e.g. /dev/nvme0n1 → /dev/nvme0n1p1)
+    echo "${NVME}p${n}"
+  else
+    echo "${NVME}${n}"  # otherwise append the number directly (e.g. /dev/sda → /dev/sda1)
+  fi
+}
+
+auto_detect_target_disk() {
+  # Resolve the target disk into NVME: validate a user-supplied value, otherwise look for the SSK USB dock.
+  if [[ -n "${NVME:-}" ]]; then
+    [[ -b "$NVME" ]] || fatal "NVME='$NVME' is not a block device"
+    return
+  fi
+
+  # Prefer stable by-id symlinks
+  local byid
+  byid=$(ls -1 /dev/disk/by-id/usb-SSK* 2>/dev/null | head -n1 || true)
+  if [[ -n "$byid" ]]; then
+    NVME=$(readlink -f "$byid")
+  else
+    # Heuristic via lsblk -S; awk has no /re/i flag, so upper-case vendor/model before matching SSK/Ingram/Storage
+    NVME=$(lsblk -S -p -o NAME,TRAN,VENDOR,MODEL | \
+             awk '/ usb / && (toupper($3) ~ /SSK|INGRAM/ || toupper($4) ~ /SSK|STORAGE/){print $1; exit}')
+  fi
+
+  [[ -n "${NVME:-}" && -b "$NVME" ]] || fatal "Could not auto-detect SSK USB NVMe dock. Export NVME=/dev/sdX and re-run."
+  echo "Auto-detected target disk: $NVME"
+}
+
+preflight_cleanup() {  # best-effort teardown of leftovers from a previous run; every step tolerates failure
+  local img="$IMG_DIR/$IMG_FILE"  # path of the cached image whose loop devices get detached
+
+  # 1) Unmount image mountpoints and detach only loops for this IMG
+  umount -lf "$IMG_BOOT_MNT" "$IMG_ROOT_MNT" 2>/dev/null || true
+  # losetup -j exits non-zero if no association → tolerate it
+  { losetup -j "$img" | cut -d: -f1 | xargs -r losetup -d; } 2>/dev/null || true
+
+  # 2) Unmount our target mounts
+  umount -lf "$TGT_ROOT/boot/firmware" "$TGT_BOOT" "$TGT_ROOT" 2>/dev/null || true
+
+  # 3) Unmount the actual target partitions if mounted anywhere (tolerate 'not found')
+  for p in "$(part 1)" "$(part 2)"; do
+    # findmnt returns 1 when no match → capture and iterate if any
+    while read -r mnt; do
+      [ -n "$mnt" ] && umount -lf "$mnt" 2>/dev/null || true
+    done < <(findmnt -rno TARGET -S "$p" 2>/dev/null || true)
+  done
+
+  # 4) Close dm-crypt mapping named "cryptroot" (if it exists); dmsetup is the forced fallback
+  cryptsetup luksClose cryptroot 2>/dev/null || true
+  dmsetup remove -f cryptroot 2>/dev/null || true
+
+  # 5) Let udev settle (skipped when udevadm is not installed)
+  command -v udevadm >/dev/null && udevadm settle || true
+}
+
+guard_target_device() {
+  # Refuse to operate if NVME appears to be the disk backing the running system's /
+  local root_src root_disk
+  root_src=$(findmnt -no SOURCE /)  # device currently mounted at /
+  root_disk=$(lsblk -no pkname "$root_src" 2>/dev/null || true)  # parent kernel name; empty on failure, which skips the guard
+  if [[ -n "$root_disk" && "/dev/$root_disk" == "$NVME" ]]; then  # NOTE(review): for a mapper/LVM root, pkname may not be the whole disk — confirm
+    fatal "Refusing to operate on system disk ($NVME). Pick the external NVMe."
+  fi
+}
+
+need_host_fido2() {  # exit 1 with per-distro install hints when fido2-token is not available on the host
+  if ! command -v fido2-token >/dev/null 2>&1; then
+    echo "Host is missing fido2-token. On Arch: sudo pacman -S libfido2"  # hints go to stdout, unlike fatal()
+    echo "On Debian/Ubuntu host: sudo apt-get install fido2-tools"
+    exit 1
+  fi
+}
+
+ensure_image() {  # make sure the decompressed image exists at $IMG_DIR/$IMG_FILE, downloading/decompressing into the cache if needed
+  mkdir -p "$IMG_DIR"
+  chmod 755 "$IMG_DIR"
+
+  local BASE="https://cdimage.ubuntu.com/releases/noble/release"
+  local XZ="ubuntu-24.04.3-preinstalled-${FLAVOR}-arm64+raspi.img.xz"
+
+  # If the decompressed .img is missing, fetch/decompress into the cache.
+  if [[ ! -f "$IMG_DIR/$IMG_FILE" ]]; then
+    need curl  # Arch: pacman -S curl xz   |  Ubuntu: apt-get install curl xz-utils (unxz/xz is checked below)
+    if [[ ! -f "$IMG_DIR/$XZ" ]]; then
+      echo "Fetching image…"
+      curl -fL -o "$IMG_DIR/$XZ.part" "$BASE/$XZ"; mv -f "$IMG_DIR/$XZ.part" "$IMG_DIR/$XZ"  # rename only after a complete download
+    fi
+    echo "Decompressing to $IMG_DIR/$IMG_FILE …"
+    # Keep the .xz for future runs; stream-decompress to a .part file so an interrupted run never leaves a truncated .img in the cache
+    if command -v unxz >/dev/null 2>&1; then
+      unxz -c "$IMG_DIR/$XZ" > "$IMG_DIR/$IMG_FILE.part"
+    else
+      need xz
+      xz -dc "$IMG_DIR/$XZ" > "$IMG_DIR/$IMG_FILE.part"
+    fi
+    mv -f "$IMG_DIR/$IMG_FILE.part" "$IMG_DIR/$IMG_FILE"; sync
+  else
+    echo "Using cached image: $IMG_DIR/$IMG_FILE"
+  fi
+}
+
+ensure_binfmt_aarch64(){  # host prerequisites for running ARM64 binaries: binfmt_misc handler plus a static qemu-aarch64
+  # Register qemu-aarch64 for chrooted ARM64 apt runs
+  if [[ ! -e /proc/sys/fs/binfmt_misc/qemu-aarch64 ]]; then
+    need docker
+    systemctl enable --now docker >/dev/null 2>&1 || true
+    docker run --rm --privileged tonistiigi/binfmt --install arm64 >/dev/null
+  fi
+  if [[ ! -x /usr/local/bin/qemu-aarch64-static ]]; then  # copy the static emulator out of a throwaway container
+    need docker; docker rm -f qemu-static >/dev/null 2>&1 || true
+    docker create --name qemu-static docker.io/multiarch/qemu-user-static:latest >/dev/null
+    docker cp qemu-static:/usr/bin/qemu-aarch64-static /usr/local/bin/
+    chmod 755 /usr/local/bin/qemu-aarch64-static  # install(1) refuses to copy a file onto itself, so just set the mode
+    docker rm qemu-static >/dev/null
+  fi
+}
+
+open_image() {  # loop-attach the cached image, mount its two partitions read-only and sanity-check their contents; sets LOOP
+  [[ -r "$IMG_DIR/$IMG_FILE" ]] || fatal "Image not found: $IMG_DIR/$IMG_FILE"
+  mkdir -p "$IMG_BOOT_MNT" "$IMG_ROOT_MNT"
+
+  # Pre-clean: detach any previous loop(s) for this image (tolerate absence)
+  umount -lf "$IMG_BOOT_MNT" 2>/dev/null || true
+  umount -lf "$IMG_ROOT_MNT" 2>/dev/null || true
+  # If no loop is attached, losetup -j returns non-zero → swallow it
+  mapfile -t OLD < <({ losetup -j "$IMG_DIR/$IMG_FILE" | cut -d: -f1; } 2>/dev/null || true)
+  for L in "${OLD[@]:-}"; do losetup -d "$L" 2>/dev/null || true; done
+  command -v udevadm >/dev/null && udevadm settle || true
+
+  # Attach with partition scan
+  LOOP=$(losetup --find --show --partscan "$IMG_DIR/$IMG_FILE") || fatal "losetup failed"
+  # Cleanup on exit, registered before any check below can abort so the loop never leaks: unmount, then detach
+  trap 'umount -lf "'"$IMG_BOOT_MNT"'" "'"$IMG_ROOT_MNT"'" 2>/dev/null || true; losetup -d "'"$LOOP"'" 2>/dev/null || true' EXIT
+  # Wait (up to ~2.5s) for both partition nodes to exist
+  command -v udevadm >/dev/null && udevadm settle || true
+  for _ in {1..25}; do
+    [[ -b "${LOOP}p1" && -b "${LOOP}p2" ]] && break
+    sleep 0.1
+    command -v udevadm >/dev/null && udevadm settle || true
+  done
+  [[ -b "${LOOP}p1" && -b "${LOOP}p2" ]] || fatal "loop partitions not present for $LOOP"
+
+  # Mount image partitions read-only
+  mount -o ro "${LOOP}p1" "$IMG_BOOT_MNT"
+  mount -o ro "${LOOP}p2" "$IMG_ROOT_MNT"
+
+  # Sanity checks without using failing pipelines
+  # start*.elf must exist
+  if ! compgen -G "$IMG_BOOT_MNT/start*.elf" > /dev/null; then
+    fatal "start*.elf not found in image"
+  fi
+  # vmlinuz-* must exist
+  if ! compgen -G "$IMG_ROOT_MNT/boot/vmlinuz-*" > /dev/null; then
+    fatal "vmlinuz-* not found in image root"
+  fi
+}
+
+confirm_and_wipe(){  # DESTRUCTIVE: after an explicit typed confirmation, wipe $NVME and lay out a fresh two-partition GPT
+  lsblk -o NAME,SIZE,MODEL,TRAN,LABEL "$NVME"  # show the operator what is about to be erased
+  read -rp "Type EXACTLY 'WIPE' to destroy ALL DATA on $NVME: " ACK
+  [[ "$ACK" == "WIPE" ]] || fatal "Aborted"
+  wipefs -a "$NVME"  # clear existing filesystem/RAID signatures
+  sgdisk -Zo "$NVME"  # zap old partition tables and write a new empty GPT
+  # GPT: 1: 1MiB..513MiB vfat ESP; 2: rest LUKS
+  parted -s "$NVME" mklabel gpt \
+    mkpart system-boot fat32 1MiB 513MiB set 1 esp on \
+    mkpart cryptroot 513MiB 100%
+  partprobe "$NVME"; sleep 1  # re-read the table, then a short pause before using the new partition nodes
+  mkfs.vfat -F32 -n system-boot "$(part 1)"  # label must match the LABEL=system-boot fstab entry written later
+}
+
+setup_luks(){  # format partition 2 as LUKS2, open it as /dev/mapper/cryptroot and create an ext4 filesystem inside
+  echo "Create LUKS2 on $(part 2) (you will be prompted for a passphrase; keep it as fallback)"
+  need cryptsetup
+  cryptsetup luksFormat --type luks2 "$(part 2)"  # interactive: asks for confirmation and the passphrase
+  cryptsetup open "$(part 2)" cryptroot  # mapping name "cryptroot" is reused by crypttab, fstab and the kernel cmdline
+  mkfs.ext4 -L rootfs /dev/mapper/cryptroot
+}
+
+mount_targets(){  # mount the decrypted root at $TGT_ROOT and the boot partition at $TGT_BOOT and <root>/boot/firmware
+  mkdir -p "$TGT_ROOT" "$TGT_BOOT"
+  mount /dev/mapper/cryptroot "$TGT_ROOT"
+  mkdir -p "$TGT_ROOT/boot/firmware"  # must be created after the root mount so it lives on the target filesystem
+  mount "$(part 1)" "$TGT_BOOT"
+  mount --bind "$TGT_BOOT" "$TGT_ROOT/boot/firmware"  # same partition made visible inside the target tree
+}
+
+rsync_root_and_boot(){  # copy the image's root and boot filesystems onto the mounted target
+  need rsync
+  rsync -aAXH --numeric-ids --delete \
+  --exclude='/boot/firmware' --exclude='/boot/firmware/**' \
+  --exclude='/dev/*' --exclude='/proc/*' --exclude='/sys/*' \
+  --exclude='/run/*' --exclude='/tmp/*' --exclude='/mnt/*' \
+  --exclude='/media/*' --exclude='/lost+found' \
+  "$IMG_ROOT_MNT"/ "$TGT_ROOT"/
+  rsync -aH --delete "$IMG_BOOT_MNT"/ "$TGT_ROOT/boot/firmware"/  # boot files are copied separately, hence the /boot/firmware excludes above
+}
+
+write_crypttab_fstab(){  # overwrite the target's /etc/crypttab and /etc/fstab for the encrypted-root layout
+  LUUID=$(blkid -s UUID -o value "$(part 2)")  # UUID of the LUKS partition; note this is a global, not a local
+  printf 'cryptroot UUID=%s none luks,discard,fido2-device=auto\n' "$LUUID" > "$TGT_ROOT/etc/crypttab"
+  cat > "$TGT_ROOT/etc/fstab" <<EOF
+/dev/mapper/cryptroot  /               ext4  defaults,discard,errors=remount-ro  0 1
+LABEL=system-boot      /boot/firmware  vfat  defaults,umask=0077                 0 1
+EOF
+}
+
+fix_firmware_files(){  # Patch config.txt and regenerate cmdline.txt on the target boot partition.
+  local cfg="$TGT_ROOT/boot/firmware/config.txt"
+  local cmdline="$TGT_ROOT/boot/firmware/cmdline.txt"
+  if [[ ! -f "$cfg" ]]; then fatal "missing $cfg"; fi
+
+  # Point the firmware at the uncompressed Pi 5 kernel: rewrite an existing
+  # kernel= line, otherwise insert one as the first line of the file.
+  if ! grep -q '^kernel=' "$cfg"; then
+    sed -i '1i kernel=kernel_2712.img' "$cfg"
+  else
+    sed -i 's#^kernel=.*#kernel=kernel_2712.img#' "$cfg"
+  fi
+
+  # initramfs line and cmdline.txt indirection, added only when absent
+  grep -q '^initramfs ' "$cfg" || echo 'initramfs initrd.img followkernel' >> "$cfg"
+  grep -q '^cmdline=cmdline.txt' "$cfg" || sed -i '1i cmdline=cmdline.txt' "$cfg"
+
+  # Pi 5 display, I2C, PCIe and UART settings: append each one that is missing
+  local opt
+  for opt in dtoverlay=vc4-kms-v3d-pi5 dtparam=i2c_arm=on dtparam=pciex1=on \
+             dtparam=pciex1_gen=2 enable_uart=1; do
+    grep -q "^$opt" "$cfg" || echo "$opt" >> "$cfg"
+  done
+
+  # Kernel command line: dracut LUKS hints keyed on the bare UUID of partition 2
+  local luks_uuid; luks_uuid=$(blkid -s UUID -o value "$(part 2)")
+  {
+    printf '%s' "rd.luks.uuid=$luks_uuid rd.luks.name=$luks_uuid=cryptroot "
+    printf '%s' "root=/dev/mapper/cryptroot rootfstype=ext4 rootwait fixrtc "
+    printf '%s\n' "console=serial0,115200 console=tty1 ds=nocloud;s=file:///boot/firmware/ ${DSI_FLAGS} rd.debug"
+  } > "$cmdline"
+}
+
+seed_cloud_init(){  # Write the cloud-init NoCloud seed (user-data and meta-data) onto the boot partition.
+  # user-data: create the user with its SSH key and expiring password, disable SSH passwords, set hostname, enable avahi.
+  cat > "$TGT_ROOT/boot/firmware/user-data" <<EOF
+#cloud-config
+hostname: $STYX_HOSTNAME
+manage_etc_hosts: true
+users:
+  - name: $STYX_USER
+    gecos: "$STYX_USER"
+    shell: /bin/bash
+    groups: [sudo,video,i2c]
+    sudo: ALL=(ALL) NOPASSWD:ALL
+    lock_passwd: false
+    ssh_authorized_keys:
+      - $SSH_PUBKEY
+chpasswd:
+  list: |
+    $STYX_USER:$STYX_PASS
+  expire: true
+ssh_pwauth: false
+package_update: true
+packages: [openssh-server, avahi-daemon]
+runcmd:
+  - systemctl enable --now ssh
+  - systemctl enable --now avahi-daemon || true
+EOF
+
+  # Minimal meta-data for NoCloud; the hostname is a printf argument so any character in it is written verbatim.
+  printf 'instance-id: iid-titan-ag-%s\nlocal-hostname: %s\n' "$(date +%s)" "$STYX_HOSTNAME" \
+    > "$TGT_ROOT/boot/firmware/meta-data"
+}
+
+prep_chroot_mounts(){
+  for d in dev proc sys; do mount --bind "/$d" "$TGT_ROOT/$d"; done
+  mount -t devpts devpts "$TGT_ROOT/dev/pts"
+  # Replace the usual resolv.conf symlink with a real file for apt to work
+  rm -f "$TGT_ROOT/etc/resolv.conf"
+  cp /etc/resolv.conf "$TGT_ROOT/etc/resolv.conf"
+
+  # Block service starts (no systemd in chroot)
+  cat > "$TGT_ROOT/usr/sbin/policy-rc.d" <<'EOP'
+#!/bin/sh
+exit 101
+EOP
+  chmod +x "$TGT_ROOT/usr/sbin/policy-rc.d"
+
+  # Ensure qemu static is present inside chroot
+  install -D -m755 /usr/local/bin/qemu-aarch64-static "$TGT_ROOT/usr/bin/qemu-aarch64-static"
+}
+
+in_chroot(){  # Run the provisioning script below inside $TGT_ROOT under qemu-aarch64-static; it is one single-quoted string with $STYX_HOSTNAME spliced in from the host.
+  chroot "$TGT_ROOT" /usr/bin/qemu-aarch64-static /bin/bash -lc '
+set -euo pipefail
+export DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC
+
+# --- APT sources (ports) ---
+cat > /etc/apt/sources.list <<'"'"'EOS'"'"'
+deb http://ports.ubuntu.com/ubuntu-ports noble main restricted universe multiverse
+deb http://ports.ubuntu.com/ubuntu-ports noble-updates main restricted universe multiverse
+deb http://ports.ubuntu.com/ubuntu-ports noble-security main restricted universe multiverse
+EOS
+
+apt-get update
+
+# --- Remove snaps and pin them off ---
+apt-get -y purge snapd || true
+rm -rf /snap /var/snap /var/lib/snapd /home/*/snap || true
+mkdir -p /etc/apt/preferences.d
+cat > /etc/apt/preferences.d/nosnap.pref <<'"'"'EOS'"'"'
+Package: snapd
+Pin: release *
+Pin-Priority: -10
+EOS
+
+# --- Base tools (no flash-kernel; we use dracut) ---
+apt-get install -y --no-install-recommends \
+  openssh-client openssh-server openssh-sftp-server avahi-daemon \
+  cryptsetup dracut fido2-tools libfido2-1 i2c-tools \
+  python3-smbus python3-pil zbar-tools qrencode lm-sensors \
+  file zstd lz4 || true
+
+# Camera apps: try rpicam-apps; otherwise basic libcamera tools
+apt-get install -y rpicam-apps || apt-get install -y libcamera-tools || true
+
+# --- Persistent journal so we can read logs after failed boot ---
+mkdir -p /etc/systemd/journald.conf.d
+cat > /etc/systemd/journald.conf.d/99-persistent.conf <<'"'"'EOS'"'"'
+[Journal]
+Storage=persistent
+EOS
+
+# --- SSH hardening (ensure file exists even if package was half-installed) ---
+if [ ! -f /etc/ssh/sshd_config ]; then
+  mkdir -p /etc/ssh
+  cat > /etc/ssh/sshd_config <<'"'"'EOS'"'"'
+PermitRootLogin no
+PasswordAuthentication no
+KbdInteractiveAuthentication no
+PubkeyAuthentication yes
+# Accept defaults for the rest
+EOS
+fi
+sed -i -e "s/^#\?PasswordAuthentication .*/PasswordAuthentication no/" \
+       -e "s/^#\?KbdInteractiveAuthentication .*/KbdInteractiveAuthentication no/" \
+       -e "s/^#\?PermitRootLogin .*/PermitRootLogin no/" \
+       -e "s/^#\?PubkeyAuthentication .*/PubkeyAuthentication yes/" /etc/ssh/sshd_config || true
+
+# --- Hostname & hosts ---
+echo "'"$STYX_HOSTNAME"'" > /etc/hostname
+if grep -q "^127\\.0\\.1\\.1" /etc/hosts; then
+  sed -i "s/^127\\.0\\.1\\.1.*/127.0.1.1\t'"$STYX_HOSTNAME"'/" /etc/hosts
+else
+  echo -e "127.0.1.1\t'"$STYX_HOSTNAME"'" >> /etc/hosts
+fi
+
+# --- Enable services on first boot ---
+mkdir -p /etc/systemd/system/multi-user.target.wants
+ln -sf /lib/systemd/system/ssh.service /etc/systemd/system/multi-user.target.wants/ssh.service
+ln -sf /lib/systemd/system/avahi-daemon.service /etc/systemd/system/multi-user.target.wants/avahi-daemon.service || true
+
+# --- Ensure i2c group ---
+getent group i2c >/dev/null || groupadd i2c
+
+# --- Dracut configuration (generic, not host-only) ---
+mkdir -p /etc/dracut.conf.d
+cat > /etc/dracut.conf.d/00-hostonly.conf <<'"'"'EOS'"'"'
+hostonly=no
+EOS
+cat > /etc/dracut.conf.d/10-systemd-crypt.conf <<'"'"'EOS'"'"'
+add_dracutmodules+=" systemd crypt "
+EOS
+cat > /etc/dracut.conf.d/20-drivers.conf <<'"'"'EOS'"'"'
+add_drivers+=" nvme xhci_pci xhci_hcd usbhid hid_generic hid "
+EOS
+cat > /etc/dracut.conf.d/30-fido2.conf <<'"'"'EOS'"'"'
+install_items+="/usr/bin/systemd-cryptsetup /usr/bin/fido2-token /usr/lib/*/libfido2.so* /usr/lib/*/libcbor.so*"
+EOS
+
+# --- Build initramfs and place it where firmware expects it ---
+KVER=$(ls -1 /lib/modules | sort -V | tail -n1)
+dracut --force /boot/initramfs-$KVER.img $KVER
+ln -sf initramfs-$KVER.img /boot/initrd.img
+ln -sf initramfs-$KVER.img /boot/initrd.img-$KVER
+cp -a /boot/initramfs-$KVER.img /boot/firmware/initrd.img
+
+# --- Create uncompressed kernel for Pi 5 firmware ---
+if [ -f "/usr/lib/linux-image-$KVER/Image" ]; then
+  cp -a "/usr/lib/linux-image-$KVER/Image" /boot/firmware/kernel_2712.img
+else
+  FMT=$(file -b "/boot/vmlinuz-$KVER" || true)
+  case "$FMT" in
+    *Zstandard*|*zstd*) zstd -dc "/boot/vmlinuz-$KVER" > /boot/firmware/kernel_2712.img ;;
+    *LZ4*)              lz4  -dc "/boot/vmlinuz-$KVER" > /boot/firmware/kernel_2712.img ;;
+    *gzip*)             zcat     "/boot/vmlinuz-$KVER" > /boot/firmware/kernel_2712.img ;;
+    *)                  cp -a "/boot/vmlinuz-$KVER" /boot/firmware/kernel_2712.img ;;
+  esac
+fi
+
+# --- Ensure Pi 5 DTB is present on the boot partition ---
+DTB=$(find /lib/firmware -type f -name "bcm2712-rpi-5-b.dtb" | sort | tail -n1 || true)
+[ -n "$DTB" ] && cp -a "$DTB" /boot/firmware/
+
+# --- Dracut hook to copy rdsosreport.txt to the FAT partition on failure ---
+mkdir -p /usr/lib/dracut/modules.d/99copylog
+cat > /usr/lib/dracut/modules.d/99copylog/module-setup.sh <<'"'"'EOS'"'"'
+#!/bin/bash
+check() { return 0; }
+depends() { echo base; return 0; }
+install() {
+  # Guard $moddir for nounset; derive if absent
+  local mdir="${moddir:-$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)}"
+  inst_hook emergency 99 "$mdir/copylog.sh"
+}
+EOS
+chmod +x /usr/lib/dracut/modules.d/99copylog/module-setup.sh
+
+cat > /usr/lib/dracut/modules.d/99copylog/copylog.sh <<'"'"'EOS'"'"'
+#!/bin/sh
+set -e
+for dev in /dev/nvme0n1p1 /dev/sda1 /dev/sdb1 /dev/mmcblk0p1; do
+  [ -b "$dev" ] || continue
+  mkdir -p /mnt/bootfat
+  if mount -t vfat "$dev" /mnt/bootfat 2>/dev/null; then
+    if [ -s /run/initramfs/rdsosreport.txt ]; then
+      cp -f /run/initramfs/rdsosreport.txt /mnt/bootfat/rdsosreport.txt 2>/dev/null || true
+      sync || true
+    fi
+    umount /mnt/bootfat || true
+    break
+  fi
+done
+EOS
+chmod +x /usr/lib/dracut/modules.d/99copylog/copylog.sh
+
+# Rebuild to ensure the copylog module is included
+dracut --force /boot/initramfs-$KVER.img $KVER
+ln -sf initramfs-$KVER.img /boot/initrd.img
+cp -a /boot/initramfs-$KVER.img /boot/firmware/initrd.img
+
+true
+'
+}
+
+verify_boot_assets(){  # Print a best-effort summary of the boot partition; every probe is allowed to fail.
+  echo "---- verify boot assets on FAT ----"
+  file "$TGT_ROOT/boot/firmware/kernel_2712.img" || true
+  ls -lh "$TGT_ROOT/boot/firmware/initrd.img" || true
+  echo "-- config.txt (key lines) --"
+  grep -E '^(kernel|cmdline)=|^initramfs |^dtoverlay=|^dtparam=' "$TGT_ROOT/boot/firmware/config.txt" || true  # the initramfs directive is written with a space, not "="
+  echo "-- cmdline.txt --"
+  cat "$TGT_ROOT/boot/firmware/cmdline.txt" || true
+  echo "-- firmware blobs (sample) --"
+  ls -1 "$TGT_ROOT/boot/firmware"/start*.elf "$TGT_ROOT/boot/firmware"/fixup*.dat | head -n 8 || true
+  echo "-- Pi5 DTB --"
+  ls -l "$TGT_ROOT/boot/firmware/"*rpi-5-b.dtb || true
+}
+
+enroll_fido_tokens(){  # Enroll every attached FIDO2 token into the LUKS2 header of partition 2; returns 1 if any enrollment failed.
+  echo "Enrolling FIDO2 Solo keys into $(part 2) ..."
+  need systemd-cryptenroll
+  need fido2-token
+
+  # Collect all hidraw paths from both output styles (some distros print 'Device: /dev/hidrawX')
+  mapfile -t DEVS < <(
+    fido2-token -L \
+      | sed -n 's,^\(/dev/hidraw[0-9]\+\):.*,\1,p; s,^Device:[[:space:]]\+/dev/hidraw\([0-9]\+\).*,/dev/hidraw\1,p' \
+      | sort -u
+  )
+
+  if (( ${#DEVS[@]} == 0 )); then
+    echo "No FIDO2 tokens detected; skipping enrollment (you can enroll later)."
+    echo "Example later: systemd-cryptenroll $(part 2) --fido2-device=/dev/hidrawX --fido2-with-client-pin=no"
+    return 0
+  fi
+
+  # Recommend keeping exactly ONE key plugged during first enrollment to avoid ambiguity.
+  if (( ${#DEVS[@]} > 1 )); then
+    echo "Note: multiple FIDO2 tokens present: ${DEVS[*]}"
+    echo "If enrollment fails, try with only one key inserted."
+  fi
+
+  local rc=0 D
+  for D in "${DEVS[@]}"; do
+    echo "-> Enrolling $D (you should be asked to touch the key)"
+    # Only documented systemd-cryptenroll options are passed; it has no option for labelling a slot.
+    if ! SYSTEMD_LOG_LEVEL=debug systemd-cryptenroll "$(part 2)" \
+          --fido2-device="$D" \
+          --fido2-with-client-pin=no \
+          --fido2-with-user-presence=yes \
+          --fido2-with-user-verification=no; then
+      echo "WARN: enrollment failed for $D"
+      rc=1   # remember the failure but keep trying the remaining tokens
+    fi
+  done
+
+  echo "Tokens enrolled (if any):"
+  systemd-cryptenroll "$(part 2)" || true   # with only the device given, the enrolled slots are listed
+  return $rc
+}
+
+cleanup(){  # Best-effort teardown: every step tolerates a target that is already unmounted or closed.
+  rm -f "$TGT_ROOT/usr/sbin/policy-rc.d" || true
+  local mnt
+  # Innermost first: chroot binds, firmware bind, boot partition, then the root itself
+  for mnt in "$TGT_ROOT/dev/pts" "$TGT_ROOT/dev" "$TGT_ROOT/proc" "$TGT_ROOT/sys" \
+             "$TGT_ROOT/boot/firmware" "$TGT_BOOT" "$TGT_ROOT"; do
+    umount -lf "$mnt" 2>/dev/null || true
+  done
+  cryptsetup close cryptroot 2>/dev/null || true
+  for mnt in "$IMG_BOOT_MNT" "$IMG_ROOT_MNT"; do umount -lf "$mnt" 2>/dev/null || true; done
+}
+
+main(){  # Entry point: runs the provisioning steps below strictly in order.
+  require_root
+  need losetup; need parted; need rsync
+  auto_detect_target_disk
+  echo "Target disk: $NVME"
+  ensure_binfmt_aarch64
+  ensure_image
+  preflight_cleanup
+  guard_target_device
+  open_image
+  confirm_and_wipe        # asks for the typed WIPE confirmation; destructive from here on
+  setup_luks
+  mount_targets
+  rsync_root_and_boot
+  write_crypttab_fstab
+  fix_firmware_files
+  seed_cloud_init
+  prep_chroot_mounts
+  in_chroot
+  verify_boot_assets
+  need_host_fido2
+  enroll_fido_tokens
+  cleanup
+  echo "✅ NVMe prepared."
+  echo "   Install in the Pi 5 and boot with no SD."
+  echo "   Expect LUKS to unlock automatically with a Solo key inserted;"
+  echo "   passphrase fallback remains. Hostname: ${STYX_HOSTNAME}  User: ${STYX_USER}"
+  echo "   On first boot, reach it via: ssh -i ~/.ssh/id_ed25519_titan ${STYX_USER}@${STYX_HOSTNAME}.local"
+}
+
+main "$@"
diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index b176c64..dc62ef5 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -71,8 +71,8 @@ spec:
 
       persistentVolume:
         enabled: true
-        size: 100Gi              # adjust; uses default StorageClass (Longhorn)
-        # storageClassName: ""   # set if you want a specific class
+        size: 100Gi
+        storageClassName: "astreae"
 
       # Enable built-in Kubernetes scraping
       scrape:
@@ -245,8 +245,8 @@ spec:
       GF_SECURITY_ALLOW_EMBEDDING: "true"
     grafana.ini:
       server:
-        domain: reporting.bstein.dev
-        root_url: https://reporting.bstein.dev/
+        domain: atlas.metrics.bstein.dev
+        root_url: https://atlas.metrics.bstein.dev/
       auth.anonymous:
         hide_version: true
       users:
@@ -322,14 +322,14 @@ spec:
       annotations:
         cert-manager.io/cluster-issuer: letsencrypt
       hosts:
-        - host: alerts.bstein.dev
+        - host: atlas.alerts.bstein.dev
           paths:
             - path: /
               pathType: Prefix
       tls:
         - secretName: alerts-bstein-dev-tls
           hosts:
-            - alerts.bstein.dev
+            - atlas.alerts.bstein.dev
     config:
       global:
         resolve_timeout: 5m

From d0b6fbe763b4c2489ebd44d370aeee05d8cc49fa Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Sat, 15 Nov 2025 11:16:37 -0300
Subject: [PATCH 07/71] victoria-metrics: revert storageclass change

---
 services/monitoring/helmrelease.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index dc62ef5..3e5c78c 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -72,7 +72,6 @@ spec:
       persistentVolume:
         enabled: true
         size: 100Gi
-        storageClassName: "astreae"
 
       # Enable built-in Kubernetes scraping
       scrape:

From 683dc84289ad0f8b687ead0a1cf5ff8dbe8e11ca Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Sat, 15 Nov 2025 11:18:40 -0300
Subject: [PATCH 08/71] grafana: use atlas metrics hostname

---
 services/monitoring/helmrelease.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index 3e5c78c..1720af5 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -256,12 +256,12 @@ spec:
       annotations:
         cert-manager.io/cluster-issuer: letsencrypt
       hosts:
-        - reporting.bstein.dev
+        - atlas.metrics.bstein.dev
       path: /
       tls:
-        - secretName: grafana-reporting-tls
+        - secretName: grafana-atlas-metrics-tls
           hosts:
-            - reporting.bstein.dev
+            - atlas.metrics.bstein.dev
     datasources:
       datasources.yaml:
         apiVersion: 1

From 46b6b1f3b896eb8cd46d0ae8e33d5f00dc30ceae Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Sat, 15 Nov 2025 11:35:27 -0300
Subject: [PATCH 09/71] grafana: set datasource uid

---
 services/monitoring/helmrelease.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index 1720af5..266ddcd 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -273,6 +273,7 @@ spec:
             isDefault: true
             jsonData:
               timeInterval: "15s"
+            uid: atlas-vm
     dashboardProviders:
       dashboardproviders.yaml:
         apiVersion: 1

From eb3991b6283bfb606f094778fec437f0daef6203 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Sat, 15 Nov 2025 11:59:48 -0300
Subject: [PATCH 10/71] dashboards: improve public view and fix color

---
 .../monitoring/grafana-dashboard-public.yaml  | 115 +++++++++++++++---
 .../monitoring/grafana-dashboard-sre.yaml     |   2 +-
 2 files changed, 100 insertions(+), 17 deletions(-)

diff --git a/services/monitoring/grafana-dashboard-public.yaml b/services/monitoring/grafana-dashboard-public.yaml
index db5d6c1..aee871f 100644
--- a/services/monitoring/grafana-dashboard-public.yaml
+++ b/services/monitoring/grafana-dashboard-public.yaml
@@ -177,31 +177,114 @@ data:
             "y": 7
           },
           "id": 3,
-          "options": {
-            "legend": {
-              "calcs": [],
-              "displayMode": "list",
-              "placement": "bottom",
-              "showLegend": true
-            },
-            "tooltip": {
-              "mode": "single",
-              "sort": "none"
-            }
-          },
           "targets": [
             {
               "datasource": {
                 "type": "prometheus",
                 "uid": "atlas-vm"
               },
-              "expr": "sum(rate(container_cpu_usage_seconds_total{namespace!=\"\", container!=\"\"}[5m])) by (namespace)",
+              "expr": "sum(kube_pod_status_phase{phase=\"Running\"}) by (namespace)",
               "legendFormat": "{{namespace}}",
               "refId": "A"
             }
           ],
-          "title": "Namespace CPU (5m avg)",
-          "type": "timeseries"
+          "title": "Running pods per namespace",
+          "type": "bargauge",
+          "options": {
+            "displayMode": "gradient",
+            "orientation": "horizontal",
+            "reduceOptions": {
+              "calcs": ["lastNotNull"],
+              "fields": "",
+              "values": false
+            },
+            "showUnfilled": false
+          }
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 15
+          },
+          "id": 4,
+          "targets": [
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "atlas-vm"
+              },
+              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
+              "legendFormat": "Ready",
+              "refId": "A"
+            },
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "atlas-vm"
+              },
+              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"false\"})",
+              "legendFormat": "Not Ready",
+              "refId": "B"
+            }
+          ],
+          "title": "Node readiness",
+          "type": "piechart",
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "right"
+            },
+            "pieType": "donut"
+          }
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 23
+          },
+          "id": 5,
+          "targets": [
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "atlas-vm"
+              },
+              "expr": "sum by (namespace) (increase(kube_pod_status_phase{phase=\"Failed\"}[1d]))",
+              "legendFormat": "{{namespace}}",
+              "refId": "A"
+            }
+          ],
+          "title": "Failed pods (24h)",
+          "type": "table",
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none",
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {"color": "green", "value": null},
+                  {"color": "red", "value": 1}
+                ]
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "showHeader": true
+          }
         }
       ],
       "refresh": "30s",
@@ -215,7 +298,7 @@ data:
         "list": []
       },
       "time": {
-        "from": "now-6h",
+        "from": "now-12h",
         "to": "now"
       },
       "timepicker": {},
diff --git a/services/monitoring/grafana-dashboard-sre.yaml b/services/monitoring/grafana-dashboard-sre.yaml
index 12995af..d146275 100644
--- a/services/monitoring/grafana-dashboard-sre.yaml
+++ b/services/monitoring/grafana-dashboard-sre.yaml
@@ -38,7 +38,7 @@ data:
           "fieldConfig": {
             "defaults": {
               "color": {
-                "mode": "continuous"
+                "mode": "continuous-RdYlGr"
               },
               "mappings": [],
               "max": 100,

From 0b1437b77c93c5a0ac411983d080938e3030af8c Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Sat, 15 Nov 2025 21:03:11 -0300
Subject: [PATCH 11/71] monitoring: refresh grafana dashboards

---
 .../monitoring/grafana-dashboard-public.yaml  | 545 ++++++++++++++----
 .../monitoring/grafana-dashboard-sre.yaml     | 527 ++++++++++++++---
 services/monitoring/grafana-folders.yaml      |  28 +
 services/monitoring/helmrelease.yaml          |   5 +
 services/monitoring/kustomization.yaml        |   1 +
 5 files changed, 903 insertions(+), 203 deletions(-)
 create mode 100644 services/monitoring/grafana-folders.yaml

diff --git a/services/monitoring/grafana-dashboard-public.yaml b/services/monitoring/grafana-dashboard-public.yaml
index aee871f..126b1b3 100644
--- a/services/monitoring/grafana-dashboard-public.yaml
+++ b/services/monitoring/grafana-dashboard-public.yaml
@@ -25,17 +25,30 @@ data:
         ]
       },
       "editable": false,
-      "fiscalYearStartMonth": 0,
+      "folderUid": "atlas-public",
       "graphTooltip": 0,
-      "id": null,
       "links": [],
-      "liveNow": false,
       "panels": [
         {
+          "id": 1,
+          "type": "stat",
+          "title": "Running pods",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
+          "gridPos": {
+            "h": 6,
+            "w": 6,
+            "x": 0,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "sum(kube_pod_status_phase{phase=\"Running\"})",
+              "refId": "A"
+            }
+          ],
           "fieldConfig": {
             "defaults": {
               "color": {
@@ -46,8 +59,12 @@ data:
                 "mode": "absolute",
                 "steps": [
                   {
-                    "color": "green",
+                    "color": "rgba(115, 115, 115, 1)",
                     "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
                   }
                 ]
               },
@@ -55,59 +72,105 @@ data:
             },
             "overrides": []
           },
-          "gridPos": {
-            "h": 7,
-            "w": 6,
-            "x": 0,
-            "y": 0
-          },
-          "id": 1,
           "options": {
             "colorMode": "value",
             "graphMode": "area",
-            "justifyMode": "auto",
-            "orientation": "auto",
+            "justifyMode": "center",
             "reduceOptions": {
               "calcs": [
                 "lastNotNull"
               ],
               "fields": "",
               "values": false
-            },
-            "text": {},
-            "textMode": "auto"
-          },
-          "pluginVersion": "10.4.0",
-          "targets": [
-            {
-              "datasource": {
-                "type": "prometheus",
-                "uid": "atlas-vm"
-              },
-              "editorMode": "code",
-              "expr": "sum(kube_pod_status_phase{phase=\"Running\"})",
-              "legendFormat": "",
-              "range": true,
-              "refId": "A"
             }
-          ],
-          "title": "Running pods",
-          "type": "stat"
+          }
         },
         {
+          "id": 2,
+          "type": "stat",
+          "title": "Ready node percentage",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
-          "description": "Aggregated CPU usage across all schedulable nodes.",
+          "gridPos": {
+            "h": 6,
+            "w": 6,
+            "x": 6,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) / sum(kube_node_info) * 100",
+              "refId": "A"
+            }
+          ],
           "fieldConfig": {
             "defaults": {
               "color": {
-                "mode": "continuous-BlYlRd"
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "percentage",
+                "steps": [
+                  {
+                    "color": "red",
+                    "value": null
+                  },
+                  {
+                    "color": "orange",
+                    "value": 90
+                  },
+                  {
+                    "color": "green",
+                    "value": 98
+                  }
+                ]
+              },
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          }
+        },
+        {
+          "id": 3,
+          "type": "stat",
+          "title": "Cluster CPU saturation",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 6,
+            "w": 6,
+            "x": 12,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "avg((1 - rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
               },
               "mappings": [],
-              "max": 100,
-              "min": 0,
               "thresholds": {
                 "mode": "percentage",
                 "steps": [
@@ -117,7 +180,7 @@ data:
                   },
                   {
                     "color": "yellow",
-                    "value": 60
+                    "value": 65
                   },
                   {
                     "color": "red",
@@ -129,79 +192,165 @@ data:
             },
             "overrides": []
           },
-          "gridPos": {
-            "h": 7,
-            "w": 6,
-            "x": 6,
-            "y": 0
-          },
-          "id": 2,
           "options": {
             "colorMode": "value",
             "graphMode": "area",
-            "justifyMode": "auto",
-            "orientation": "auto",
+            "justifyMode": "center",
             "reduceOptions": {
               "calcs": [
                 "lastNotNull"
               ],
               "fields": "",
               "values": false
-            },
-            "text": {},
-            "textMode": "auto"
-          },
-          "targets": [
-            {
-              "datasource": {
-                "type": "prometheus",
-                "uid": "atlas-vm"
-              },
-              "expr": "avg(100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100))",
-              "legendFormat": "",
-              "refId": "A"
             }
-          ],
-          "title": "Average node CPU",
-          "type": "stat"
+          }
         },
         {
+          "id": 4,
+          "type": "stat",
+          "title": "Cluster memory usage",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 8,
-            "w": 12,
-            "x": 0,
-            "y": 7
+            "h": 6,
+            "w": 6,
+            "x": 18,
+            "y": 0
           },
-          "id": 3,
           "targets": [
             {
-              "datasource": {
-                "type": "prometheus",
-                "uid": "atlas-vm"
-              },
-              "expr": "sum(kube_pod_status_phase{phase=\"Running\"}) by (namespace)",
-              "legendFormat": "{{namespace}}",
+              "expr": "100 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes) * 100)",
               "refId": "A"
             }
           ],
-          "title": "Running pods per namespace",
-          "type": "bargauge",
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "percentage",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 70
+                  },
+                  {
+                    "color": "red",
+                    "value": 85
+                  }
+                ]
+              },
+              "unit": "percent"
+            },
+            "overrides": []
+          },
           "options": {
-            "displayMode": "gradient",
-            "orientation": "horizontal",
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
             "reduceOptions": {
-              "calcs": ["lastNotNull"],
+              "calcs": [
+                "lastNotNull"
+              ],
               "fields": "",
               "values": false
-            },
-            "showUnfilled": false
+            }
           }
         },
         {
+          "id": 5,
+          "type": "piechart",
+          "title": "Namespace CPU share",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 0,
+            "y": 6
+          },
+          "targets": [
+            {
+              "expr": "topk(8, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace))",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "cores"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "list",
+              "placement": "right"
+            },
+            "pieType": "pie",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          }
+        },
+        {
+          "id": 6,
+          "type": "piechart",
+          "title": "Namespace memory share",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 12,
+            "y": 6
+          },
+          "targets": [
+            {
+              "expr": "topk(8, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace))",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "bytes"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "list",
+              "placement": "right"
+            },
+            "pieType": "donut",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          }
+        },
+        {
+          "id": 7,
+          "type": "timeseries",
+          "title": "Node CPU usage (per node)",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -212,38 +361,70 @@ data:
             "x": 0,
             "y": 15
           },
-          "id": 4,
           "targets": [
             {
-              "datasource": {
-                "type": "prometheus",
-                "uid": "atlas-vm"
-              },
-              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
-              "legendFormat": "Ready",
-              "refId": "A"
-            },
-            {
-              "datasource": {
-                "type": "prometheus",
-                "uid": "atlas-vm"
-              },
-              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"false\"})",
-              "legendFormat": "Not Ready",
-              "refId": "B"
+              "expr": "avg(rate(node_cpu_seconds_total{mode!=\"idle\"}[5m])) by (instance) * 100",
+              "refId": "A",
+              "legendFormat": "{{instance}}"
             }
           ],
-          "title": "Node readiness",
-          "type": "piechart",
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
           "options": {
             "legend": {
               "displayMode": "table",
-              "placement": "right"
+              "placement": "bottom"
             },
-            "pieType": "donut"
+            "tooltip": {
+              "mode": "multi"
+            }
           }
         },
         {
+          "id": 8,
+          "type": "timeseries",
+          "title": "Node memory usage (per node)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 15
+          },
+          "targets": [
+            {
+              "expr": "avg((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100) by (instance)",
+              "refId": "A",
+              "legendFormat": "{{instance}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 9,
+          "type": "table",
+          "title": "Key service availability",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -254,29 +435,39 @@ data:
             "x": 0,
             "y": 23
           },
-          "id": 5,
           "targets": [
             {
-              "datasource": {
-                "type": "prometheus",
-                "uid": "atlas-vm"
-              },
-              "expr": "sum by (namespace) (increase(kube_pod_status_phase{phase=\"Failed\"}[1d]))",
-              "legendFormat": "{{namespace}}",
+              "expr": "max by (service) (up{service=~\"traefik|gitea|vault|victoria-metrics-single|grafana|alertmanager\"})",
               "refId": "A"
             }
           ],
-          "title": "Failed pods (24h)",
-          "type": "table",
           "fieldConfig": {
             "defaults": {
-              "unit": "none",
-              "mappings": [],
+              "mappings": [
+                {
+                  "id": 0,
+                  "type": 1,
+                  "value": "0",
+                  "text": "Down"
+                },
+                {
+                  "id": 1,
+                  "type": 1,
+                  "value": "1",
+                  "text": "Up"
+                }
+              ],
               "thresholds": {
                 "mode": "absolute",
                 "steps": [
-                  {"color": "green", "value": null},
-                  {"color": "red", "value": 1}
+                  {
+                    "color": "red",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
                 ]
               }
             },
@@ -285,6 +476,126 @@ data:
           "options": {
             "showHeader": true
           }
+        },
+        {
+          "id": 10,
+          "type": "table",
+          "title": "Failed pods (24h trend)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 23
+          },
+          "targets": [
+            {
+              "expr": "topk(10, sum(increase(kube_pod_status_phase{phase=\"Failed\"}[24h])) by (namespace))",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none"
+            },
+            "overrides": []
+          },
+          "options": {
+            "showHeader": true
+          }
+        },
+        {
+          "id": 11,
+          "type": "timeseries",
+          "title": "Cluster network throughput",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 31
+          },
+          "targets": [
+            {
+              "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\"}[5m]))",
+              "refId": "A",
+              "legendFormat": "Receive"
+            },
+            {
+              "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]))",
+              "refId": "B",
+              "legendFormat": "Transmit"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "Bps"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 12,
+          "type": "timeseries",
+          "title": "Storage usage across nodes",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 31
+          },
+          "targets": [
+            {
+              "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}) * 100)",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "list",
+              "placement": "bottom"
+            }
+          }
+        },
+        {
+          "id": 13,
+          "type": "text",
+          "title": "About this dashboard",
+          "gridPos": {
+            "h": 6,
+            "w": 24,
+            "x": 0,
+            "y": 39
+          },
+          "options": {
+            "content": "### Atlas at a glance\n- Raspberry Pi + Jetson hybrid cluster with Flux-managed GitOps\n- Metrics powered by VictoriaMetrics, visualized by Grafana\n- Login for SRE mode with pod-level drilldowns, alert routes, and storage health",
+            "mode": "markdown"
+          }
         }
       ],
       "refresh": "30s",
@@ -301,10 +612,8 @@ data:
         "from": "now-12h",
         "to": "now"
       },
-      "timepicker": {},
-      "timezone": "",
       "title": "Atlas Public Overview",
       "uid": "atlas-public",
-      "version": 1,
-      "weekStart": ""
+      "version": 3
     }
+
diff --git a/services/monitoring/grafana-dashboard-sre.yaml b/services/monitoring/grafana-dashboard-sre.yaml
index d146275..b46c17a 100644
--- a/services/monitoring/grafana-dashboard-sre.yaml
+++ b/services/monitoring/grafana-dashboard-sre.yaml
@@ -20,29 +20,41 @@ data:
             "hide": true,
             "iconColor": "rgba(0, 211, 255, 1)",
             "name": "Annotations & Alerts",
-              "type": "dashboard"
+            "type": "dashboard"
           }
         ]
       },
       "editable": true,
-      "fiscalYearStartMonth": 0,
+      "folderUid": "atlas-sre",
       "graphTooltip": 0,
       "links": [],
       "panels": [
         {
+          "id": 1,
+          "type": "stat",
+          "title": "Ready nodes",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
-          "description": "Percentage of Ready nodes.",
+          "gridPos": {
+            "h": 5,
+            "w": 6,
+            "x": 0,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) / sum(kube_node_info) * 100",
+              "refId": "A"
+            }
+          ],
           "fieldConfig": {
             "defaults": {
               "color": {
-                "mode": "continuous-RdYlGr"
+                "mode": "palette-classic"
               },
               "mappings": [],
-              "max": 100,
-              "min": 0,
               "thresholds": {
                 "mode": "percentage",
                 "steps": [
@@ -50,9 +62,13 @@ data:
                     "color": "red",
                     "value": null
                   },
+                  {
+                    "color": "yellow",
+                    "value": 95
+                  },
                   {
                     "color": "green",
-                    "value": 90
+                    "value": 99
                   }
                 ]
               },
@@ -60,18 +76,10 @@ data:
             },
             "overrides": []
           },
-          "gridPos": {
-            "h": 7,
-            "w": 6,
-            "x": 0,
-            "y": 0
-          },
-          "id": 10,
           "options": {
             "colorMode": "value",
-            "graphMode": "none",
+            "graphMode": "area",
             "justifyMode": "center",
-            "orientation": "auto",
             "reduceOptions": {
               "calcs": [
                 "lastNotNull"
@@ -79,92 +87,192 @@ data:
               "fields": "",
               "values": false
             }
-          },
-          "targets": [
-            {
-              "datasource": {
-                "type": "prometheus",
-                "uid": "atlas-vm"
-              },
-              "expr": "avg(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) * 100",
-              "refId": "A"
-            }
-          ],
-          "title": "Ready nodes",
-          "type": "stat"
+          }
         },
         {
+          "id": 2,
+          "type": "stat",
+          "title": "Pending pods",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 7,
+            "h": 5,
             "w": 6,
             "x": 6,
             "y": 0
           },
-          "id": 11,
-          "options": {
-            "legend": {
-              "displayMode": "table",
-              "placement": "bottom"
-            },
-            "tooltip": {
-              "mode": "multi"
-            }
-          },
           "targets": [
             {
-              "datasource": {
-                "type": "prometheus",
-                "uid": "atlas-vm"
-              },
-              "expr": "sum by (node)(node_filesystem_avail_bytes{mountpoint=\"/\"})",
-              "legendFormat": "{{node}}",
+              "expr": "sum(kube_pod_status_phase{phase=\"Pending\"})",
               "refId": "A"
             }
           ],
-          "title": "Free root filesystem bytes",
-          "type": "timeseries"
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 3
+                  },
+                  {
+                    "color": "red",
+                    "value": 10
+                  }
+                ]
+              },
+              "unit": "none"
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          }
         },
         {
+          "id": 3,
+          "type": "stat",
+          "title": "Unavailable deployment replicas",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 10,
-            "w": 12,
-            "x": 0,
-            "y": 7
-          },
-          "id": 12,
-          "options": {
-            "legend": {
-              "calcs": [],
-              "displayMode": "list",
-              "placement": "bottom"
-            },
-            "tooltip": {
-              "mode": "single"
-            }
+            "h": 5,
+            "w": 6,
+            "x": 12,
+            "y": 0
           },
           "targets": [
             {
-              "datasource": {
-                "type": "prometheus",
-                "uid": "atlas-vm"
-              },
-              "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"crypto\",container!=\"\"}[5m])) by (pod)",
-              "legendFormat": "{{pod}}",
+              "expr": "sum(kube_deployment_status_replicas_unavailable)",
               "refId": "A"
             }
           ],
-          "title": "Crypto namespace CPU usage",
-          "type": "timeseries"
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 1
+                  },
+                  {
+                    "color": "red",
+                    "value": 3
+                  }
+                ]
+              },
+              "unit": "none"
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          }
         },
         {
+          "id": 4,
+          "type": "stat",
+          "title": "Active alerts",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 5,
+            "w": 6,
+            "x": 18,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "sum(ALERTS{alertstate=\"firing\"})",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 1
+                  },
+                  {
+                    "color": "red",
+                    "value": 3
+                  }
+                ]
+              },
+              "unit": "none"
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          }
+        },
+        {
+          "id": 5,
+          "type": "timeseries",
+          "title": "Node CPU usage",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -173,9 +281,168 @@ data:
             "h": 9,
             "w": 12,
             "x": 0,
-            "y": 17
+            "y": 5
+          },
+          "targets": [
+            {
+              "expr": "avg(rate(node_cpu_seconds_total{mode!=\"idle\"}[5m])) by (instance) * 100",
+              "refId": "A",
+              "legendFormat": "{{instance}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 6,
+          "type": "timeseries",
+          "title": "Node memory usage",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 12,
+            "y": 5
+          },
+          "targets": [
+            {
+              "expr": "avg((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100) by (instance)",
+              "refId": "A",
+              "legendFormat": "{{instance}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 7,
+          "type": "timeseries",
+          "title": "Top pod CPU (5m avg)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 0,
+            "y": 14
+          },
+          "targets": [
+            {
+              "expr": "topk(5, sum(rate(container_cpu_usage_seconds_total{pod!=\"\",container!=\"\"}[5m])) by (namespace,pod))",
+              "refId": "A",
+              "legendFormat": "{{namespace}}/{{pod}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "cores"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 8,
+          "type": "timeseries",
+          "title": "Top pod memory working set",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 12,
+            "y": 14
+          },
+          "targets": [
+            {
+              "expr": "topk(5, sum(container_memory_working_set_bytes{pod!=\"\",container!=\"\"}) by (namespace,pod))",
+              "refId": "A",
+              "legendFormat": "{{namespace}}/{{pod}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "bytes"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 9,
+          "type": "bargauge",
+          "title": "Namespace restart rate (6h)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 23
+          },
+          "targets": [
+            {
+              "expr": "topk(8, sum(increase(kube_pod_container_status_restarts_total{namespace!=\"\"}[6h])) by (namespace))",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none"
+            },
+            "overrides": []
           },
-          "id": 13,
           "options": {
             "displayMode": "gradient",
             "orientation": "horizontal",
@@ -185,22 +452,112 @@ data:
               ],
               "fields": "",
               "values": false
-            },
-            "showUnfilled": false
+            }
+          }
+        },
+        {
+          "id": 10,
+          "type": "table",
+          "title": "Deployments missing replicas",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 23
           },
           "targets": [
             {
-              "datasource": {
-                "type": "prometheus",
-                "uid": "atlas-vm"
-              },
-              "expr": "count(sum(kube_pod_status_phase{phase=\"Failed\"}) by (namespace))",
-              "legendFormat": "",
+              "expr": "topk(10, sum by (namespace,deployment) (kube_deployment_status_replicas_unavailable))",
               "refId": "A"
             }
           ],
-          "title": "Namespaces with failed pods",
-          "type": "bargauge"
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none"
+            },
+            "overrides": []
+          },
+          "options": {
+            "showHeader": true
+          }
+        },
+        {
+          "id": 11,
+          "type": "timeseries",
+          "title": "Pod phase breakdown",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 0,
+            "y": 31
+          },
+          "targets": [
+            {
+              "expr": "sum(kube_pod_status_phase) by (phase)",
+              "refId": "A",
+              "legendFormat": "{{phase}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 12,
+          "type": "timeseries",
+          "title": "PVC usage (top 8)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 12,
+            "y": 31
+          },
+          "targets": [
+            {
+              "expr": "topk(8, sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100))",
+              "refId": "A",
+              "legendFormat": "{{namespace}}/{{persistentvolumeclaim}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
         }
       ],
       "schemaVersion": 39,
@@ -216,8 +573,8 @@ data:
         "from": "now-12h",
         "to": "now"
       },
-      "timepicker": {},
       "title": "Atlas SRE Overview",
       "uid": "atlas-sre",
-      "version": 1
+      "version": 2
     }
+
diff --git a/services/monitoring/grafana-folders.yaml b/services/monitoring/grafana-folders.yaml
new file mode 100644
index 0000000..503aaee
--- /dev/null
+++ b/services/monitoring/grafana-folders.yaml
@@ -0,0 +1,28 @@
+# services/monitoring/grafana-folders.yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-folders
+  labels:
+    app.kubernetes.io/name: grafana
+    app.kubernetes.io/component: folders
+data:
+  folders.yaml: |
+    apiVersion: 1
+    folders:
+      - uid: atlas-public
+        title: Atlas Public
+        permissions:
+          - role: Viewer
+            permission: View
+          - role: Editor
+            permission: Edit
+          - role: Admin
+            permission: Admin
+      - uid: atlas-sre
+        title: Atlas SRE
+        permissions:
+          - role: Editor
+            permission: View
+          - role: Admin
+            permission: Admin
diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index 266ddcd..4efae70 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -297,6 +297,11 @@ spec:
     dashboardsConfigMaps:
       public: grafana-dashboard-public
       sre: grafana-dashboard-sre
+    extraConfigmapMounts:
+      - name: grafana-folders
+        mountPath: /etc/grafana/provisioning/folders
+        configMap: grafana-folders
+        readOnly: true
 
 ---
 
diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml
index bb321b5..73e7d23 100644
--- a/services/monitoring/kustomization.yaml
+++ b/services/monitoring/kustomization.yaml
@@ -7,4 +7,5 @@ resources:
   - rbac.yaml
   - grafana-dashboard-public.yaml
   - grafana-dashboard-sre.yaml
+  - grafana-folders.yaml
   - helmrelease.yaml

From b004bf99dc88294f15ec38b66f39b7f01a2435c0 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Sun, 16 Nov 2025 00:55:28 -0300
Subject: [PATCH 12/71] monitoring: enrich dashboards

---
 .../monitoring/grafana-dashboard-public.yaml  | 648 +++++++++++++++---
 .../monitoring/grafana-dashboard-sre.yaml     |  25 +-
 2 files changed, 551 insertions(+), 122 deletions(-)

diff --git a/services/monitoring/grafana-dashboard-public.yaml b/services/monitoring/grafana-dashboard-public.yaml
index 126b1b3..35fa124 100644
--- a/services/monitoring/grafana-dashboard-public.yaml
+++ b/services/monitoring/grafana-dashboard-public.yaml
@@ -38,8 +38,8 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 6,
-            "w": 6,
+            "h": 5,
+            "w": 4,
             "x": 0,
             "y": 0
           },
@@ -82,26 +82,27 @@ data:
               ],
               "fields": "",
               "values": false
-            }
+            },
+            "textMode": "value"
           }
         },
         {
           "id": 2,
           "type": "stat",
-          "title": "Ready node percentage",
+          "title": "Ready nodes",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 6,
-            "w": 6,
-            "x": 6,
+            "h": 5,
+            "w": 4,
+            "x": 4,
             "y": 0
           },
           "targets": [
             {
-              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) / sum(kube_node_info) * 100",
+              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
               "refId": "A"
             }
           ],
@@ -112,23 +113,19 @@ data:
               },
               "mappings": [],
               "thresholds": {
-                "mode": "percentage",
+                "mode": "absolute",
                 "steps": [
                   {
-                    "color": "red",
+                    "color": "rgba(115, 115, 115, 1)",
                     "value": null
                   },
-                  {
-                    "color": "orange",
-                    "value": 90
-                  },
                   {
                     "color": "green",
-                    "value": 98
+                    "value": 1
                   }
                 ]
               },
-              "unit": "percent"
+              "unit": "none"
             },
             "overrides": []
           },
@@ -142,26 +139,27 @@ data:
               ],
               "fields": "",
               "values": false
-            }
+            },
+            "textMode": "value"
           }
         },
         {
           "id": 3,
           "type": "stat",
-          "title": "Cluster CPU saturation",
+          "title": "Cluster nodes",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 6,
-            "w": 6,
-            "x": 12,
+            "h": 5,
+            "w": 4,
+            "x": 8,
             "y": 0
           },
           "targets": [
             {
-              "expr": "avg((1 - rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
+              "expr": "count(kube_node_info)",
               "refId": "A"
             }
           ],
@@ -172,23 +170,19 @@ data:
               },
               "mappings": [],
               "thresholds": {
-                "mode": "percentage",
+                "mode": "absolute",
                 "steps": [
                   {
-                    "color": "green",
+                    "color": "rgba(115, 115, 115, 1)",
                     "value": null
                   },
                   {
-                    "color": "yellow",
-                    "value": 65
-                  },
-                  {
-                    "color": "red",
-                    "value": 85
+                    "color": "green",
+                    "value": 1
                   }
                 ]
               },
-              "unit": "percent"
+              "unit": "none"
             },
             "overrides": []
           },
@@ -202,26 +196,27 @@ data:
               ],
               "fields": "",
               "values": false
-            }
+            },
+            "textMode": "value"
           }
         },
         {
           "id": 4,
           "type": "stat",
-          "title": "Cluster memory usage",
+          "title": "Hottest node CPU",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 6,
-            "w": 6,
-            "x": 18,
+            "h": 5,
+            "w": 4,
+            "x": 12,
             "y": 0
           },
           "targets": [
             {
-              "expr": "100 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes) * 100)",
+              "expr": "topk(1, avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)))",
               "refId": "A"
             }
           ],
@@ -262,11 +257,134 @@ data:
               ],
               "fields": "",
               "values": false
-            }
+            },
+            "textMode": "value_and_name"
           }
         },
         {
           "id": 5,
+          "type": "stat",
+          "title": "Hottest node memory",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 5,
+            "w": 4,
+            "x": 16,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "topk(1, avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)))",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 75
+                  },
+                  {
+                    "color": "red",
+                    "value": 90
+                  }
+                ]
+              },
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value_and_name"
+          }
+        },
+        {
+          "id": 6,
+          "type": "stat",
+          "title": "Failed pods (24h)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 5,
+            "w": 4,
+            "x": 20,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "sum(increase(kube_pod_status_phase{phase=\"Failed\"}[24h]))",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 1
+                  },
+                  {
+                    "color": "red",
+                    "value": 3
+                  }
+                ]
+              },
+              "unit": "none"
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 7,
           "type": "piechart",
           "title": "Namespace CPU share",
           "datasource": {
@@ -277,11 +395,11 @@ data:
             "h": 9,
             "w": 12,
             "x": 0,
-            "y": 6
+            "y": 5
           },
           "targets": [
             {
-              "expr": "topk(8, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace))",
+              "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace))",
               "refId": "A"
             }
           ],
@@ -307,7 +425,7 @@ data:
           }
         },
         {
-          "id": 6,
+          "id": 8,
           "type": "piechart",
           "title": "Namespace memory share",
           "datasource": {
@@ -318,11 +436,11 @@ data:
             "h": 9,
             "w": 12,
             "x": 12,
-            "y": 6
+            "y": 5
           },
           "targets": [
             {
-              "expr": "topk(8, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace))",
+              "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace))",
               "refId": "A"
             }
           ],
@@ -348,7 +466,7 @@ data:
           }
         },
         {
-          "id": 7,
+          "id": 9,
           "type": "timeseries",
           "title": "Node CPU usage (per node)",
           "datasource": {
@@ -359,13 +477,13 @@ data:
             "h": 8,
             "w": 12,
             "x": 0,
-            "y": 15
+            "y": 14
           },
           "targets": [
             {
-              "expr": "avg(rate(node_cpu_seconds_total{mode!=\"idle\"}[5m])) by (instance) * 100",
+              "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))",
               "refId": "A",
-              "legendFormat": "{{instance}}"
+              "legendFormat": "{{node}}"
             }
           ],
           "fieldConfig": {
@@ -385,7 +503,7 @@ data:
           }
         },
         {
-          "id": 8,
+          "id": 10,
           "type": "timeseries",
           "title": "Node memory usage (per node)",
           "datasource": {
@@ -396,13 +514,13 @@ data:
             "h": 8,
             "w": 12,
             "x": 12,
-            "y": 15
+            "y": 14
           },
           "targets": [
             {
-              "expr": "avg((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100) by (instance)",
+              "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))",
               "refId": "A",
-              "legendFormat": "{{instance}}"
+              "legendFormat": "{{node}}"
             }
           ],
           "fieldConfig": {
@@ -422,7 +540,7 @@ data:
           }
         },
         {
-          "id": 9,
+          "id": 11,
           "type": "table",
           "title": "Key service availability",
           "datasource": {
@@ -430,46 +548,23 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 8,
+            "h": 7,
             "w": 12,
             "x": 0,
-            "y": 23
+            "y": 22
           },
           "targets": [
             {
-              "expr": "max by (service) (up{service=~\"traefik|gitea|vault|victoria-metrics-single|grafana|alertmanager\"})",
+              "expr": "label_replace((sum by (deployment,namespace) (kube_deployment_status_replicas_available{deployment=~\"traefik|gitea|grafana\",namespace=~\"traefik|gitea|monitoring\"}) / sum by (deployment,namespace) (kube_deployment_spec_replicas{deployment=~\"traefik|gitea|grafana\",namespace=~\"traefik|gitea|monitoring\"})), \"service\", \"$1\", \"deployment\", \"(.*)\") or label_replace((sum by (statefulset,namespace) (kube_statefulset_status_replicas_ready{statefulset=~\"vault|alertmanager|victoria-metrics-single-server\",namespace=~\"vault|monitoring\"}) / sum by (statefulset,namespace) (kube_statefulset_status_replicas{statefulset=~\"vault|alertmanager|victoria-metrics-single-server\",namespace=~\"vault|monitoring\"})), \"service\", \"$1\", \"statefulset\", \"(.*)\")",
               "refId": "A"
             }
           ],
           "fieldConfig": {
             "defaults": {
-              "mappings": [
-                {
-                  "id": 0,
-                  "type": 1,
-                  "value": "0",
-                  "text": "Down"
-                },
-                {
-                  "id": 1,
-                  "type": 1,
-                  "value": "1",
-                  "text": "Up"
-                }
-              ],
-              "thresholds": {
-                "mode": "absolute",
-                "steps": [
-                  {
-                    "color": "red",
-                    "value": null
-                  },
-                  {
-                    "color": "green",
-                    "value": 1
-                  }
-                ]
-              }
+              "custom": {
+                "align": "auto"
+              },
+              "unit": "percentunit"
             },
             "overrides": []
           },
@@ -478,22 +573,22 @@ data:
           }
         },
         {
-          "id": 10,
+          "id": 12,
           "type": "table",
-          "title": "Failed pods (24h trend)",
+          "title": "Failed pods by namespace (24h)",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 8,
+            "h": 7,
             "w": 12,
             "x": 12,
-            "y": 23
+            "y": 22
           },
           "targets": [
             {
-              "expr": "topk(10, sum(increase(kube_pod_status_phase{phase=\"Failed\"}[24h])) by (namespace))",
+              "expr": "topk(10, sum by (namespace) (increase(kube_pod_status_phase{phase=\"Failed\"}[24h])))",
               "refId": "A"
             }
           ],
@@ -508,9 +603,9 @@ data:
           }
         },
         {
-          "id": 11,
+          "id": 13,
           "type": "timeseries",
-          "title": "Cluster network throughput",
+          "title": "Root filesystem usage per node",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -519,23 +614,18 @@ data:
             "h": 8,
             "w": 12,
             "x": 0,
-            "y": 31
+            "y": 29
           },
           "targets": [
             {
-              "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\"}[5m]))",
+              "expr": "avg by (node) (((label_replace(1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))",
               "refId": "A",
-              "legendFormat": "Receive"
-            },
-            {
-              "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]))",
-              "refId": "B",
-              "legendFormat": "Transmit"
+              "legendFormat": "{{node}}"
             }
           ],
           "fieldConfig": {
             "defaults": {
-              "unit": "Bps"
+              "unit": "percent"
             },
             "overrides": []
           },
@@ -550,9 +640,9 @@ data:
           }
         },
         {
-          "id": 12,
-          "type": "timeseries",
-          "title": "Storage usage across nodes",
+          "id": 14,
+          "type": "bargauge",
+          "title": "Nodes closest to full root disks",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -561,40 +651,377 @@ data:
             "h": 8,
             "w": 12,
             "x": 12,
-            "y": 31
+            "y": 29
           },
           "targets": [
             {
-              "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}) * 100)",
+              "expr": "topk(8, avg by (node) (((label_replace(1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)))",
               "refId": "A"
             }
           ],
           "fieldConfig": {
             "defaults": {
+              "unit": "percent",
+              "min": 0,
+              "max": 100
+            },
+            "overrides": []
+          },
+          "options": {
+            "displayMode": "gradient",
+            "orientation": "horizontal",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          }
+        },
+        {
+          "id": 15,
+          "type": "stat",
+          "title": "Astreae usage",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 7,
+            "w": 6,
+            "x": 0,
+            "y": 37
+          },
+          "targets": [
+            {
+              "expr": "(sum(longhorn_disk_usage_bytes{disk=~\"astreae-.*\"}) / sum(longhorn_disk_capacity_bytes{disk=~\"astreae-.*\"})) * 100",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 70
+                  },
+                  {
+                    "color": "red",
+                    "value": 85
+                  }
+                ]
+              },
               "unit": "percent"
             },
             "overrides": []
           },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 16,
+          "type": "stat",
+          "title": "Asteria usage",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 7,
+            "w": 6,
+            "x": 6,
+            "y": 37
+          },
+          "targets": [
+            {
+              "expr": "(sum(longhorn_disk_usage_bytes{disk=~\"asteria-.*\"}) / sum(longhorn_disk_capacity_bytes{disk=~\"asteria-.*\"})) * 100",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 70
+                  },
+                  {
+                    "color": "red",
+                    "value": 85
+                  }
+                ]
+              },
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 17,
+          "type": "stat",
+          "title": "Astreae schedulable",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 7,
+            "w": 6,
+            "x": 12,
+            "y": 37
+          },
+          "targets": [
+            {
+              "expr": "(sum(longhorn_disk_capacity_bytes{disk=~\"astreae-.*\"}) - sum(longhorn_disk_usage_bytes{disk=~\"astreae-.*\"}) - sum(longhorn_disk_reservation_bytes{disk=~\"astreae-.*\"}))",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "bytesSI"
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 18,
+          "type": "stat",
+          "title": "Asteria schedulable",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 7,
+            "w": 6,
+            "x": 18,
+            "y": 37
+          },
+          "targets": [
+            {
+              "expr": "(sum(longhorn_disk_capacity_bytes{disk=~\"asteria-.*\"}) - sum(longhorn_disk_usage_bytes{disk=~\"asteria-.*\"}) - sum(longhorn_disk_reservation_bytes{disk=~\"asteria-.*\"}))",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "bytesSI"
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 19,
+          "type": "piechart",
+          "title": "Longhorn node readiness",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 7,
+            "w": 12,
+            "x": 0,
+            "y": 44
+          },
+          "targets": [
+            {
+              "expr": "sum(longhorn_node_status{condition=\"ready\"})",
+              "refId": "A",
+              "legendFormat": "Ready"
+            },
+            {
+              "expr": "(max(longhorn_node_count_total) - sum(longhorn_node_status{condition=\"ready\"}))",
+              "refId": "B",
+              "legendFormat": "Offline"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none"
+            },
+            "overrides": []
+          },
           "options": {
             "legend": {
               "displayMode": "list",
-              "placement": "bottom"
+              "placement": "right"
+            },
+            "pieType": "donut",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
             }
           }
         },
         {
-          "id": 13,
+          "id": 20,
+          "type": "piechart",
+          "title": "Longhorn disk schedulability",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 7,
+            "w": 12,
+            "x": 12,
+            "y": 44
+          },
+          "targets": [
+            {
+              "expr": "sum(sum by (node,disk) (longhorn_disk_status{condition=\"schedulable\"}))",
+              "refId": "A",
+              "legendFormat": "Schedulable"
+            },
+            {
+              "expr": "(count(sum by (node,disk) (longhorn_disk_status{condition=\"ready\"})) - sum(sum by (node,disk) (longhorn_disk_status{condition=\"schedulable\"})))",
+              "refId": "B",
+              "legendFormat": "Blocked"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "list",
+              "placement": "right"
+            },
+            "pieType": "donut",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          }
+        },
+        {
+          "id": 21,
           "type": "text",
           "title": "About this dashboard",
           "gridPos": {
-            "h": 6,
+            "h": 5,
             "w": 24,
             "x": 0,
-            "y": 39
+            "y": 51
           },
           "options": {
-            "content": "### Atlas at a glance\n- Raspberry Pi + Jetson hybrid cluster with Flux-managed GitOps\n- Metrics powered by VictoriaMetrics, visualized by Grafana\n- Login for SRE mode with pod-level drilldowns, alert routes, and storage health",
-            "mode": "markdown"
+            "mode": "markdown",
+            "content": "### Atlas at a glance\n- Flux-managed Pi + Jetson cluster with 20+ active nodes\n- Longhorn tiers: Astreae (3x replicas) & Asteria (2x replicas) tracked separately\n- Login for the SRE view with alert routing, Longhorn drilldowns, and workload burn rates"
           }
         }
       ],
@@ -614,6 +1041,5 @@ data:
       },
       "title": "Atlas Public Overview",
       "uid": "atlas-public",
-      "version": 3
+      "version": 5
     }
-
diff --git a/services/monitoring/grafana-dashboard-sre.yaml b/services/monitoring/grafana-dashboard-sre.yaml
index b46c17a..d5d8dca 100644
--- a/services/monitoring/grafana-dashboard-sre.yaml
+++ b/services/monitoring/grafana-dashboard-sre.yaml
@@ -45,7 +45,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) / sum(kube_node_info) * 100",
+              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) / count(kube_node_info) * 100",
               "refId": "A"
             }
           ],
@@ -86,7 +86,8 @@ data:
               ],
               "fields": "",
               "values": false
-            }
+            },
+            "textMode": "value"
           }
         },
         {
@@ -146,7 +147,8 @@ data:
               ],
               "fields": "",
               "values": false
-            }
+            },
+            "textMode": "value"
           }
         },
         {
@@ -206,7 +208,8 @@ data:
               ],
               "fields": "",
               "values": false
-            }
+            },
+            "textMode": "value"
           }
         },
         {
@@ -266,7 +269,8 @@ data:
               ],
               "fields": "",
               "values": false
-            }
+            },
+            "textMode": "value"
           }
         },
         {
@@ -285,9 +289,9 @@ data:
           },
           "targets": [
             {
-              "expr": "avg(rate(node_cpu_seconds_total{mode!=\"idle\"}[5m])) by (instance) * 100",
+              "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))",
               "refId": "A",
-              "legendFormat": "{{instance}}"
+              "legendFormat": "{{node}}"
             }
           ],
           "fieldConfig": {
@@ -322,9 +326,9 @@ data:
           },
           "targets": [
             {
-              "expr": "avg((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100) by (instance)",
+              "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))",
               "refId": "A",
-              "legendFormat": "{{instance}}"
+              "legendFormat": "{{node}}"
             }
           ],
           "fieldConfig": {
@@ -575,6 +579,5 @@ data:
       },
       "title": "Atlas SRE Overview",
       "uid": "atlas-sre",
-      "version": 2
+      "version": 4
     }
-

From a41f25e66d7bbc02ea3fb287920f4eb4bfda686d Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 17 Nov 2025 14:22:46 -0300
Subject: [PATCH 13/71] monitoring: restructure grafana dashboards

---
 scripts/render_dashboards.py                  |  605 ++++++++
 .../monitoring/dashboards/atlas-nodes.json    |  369 +++++
 .../monitoring/dashboards/atlas-overview.json | 1270 +++++++++++++++++
 .../monitoring/dashboards/atlas-pods.json     |  137 ++
 .../monitoring/dashboards/atlas-storage.json  |  359 +++++
 ...-sre.yaml => grafana-dashboard-nodes.yaml} |  331 +----
 ...c.yaml => grafana-dashboard-overview.yaml} |  716 ++++++----
 .../monitoring/grafana-dashboard-pods.yaml    |  146 ++
 .../monitoring/grafana-dashboard-storage.yaml |  368 +++++
 services/monitoring/grafana-folders.yaml      |   22 +-
 services/monitoring/helmrelease.yaml          |   48 +-
 services/monitoring/kustomization.yaml        |    6 +-
 12 files changed, 3847 insertions(+), 530 deletions(-)
 create mode 100755 scripts/render_dashboards.py
 create mode 100644 services/monitoring/dashboards/atlas-nodes.json
 create mode 100644 services/monitoring/dashboards/atlas-overview.json
 create mode 100644 services/monitoring/dashboards/atlas-pods.json
 create mode 100644 services/monitoring/dashboards/atlas-storage.json
 rename services/monitoring/{grafana-dashboard-sre.yaml => grafana-dashboard-nodes.yaml} (53%)
 rename services/monitoring/{grafana-dashboard-public.yaml => grafana-dashboard-overview.yaml} (67%)
 create mode 100644 services/monitoring/grafana-dashboard-pods.yaml
 create mode 100644 services/monitoring/grafana-dashboard-storage.yaml

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
new file mode 100755
index 0000000..fa9ef58
--- /dev/null
+++ b/scripts/render_dashboards.py
@@ -0,0 +1,605 @@
+#!/usr/bin/env python3
+"""Generate Grafana dashboards and render them into ConfigMaps.
+
+Usage:
+    python scripts/render_dashboards.py --build   # rebuild JSON + ConfigMaps
+    python scripts/render_dashboards.py           # just render ConfigMaps
+"""
+import argparse
+import json
+import textwrap
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]  # parent of the directory that contains this script
+DASHBOARD_DIR = ROOT / "services" / "monitoring" / "dashboards"  # where the <uid>.json dashboards are written and read
+CONFIG_TEMPLATE = textwrap.dedent(
+    """# {relative_path}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {name}
+  labels:
+    grafana_dashboard: "1"
+data:
+  {key}: |
+{payload}
+"""
+)  # placeholders are filled by render_configmap(); {payload} arrives already indented four spaces
+
+PROM_DS = {"type": "prometheus", "uid": "atlas-vm"}  # datasource reference shared by every query panel built below
+
+
+# --------------------------------------------------------------------------- #
+# Panel helper factories
+# --------------------------------------------------------------------------- #
+
+
+def stat_panel(panel_id, title, expr, grid, *, unit="none", thresholds=None, text_mode="value", legend=None):
+    """Build a Grafana ``stat`` panel dict for a single PromQL expression.
+
+    ``grid`` becomes the panel's ``gridPos``. A falsy ``thresholds`` falls back
+    to an absolute grey -> green (from 1) scale, and a truthy ``legend`` is set
+    as the query's ``legendFormat``. The last non-null sample is displayed.
+    """
+    query = {"expr": expr, "refId": "A"}
+    if legend:
+        query["legendFormat"] = legend
+    if not thresholds:
+        thresholds = {
+            "mode": "absolute",
+            "steps": [{"color": "rgba(115, 115, 115, 1)", "value": None}, {"color": "green", "value": 1}],
+        }
+    field_defaults = {
+        "color": {"mode": "palette-classic"},
+        "mappings": [],
+        "thresholds": thresholds,
+        "unit": unit,
+    }
+    display = {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
+        "textMode": text_mode,
+    }
+    return {
+        "id": panel_id, "type": "stat", "title": title,
+        "datasource": PROM_DS, "gridPos": grid, "targets": [query],
+        "fieldConfig": {"defaults": field_defaults, "overrides": []},
+        "options": display,
+    }
+
+
+def timeseries_panel(panel_id, title, expr, grid, *, unit="none", legend=None, legend_display="table",
+                     legend_placement="bottom", legend_calcs=None, time_from=None):
+    """Build a Grafana ``timeseries`` panel dict for a single PromQL expression.
+
+    ``legendFormat``, the legend ``calcs`` and the panel-level ``timeFrom``
+    override are emitted only when the matching argument is truthy.
+    """
+    query = {"expr": expr, "refId": "A"}
+    if legend:
+        query["legendFormat"] = legend
+    legend_cfg = {"displayMode": legend_display, "placement": legend_placement}
+    if legend_calcs:
+        legend_cfg["calcs"] = legend_calcs
+    panel = {
+        "id": panel_id,
+        "type": "timeseries",
+        "title": title,
+        "datasource": PROM_DS,
+        "gridPos": grid,
+        "targets": [query],
+        "fieldConfig": {"defaults": {"unit": unit}, "overrides": []},
+        "options": {"legend": legend_cfg, "tooltip": {"mode": "multi"}},
+    }
+    if time_from:
+        panel["timeFrom"] = time_from
+    return panel
+
+
+def table_panel(panel_id, title, expr, grid, *, unit="none", transformations=None, description=None):
+    """Build a Grafana ``table`` panel dict for a single PromQL expression.
+
+    ``transformations`` and ``description`` are appended only when truthy.
+    """
+    extras = {"transformations": transformations, "description": description}
+    return {
+        "id": panel_id,
+        "type": "table",
+        "title": title,
+        "datasource": PROM_DS,
+        "gridPos": grid,
+        "targets": [{"expr": expr, "refId": "A"}],
+        "fieldConfig": {"defaults": {"unit": unit}, "overrides": []},
+        "options": {"showHeader": True},
+        **{key: value for key, value in extras.items() if value},
+    }
+
+
+def pie_panel(panel_id, title, expr, grid):
+    """Build a Grafana ``piechart`` panel whose slices are labelled ``{{namespace}}``."""
+    query = {"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}
+    reduce_last = {"calcs": ["lastNotNull"], "fields": "", "values": False}
+    legend_cfg = {"displayMode": "list", "placement": "right"}
+    return {
+        "id": panel_id,
+        "type": "piechart",
+        "title": title,
+        "datasource": PROM_DS,
+        "gridPos": grid,
+        "targets": [query],
+        "fieldConfig": {"defaults": {"unit": "percent"}, "overrides": []},
+        "options": {"legend": legend_cfg, "pieType": "pie", "reduceOptions": reduce_last},
+    }
+
+
+def text_panel(panel_id, title, content, grid):
+    """Build a markdown ``text`` panel dict.
+
+    The panel carries no query, so ``datasource`` is emitted as ``None``.
+    """
+    panel = {"id": panel_id, "type": "text", "title": title, "gridPos": grid}
+    panel["datasource"] = None
+    panel["options"] = {"mode": "markdown", "content": content}
+    return panel
+
+
+def node_cpu_expr(scope=""):
+    """Return PromQL for per-node CPU busy percent; a truthy ``scope`` regex restricts the nodes."""
+    base = 'avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode="idle"}[5m]), "internal_ip", "$1", "instance", "([^:]+):.*")) * 100) * on (internal_ip) group_left(node) kube_node_info))'
+    suffix = f' * on(node) group_left() kube_node_info{{node=~"{scope}"}}' if scope else ""
+    return base + suffix
+
+
+def node_mem_expr(scope=""):
+    """Return PromQL for per-node memory-used percent; a truthy ``scope`` regex restricts the nodes."""
+    base = 'avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, "internal_ip", "$1", "instance", "([^:]+):.*") * 100) * on (internal_ip) group_left(node) kube_node_info))'
+    suffix = f' * on(node) group_left() kube_node_info{{node=~"{scope}"}}' if scope else ""
+    return base + suffix
+
+
+def root_usage_expr():  # PromQL: percent of the "/" filesystem in use, averaged per node
+    return 'avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{{{fs}}} / node_filesystem_size_bytes{{{fs}}}, "internal_ip", "$1", "instance", "([^:]+):.*"))) * 100) * on (internal_ip) group_left(node) kube_node_info)'.format(fs='mountpoint="/",fstype!~"tmpfs|overlay"')
+
+
+def astreae_usage_expr(mount):
+    """Return PromQL for the percent used on ``mount``, summed over every matching series."""
+    selector = f'{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}}'
+    avail, size = (f"sum(node_filesystem_{kind}_bytes{selector})" for kind in ("avail", "size"))
+    return f"100 - ({avail} / {size} * 100)"
+
+
+def astreae_free_expr(mount):  # PromQL: available bytes on ``mount``, summed over every matching series
+    return 'sum(node_filesystem_avail_bytes{{mountpoint="{}",fstype!~"tmpfs|overlay"}})'.format(mount)
+
+
+def build_overview():
+    """Build the "Atlas Overview" dashboard model (uid ``atlas-overview``).
+
+    Layout, top to bottom: a row of eight stat panels, namespace CPU/RAM pies,
+    node CPU/RAM trends, problem-pod tables, control-plane trends, root-disk
+    usage, Astreae/Asteria storage stats and tables, and a closing text panel.
+    Run the script with ``--build`` after editing so the JSON is regenerated.
+    """
+    thresholds_percent = {
+        "mode": "percentage",
+        "steps": [
+            {"color": "green", "value": None},
+            {"color": "yellow", "value": 70},
+            {"color": "red", "value": 85},
+        ],
+    }
+    panels = []
+    # ``== bool 0`` / ``> bool 600`` turn each comparison into a 0/1 sample so
+    # that sum() counts matching series; a plain filter keeps the original
+    # values (all zeros, or raw seconds) and sum() would add those up instead.
+    stats = [
+        (1, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})'),
+        (2, "Ready nodes", 'sum(kube_node_status_condition{condition="Ready",status="true"})'),
+        (3, "Control plane ready", 'sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"})'),
+        (4, "Control plane schedulable", 'sum(kube_node_spec_unschedulable{node=~"titan-0a|titan-0b|titan-0c"} == bool 0)'),
+        (5, "Problem pods", 'sum(kube_pod_status_phase{phase!~"Running|Succeeded"})'),
+        (6, "Stuck terminating", 'sum(((time() - kube_pod_deletion_timestamp) > bool 600))'),
+    ]
+    # Grafana's grid is 24 columns wide: eight stats of width 3 fill one row.
+    for idx, (panel_id, title, expr) in enumerate(stats):
+        panels.append(
+            stat_panel(panel_id, title, expr, {"h": 5, "w": 3, "x": 3 * idx, "y": 0})
+        )
+    # "Hottest node" shows only the busiest node, hence the topk(1, ...) wrap.
+    hottest = [
+        (7, "Hottest node: CPU", node_cpu_expr(), 18),
+        (8, "Hottest node: RAM", node_mem_expr(), 21),
+    ]
+    for panel_id, title, expr, x in hottest:
+        panels.append(
+            stat_panel(
+                panel_id,
+                title,
+                f"topk(1, {expr})",
+                {"h": 5, "w": 3, "x": x, "y": 0},
+                unit="percent",
+                thresholds=thresholds_percent,
+                text_mode="value_and_name",
+                legend="{{node}}",
+            )
+        )
+
+    panels.append(pie_panel(9, "Namespace CPU share", 'topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace))', {"h": 9, "w": 12, "x": 0, "y": 5}))
+    panels.append(pie_panel(10, "Namespace RAM share", 'topk(10, sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace))', {"h": 9, "w": 12, "x": 12, "y": 5}))
+
+    panels.append(
+        timeseries_panel(
+            11,
+            "Cluster node CPU",
+            node_cpu_expr(),
+            {"h": 8, "w": 12, "x": 0, "y": 14},
+            unit="percent",
+            legend="{{node}}",
+            legend_calcs=["last"],
+            legend_display="table",
+            legend_placement="right",
+        )
+    )
+    panels.append(
+        timeseries_panel(
+            12,
+            "Cluster node RAM",
+            node_mem_expr(),
+            {"h": 8, "w": 12, "x": 12, "y": 14},
+            unit="percent",
+            legend="{{node}}",
+            legend_calcs=["last"],
+            legend_display="table",
+            legend_placement="right",
+        )
+    )
+
+    # ``> 0`` keeps only the phase a pod is actually in, so the join sees one row per pod.
+    panels.append(
+        table_panel(
+            13, "Problem pods (details)",
+            "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"} > 0)",
+            {"h": 8, "w": 12, "x": 0, "y": 22},
+            unit="s",
+            transformations=[{"id": "labelsToFields", "options": {}}],
+        )
+    )
+    panels.append(
+        table_panel(
+            14,
+            "Terminating >10m",
+            "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info",
+            {"h": 8, "w": 12, "x": 12, "y": 22},
+            unit="s",
+            transformations=[
+                {"id": "labelsToFields", "options": {}} ,
+                {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}},
+            ],
+        )
+    )
+
+    panels.append(
+        timeseries_panel(
+            15,
+            "Control plane CPU",
+            node_cpu_expr("titan-0a|titan-0b|titan-0c"),
+            {"h": 7, "w": 12, "x": 0, "y": 30},
+            unit="percent",
+            legend="{{node}}",
+        )
+    )
+    panels.append(
+        timeseries_panel(
+            16,
+            "Control plane RAM",
+            node_mem_expr("titan-0a|titan-0b|titan-0c"),
+            {"h": 7, "w": 12, "x": 12, "y": 30},
+            unit="percent",
+            legend="{{node}}",
+        )
+    )
+
+    panels.append(
+        timeseries_panel(
+            17,
+            "Root filesystem usage",
+            root_usage_expr(),
+            {"h": 8, "w": 12, "x": 0, "y": 37},
+            unit="percent",
+            legend="{{node}}",
+            legend_calcs=["last"],
+            legend_display="table",
+            legend_placement="right",
+            time_from="7d",
+        )
+    )
+
+    panels.append(
+        {
+            "id": 18,
+            "type": "bargauge",
+            "title": "Nodes closest to full root disks",
+            "datasource": PROM_DS,
+            "gridPos": {"h": 8, "w": 12, "x": 12, "y": 37},
+            "targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A", "legendFormat": "{{node}}"}],
+            "fieldConfig": {
+                "defaults": {
+                    "unit": "percent",
+                    "min": 0,
+                    "max": 100,
+                    "thresholds": {
+                        "mode": "percentage",
+                        "steps": [
+                            {"color": "green", "value": None},
+                            {"color": "yellow", "value": 50},
+                            {"color": "orange", "value": 70},
+                            {"color": "red", "value": 85},
+                        ],
+                    },
+                },
+                "overrides": [],
+            },
+            "options": {
+                "displayMode": "gradient",
+                "orientation": "horizontal",
+                "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
+            },
+        }
+    )
+
+    panels.append(
+        stat_panel(
+            19,
+            "Astreae usage",
+            astreae_usage_expr("/mnt/astreae"),
+            {"h": 6, "w": 6, "x": 0, "y": 45},
+            unit="percent",
+            thresholds=thresholds_percent,
+        )
+    )
+    panels.append(
+        stat_panel(
+            20,
+            "Asteria usage",
+            astreae_usage_expr("/mnt/asteria"),
+            {"h": 6, "w": 6, "x": 6, "y": 45},
+            unit="percent",
+            thresholds=thresholds_percent,
+        )
+    )
+    panels.append(
+        stat_panel(
+            21,
+            "Astreae free",
+            astreae_free_expr("/mnt/astreae"),
+            {"h": 6, "w": 6, "x": 12, "y": 45},
+            unit="bytesSI",
+        )
+    )
+    panels.append(
+        stat_panel(
+            22,
+            "Asteria free",
+            astreae_free_expr("/mnt/asteria"),
+            {"h": 6, "w": 6, "x": 18, "y": 45},
+            unit="bytesSI",
+        )
+    )
+
+    panels.append(
+        table_panel(
+            23,
+            "Astreae per-node usage",
+            '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} * 100)',
+            {"h": 8, "w": 12, "x": 0, "y": 51},
+            unit="percent",
+            transformations=[{"id": "labelsToFields", "options": {}}],
+        )
+    )
+    panels.append(
+        table_panel(
+            24,
+            "Asteria per-node usage",
+            '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} * 100)',
+            {"h": 8, "w": 12, "x": 12, "y": 51},
+            unit="percent",
+            transformations=[{"id": "labelsToFields", "options": {}}],
+        )
+    )
+
+    panels.append(
+        text_panel(
+            25,
+            "About this dashboard",
+            "### Atlas at a glance\n- Summary metrics above pull from dedicated Pods/Nodes/Storage dashboards\n- Click the dashboard links in the header to drill into details\n- Anonymous users see this overview; login unlocks the detailed folders",
+            {"h": 5, "w": 24, "x": 0, "y": 59},
+        )
+    )
+
+    return {
+        "uid": "atlas-overview",
+        "title": "Atlas Overview",
+        "annotations": {
+            "list": [
+                {
+                    "builtIn": 1,
+                    "datasource": {"type": "datasource", "uid": "grafana"},
+                    "enable": True,
+                    "hide": True,
+                    "iconColor": "rgba(0, 211, 255, 1)",
+                    "name": "Annotations & Alerts",
+                    "type": "dashboard",
+                }
+            ]
+        },
+        "editable": False,
+        "folderUid": "atlas-overview",
+        "graphTooltip": 0,
+        "links": [
+            {"title": "Pods dashboard", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": False},
+            {"title": "Nodes dashboard", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False},
+            {"title": "Storage dashboard", "type": "dashboard", "dashboardUid": "atlas-storage", "keepTime": False},
+        ],
+        "panels": panels,
+        "refresh": "30s",
+        "schemaVersion": 39,
+        "style": "dark",
+        "tags": ["atlas", "overview"],
+        "templating": {"list": []},
+        "time": {"from": "now-12h", "to": "now"},
+    }
+
+
+def build_pods_dashboard():
+    """Build the "Atlas Pods" dashboard: three full-width tables of unhealthy pods.
+
+    Table values are ages in seconds (since pod creation or deletion request).
+    """
+    panels = [
+        # ``> 0`` keeps only the phase a pod is actually in; kube-state-metrics
+        # also exports the other phases with value 0, which would break the join.
+        table_panel(
+            1, "Pods not running",
+            "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"} > 0)",
+            {"h": 10, "w": 24, "x": 0, "y": 0},
+            unit="s",
+            transformations=[{"id": "labelsToFields", "options": {}}],
+        ),
+        # Match on (namespace,pod) only: ``container`` is aggregated away on the
+        # right-hand side, so listing it in on() can only prevent matches.
+        table_panel(
+            2, "CrashLoop / ImagePull",
+            "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(reason) sum by (namespace,pod,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})",
+            {"h": 10, "w": 24, "x": 0, "y": 10},
+            unit="s",
+            transformations=[{"id": "labelsToFields", "options": {}}],
+        ),
+        table_panel(
+            3, "Terminating pods",
+            "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info",
+            {"h": 10, "w": 24, "x": 0, "y": 20},
+            unit="s",
+            transformations=[
+                {"id": "labelsToFields", "options": {}},
+                {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}},
+            ],
+        ),
+    ]
+    return {
+        "uid": "atlas-pods",
+        "title": "Atlas Pods",
+        "folderUid": "atlas-pods",
+        "editable": True,
+        "panels": panels,
+        "time": {"from": "now-12h", "to": "now"},
+        "annotations": {"list": []},
+        "schemaVersion": 39,
+        "style": "dark",
+        "tags": ["atlas", "pods"],
+    }
+
+
+def build_nodes_dashboard():
+    """Build the "Atlas Nodes" dashboard: four stats on top, then CPU, RAM and root-disk trends."""
+    control_plane = "titan-0a|titan-0b|titan-0c"
+    ready = 'sum(kube_node_status_condition{condition="Ready",status="true"})'
+    named = {"unit": "percent", "legend": "{{node}}", "text_mode": "value_and_name"}
+    trend = {"unit": "percent", "legend": "{{node}}", "legend_display": "table", "legend_placement": "right"}
+    panels = [
+        stat_panel(1, "Node count", "count(kube_node_info)", {"h": 5, "w": 6, "x": 0, "y": 0}),
+        stat_panel(2, "Ready nodes", ready, {"h": 5, "w": 6, "x": 6, "y": 0}),
+        stat_panel(3, "Control plane CPU avg", node_cpu_expr(control_plane), {"h": 5, "w": 6, "x": 12, "y": 0}, **named),
+        stat_panel(4, "Control plane RAM avg", node_mem_expr(control_plane), {"h": 5, "w": 6, "x": 18, "y": 0}, **named),
+        timeseries_panel(5, "Node CPU", node_cpu_expr(), {"h": 9, "w": 24, "x": 0, "y": 5}, legend_calcs=["last"], **trend),
+        timeseries_panel(6, "Node RAM", node_mem_expr(), {"h": 9, "w": 24, "x": 0, "y": 14}, legend_calcs=["last"], **trend),
+        timeseries_panel(7, "Root filesystem", root_usage_expr(), {"h": 9, "w": 24, "x": 0, "y": 23}, time_from="7d", **trend),
+    ]
+    return {
+        "uid": "atlas-nodes", "title": "Atlas Nodes", "folderUid": "atlas-nodes", "editable": True,
+        "panels": panels,
+        "time": {"from": "now-12h", "to": "now"}, "annotations": {"list": []},
+        "schemaVersion": 39, "style": "dark", "tags": ["atlas", "nodes"],
+    }
+
+
+def build_storage_dashboard():
+    """Build the "Atlas Storage" dashboard: usage/free stats, a root-disk trend and per-mount tables."""
+    per_node = '100 - (node_filesystem_avail_bytes{{mountpoint="{0}",fstype!~"tmpfs|overlay"}} / node_filesystem_size_bytes{{mountpoint="{0}",fstype!~"tmpfs|overlay"}} * 100)'
+    astreae, asteria = "/mnt/astreae", "/mnt/asteria"
+    panels = [
+        stat_panel(1, "Astreae usage", astreae_usage_expr(astreae), {"h": 5, "w": 6, "x": 0, "y": 0}, unit="percent"),
+        stat_panel(2, "Asteria usage", astreae_usage_expr(asteria), {"h": 5, "w": 6, "x": 6, "y": 0}, unit="percent"),
+        stat_panel(3, "Astreae free", astreae_free_expr(astreae), {"h": 5, "w": 6, "x": 12, "y": 0}, unit="bytesSI"),
+        stat_panel(4, "Asteria free", astreae_free_expr(asteria), {"h": 5, "w": 6, "x": 18, "y": 0}, unit="bytesSI"),
+        timeseries_panel(5, "Root filesystem", root_usage_expr(), {"h": 9, "w": 24, "x": 0, "y": 5}, unit="percent", legend="{{node}}", legend_display="table", legend_placement="right", time_from="30d"),
+        table_panel(6, "Astreae nodes", per_node.format(astreae), {"h": 10, "w": 12, "x": 0, "y": 14}, unit="percent", transformations=[{"id": "labelsToFields", "options": {}}]),
+        table_panel(7, "Asteria nodes", per_node.format(asteria), {"h": 10, "w": 12, "x": 12, "y": 14}, unit="percent", transformations=[{"id": "labelsToFields", "options": {}}]),
+    ]
+    return {
+        "uid": "atlas-storage",
+        "title": "Atlas Storage",
+        "folderUid": "atlas-storage", "editable": True,
+        "panels": panels,
+        "time": {"from": "now-12h", "to": "now"}, "annotations": {"list": []},
+        "schemaVersion": 39, "style": "dark", "tags": ["atlas", "storage"],
+    }
+
+
+# Registry keyed by dashboard uid (also the stem of its JSON file). For each
+# entry, "builder" is the function that returns the dashboard model and
+# "configmap" is the manifest path that render_configmap() writes. Iteration
+# order is the order of the tuples below.
+DASHBOARDS = {
+    uid: {
+        "builder": builder,
+        "configmap": (
+            ROOT / "services" / "monitoring" / f"grafana-dashboard-{suffix}.yaml"
+        ),
+    }
+    for uid, suffix, builder in (
+        ("atlas-overview", "overview", build_overview),
+        ("atlas-pods", "pods", build_pods_dashboard),
+        ("atlas-nodes", "nodes", build_nodes_dashboard),
+        ("atlas-storage", "storage", build_storage_dashboard),
+    )
+}
+
+
+def write_json(uid: str, data: dict) -> None:
+    """Serialise ``data`` to ``DASHBOARD_DIR/<uid>.json`` (2-space indent, trailing newline)."""
+    DASHBOARD_DIR.mkdir(parents=True, exist_ok=True)
+    (DASHBOARD_DIR / f"{uid}.json").write_text(json.dumps(data, indent=2) + "\n")
+
+
+def render_configmap(uid: str, data: dict) -> None:
+    """Wrap ``DASHBOARD_DIR/<uid>.json`` in a ConfigMap manifest at ``data["configmap"]``."""
+    source = DASHBOARD_DIR / f"{uid}.json"
+    # Round-trip through json so the embedded text is always 2-space indented,
+    # then shift every line four columns right so it sits inside the ``|``
+    # block scalar of CONFIG_TEMPLATE.
+    pretty = json.dumps(json.loads(source.read_text()), indent=2)
+    body = "\n".join(f"    {row}" for row in pretty.splitlines())
+    target = data["configmap"]
+    manifest = target.relative_to(ROOT)
+    fields = {"relative_path": manifest, "name": target.stem, "key": source.name, "payload": body}
+    target.write_text(CONFIG_TEMPLATE.format(**fields))
+    print(f"Rendered {source.name} -> {manifest}")
+
+
+def main():
+    """Parse the CLI flags, optionally rebuild the JSON, then render every ConfigMap."""
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument("--build", action="store_true", help="Regenerate dashboard JSON files from builders")
+    rebuild = cli.parse_args().build
+
+    # All JSON files are refreshed first; rendering below reads them from disk.
+    for uid, entry in (DASHBOARDS.items() if rebuild else ()):
+        write_json(uid, entry["builder"]())
+    for uid, entry in DASHBOARDS.items():
+        render_configmap(uid, entry)
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()
diff --git a/services/monitoring/dashboards/atlas-nodes.json b/services/monitoring/dashboards/atlas-nodes.json
new file mode 100644
index 0000000..d3393a9
--- /dev/null
+++ b/services/monitoring/dashboards/atlas-nodes.json
@@ -0,0 +1,369 @@
+{
+  "uid": "atlas-nodes",
+  "title": "Atlas Nodes",
+  "folderUid": "atlas-nodes",
+  "editable": true,
+  "panels": [
+    {
+      "id": 1,
+      "type": "stat",
+      "title": "Node count",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 6,
+        "x": 0,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "count(kube_node_info)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "none"
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 2,
+      "type": "stat",
+      "title": "Ready nodes",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 6,
+        "x": 6,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "none"
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 3,
+      "type": "stat",
+      "title": "Control plane CPU avg",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 6,
+        "x": 12,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}",
+          "refId": "A",
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value_and_name"
+      }
+    },
+    {
+      "id": 4,
+      "type": "stat",
+      "title": "Control plane RAM avg",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 6,
+        "x": 18,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}",
+          "refId": "A",
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value_and_name"
+      }
+    },
+    {
+      "id": 5,
+      "type": "timeseries",
+      "title": "Node CPU",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 24,
+        "x": 0,
+        "y": 5
+      },
+      "targets": [
+        {
+          "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))",
+          "refId": "A",
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "right",
+          "calcs": [
+            "last"
+          ]
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      }
+    },
+    {
+      "id": 6,
+      "type": "timeseries",
+      "title": "Node RAM",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 24,
+        "x": 0,
+        "y": 14
+      },
+      "targets": [
+        {
+          "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))",
+          "refId": "A",
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "right",
+          "calcs": [
+            "last"
+          ]
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      }
+    },
+    {
+      "id": 7,
+      "type": "timeseries",
+      "title": "Root filesystem",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 24,
+        "x": 0,
+        "y": 23
+      },
+      "targets": [
+        {
+          "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)",
+          "refId": "A",
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "right"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      },
+      "timeFrom": "7d"
+    }
+  ],
+  "time": {
+    "from": "now-12h",
+    "to": "now"
+  },
+  "annotations": {
+    "list": []
+  },
+  "schemaVersion": 39,
+  "style": "dark",
+  "tags": [
+    "atlas",
+    "nodes"
+  ]
+}
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
new file mode 100644
index 0000000..d7a0d27
--- /dev/null
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -0,0 +1,1270 @@
+{
+  "uid": "atlas-overview",
+  "title": "Atlas Overview",
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": {
+          "type": "datasource",
+          "uid": "grafana"
+        },
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": false,
+  "folderUid": "atlas-overview",
+  "graphTooltip": 0,
+  "links": [
+    {
+      "title": "Pods dashboard",
+      "type": "dashboard",
+      "dashboardUid": "atlas-pods",
+      "keepTime": false
+    },
+    {
+      "title": "Nodes dashboard",
+      "type": "dashboard",
+      "dashboardUid": "atlas-nodes",
+      "keepTime": false
+    },
+    {
+      "title": "Storage dashboard",
+      "type": "dashboard",
+      "dashboardUid": "atlas-storage",
+      "keepTime": false
+    }
+  ],
+  "panels": [
+    {
+      "id": 1,
+      "type": "stat",
+      "title": "Running pods",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 4,
+        "x": 0,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "sum(kube_pod_status_phase{phase=\"Running\"})",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "none"
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 2,
+      "type": "stat",
+      "title": "Ready nodes",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 4,
+        "x": 4,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "none"
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 3,
+      "type": "stat",
+      "title": "Control plane ready",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 4,
+        "x": 8,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "none"
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 4,
+      "type": "stat",
+      "title": "Control plane schedulable",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 4,
+        "x": 12,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "sum(kube_node_spec_unschedulable{node=~\"titan-0a|titan-0b|titan-0c\"} == bool 0)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "none"
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 5,
+      "type": "stat",
+      "title": "Problem pods",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 4,
+        "x": 16,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "sum(kube_pod_status_phase{phase!~\"Running|Succeeded\"})",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "none"
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 6,
+      "type": "stat",
+      "title": "Stuck terminating",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 4,
+        "x": 20,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "sum((time() - kube_pod_deletion_timestamp) > bool 600) or vector(0)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "none"
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 7,
+      "type": "stat",
+      "title": "Hottest node: CPU",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 12,
+        "x": 0,
+        "y": 5
+      },
+      "targets": [
+        {
+          "expr": "topk(1, avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)))",
+          "refId": "A",
+          "instant": true,
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 70
+              },
+              {
+                "color": "red",
+                "value": 85
+              }
+            ]
+          },
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value_and_name"
+      }
+    },
+    {
+      "id": 8,
+      "type": "stat",
+      "title": "Hottest node: RAM",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 12,
+        "x": 12,
+        "y": 5
+      },
+      "targets": [
+        {
+          "expr": "topk(1, avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)))",
+          "refId": "A",
+          "instant": true,
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 70
+              },
+              {
+                "color": "red",
+                "value": 85
+              }
+            ]
+          },
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value_and_name"
+      }
+    },
+    {
+      "id": 9,
+      "type": "piechart",
+      "title": "Namespace CPU share",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 0,
+        "y": 5
+      },
+      "targets": [
+        {
+          "expr": "topk(10, 100 * sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) / scalar(sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))))",
+          "refId": "A",
+          "legendFormat": "{{namespace}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "list",
+          "placement": "right"
+        },
+        "pieType": "pie",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
+      }
+    },
+    {
+      "id": 10,
+      "type": "piechart",
+      "title": "Namespace RAM share",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 12,
+        "y": 5
+      },
+      "targets": [
+        {
+          "expr": "topk(10, 100 * sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / scalar(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"})))",
+          "refId": "A",
+          "legendFormat": "{{namespace}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "list",
+          "placement": "right"
+        },
+        "pieType": "pie",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
+      }
+    },
+    {
+      "id": 11,
+      "type": "timeseries",
+      "title": "Cluster node CPU",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 14
+      },
+      "targets": [
+        {
+          "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))",
+          "refId": "A",
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "right",
+          "calcs": [
+            "last"
+          ]
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      }
+    },
+    {
+      "id": 12,
+      "type": "timeseries",
+      "title": "Cluster node RAM",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 14
+      },
+      "targets": [
+        {
+          "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))",
+          "refId": "A",
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "right",
+          "calcs": [
+            "last"
+          ]
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      }
+    },
+    {
+      "id": 13,
+      "type": "table",
+      "title": "Problem pods (details)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 22
+      },
+      "targets": [
+        {
+          "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"} > 0)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s"
+        },
+        "overrides": []
+      },
+      "options": {
+        "showHeader": true
+      },
+      "transformations": [
+        {
+          "id": "labelsToFields",
+          "options": {}
+        }
+      ]
+    },
+    {
+      "id": 14,
+      "type": "table",
+      "title": "Terminating >10m",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 22
+      },
+      "targets": [
+        {
+          "expr": "((time() - kube_pod_deletion_timestamp) > 600) * on(namespace,pod) group_left(node) kube_pod_info",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s"
+        },
+        "overrides": []
+      },
+      "options": {
+        "showHeader": true
+      },
+      "transformations": [
+        {
+          "id": "labelsToFields",
+          "options": {}
+        },
+        {
+          "id": "filterByValue",
+          "options": {
+            "match": "Value",
+            "operator": "gt",
+            "value": 600
+          }
+        }
+      ]
+    },
+    {
+      "id": 15,
+      "type": "timeseries",
+      "title": "Control plane CPU",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 7,
+        "w": 12,
+        "x": 0,
+        "y": 30
+      },
+      "targets": [
+        {
+          "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}",
+          "refId": "A",
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "bottom"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      }
+    },
+    {
+      "id": 16,
+      "type": "timeseries",
+      "title": "Control plane RAM",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 7,
+        "w": 12,
+        "x": 12,
+        "y": 30
+      },
+      "targets": [
+        {
+          "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}",
+          "refId": "A",
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "bottom"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      }
+    },
+    {
+      "id": 17,
+      "type": "timeseries",
+      "title": "Root filesystem usage",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 37
+      },
+      "targets": [
+        {
+          "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)",
+          "refId": "A",
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "right",
+          "calcs": [
+            "last"
+          ]
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      },
+      "timeFrom": "7d"
+    },
+    {
+      "id": 18,
+      "type": "bargauge",
+      "title": "Nodes closest to full root disks",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 37
+      },
+      "targets": [
+        {
+          "expr": "topk(8, avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info))",
+          "refId": "A",
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent",
+          "min": 0,
+          "max": 100,
+          "thresholds": {
+            "mode": "percentage",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 50
+              },
+              {
+                "color": "orange",
+                "value": 70
+              },
+              {
+                "color": "red",
+                "value": 85
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "displayMode": "gradient",
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
+      }
+    },
+    {
+      "id": 19,
+      "type": "stat",
+      "title": "Astreae usage",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 6,
+        "x": 0,
+        "y": 45
+      },
+      "targets": [
+        {
+          "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 70
+              },
+              {
+                "color": "red",
+                "value": 85
+              }
+            ]
+          },
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 20,
+      "type": "stat",
+      "title": "Asteria usage",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 6,
+        "x": 6,
+        "y": 45
+      },
+      "targets": [
+        {
+          "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 70
+              },
+              {
+                "color": "red",
+                "value": 85
+              }
+            ]
+          },
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 21,
+      "type": "stat",
+      "title": "Astreae free",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 6,
+        "x": 12,
+        "y": 45
+      },
+      "targets": [
+        {
+          "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "decbytes"
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 22,
+      "type": "stat",
+      "title": "Asteria free",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 6,
+        "x": 18,
+        "y": 45
+      },
+      "targets": [
+        {
+          "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "decbytes"
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 23,
+      "type": "table",
+      "title": "Astreae per-node usage",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 51
+      },
+      "targets": [
+        {
+          "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "showHeader": true
+      },
+      "transformations": [
+        {
+          "id": "labelsToFields",
+          "options": {}
+        }
+      ]
+    },
+    {
+      "id": 24,
+      "type": "table",
+      "title": "Asteria per-node usage",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 51
+      },
+      "targets": [
+        {
+          "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "showHeader": true
+      },
+      "transformations": [
+        {
+          "id": "labelsToFields",
+          "options": {}
+        }
+      ]
+    },
+    {
+      "id": 25,
+      "type": "text",
+      "title": "About this dashboard",
+      "gridPos": {
+        "h": 5,
+        "w": 24,
+        "x": 0,
+        "y": 59
+      },
+      "datasource": null,
+      "options": {
+        "mode": "markdown",
+        "content": "### Atlas at a glance\n- Summary metrics above pull from dedicated Pods/Nodes/Storage dashboards\n- Click the dashboard links in the header to drill into details\n- Anonymous users see this overview; login unlocks the detailed folders"
+      }
+    }
+  ],
+  "refresh": "30s",
+  "schemaVersion": 39,
+  "style": "dark",
+  "tags": [
+    "atlas",
+    "overview"
+  ],
+  "templating": {
+    "list": []
+  },
+  "time": {
+    "from": "now-12h",
+    "to": "now"
+  }
+}
diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json
new file mode 100644
index 0000000..91f80eb
--- /dev/null
+++ b/services/monitoring/dashboards/atlas-pods.json
@@ -0,0 +1,137 @@
+{
+  "uid": "atlas-pods",
+  "title": "Atlas Pods",
+  "folderUid": "atlas-pods",
+  "editable": true,
+  "panels": [
+    {
+      "id": 1,
+      "type": "table",
+      "title": "Pods not running",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 10,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"} > 0)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s"
+        },
+        "overrides": []
+      },
+      "options": {
+        "showHeader": true
+      },
+      "transformations": [
+        {
+          "id": "labelsToFields",
+          "options": {}
+        }
+      ]
+    },
+    {
+      "id": 2,
+      "type": "table",
+      "title": "CrashLoop / ImagePull",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 10,
+        "w": 24,
+        "x": 0,
+        "y": 10
+      },
+      "targets": [
+        {
+          "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(reason) sum by (namespace,pod,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"} > 0)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s"
+        },
+        "overrides": []
+      },
+      "options": {
+        "showHeader": true
+      },
+      "transformations": [
+        {
+          "id": "labelsToFields",
+          "options": {}
+        }
+      ]
+    },
+    {
+      "id": 3,
+      "type": "table",
+      "title": "Terminating pods",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 10,
+        "w": 24,
+        "x": 0,
+        "y": 20
+      },
+      "targets": [
+        {
+          "expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s"
+        },
+        "overrides": []
+      },
+      "options": {
+        "showHeader": true
+      },
+      "transformations": [
+        {
+          "id": "labelsToFields",
+          "options": {}
+        },
+        {
+          "id": "filterByValue",
+          "options": {
+            "match": "Value",
+            "operator": "gt",
+            "value": 600
+          }
+        }
+      ]
+    }
+  ],
+  "time": {
+    "from": "now-12h",
+    "to": "now"
+  },
+  "annotations": {
+    "list": []
+  },
+  "schemaVersion": 39,
+  "style": "dark",
+  "tags": [
+    "atlas",
+    "pods"
+  ]
+}
diff --git a/services/monitoring/dashboards/atlas-storage.json b/services/monitoring/dashboards/atlas-storage.json
new file mode 100644
index 0000000..aa1948d
--- /dev/null
+++ b/services/monitoring/dashboards/atlas-storage.json
@@ -0,0 +1,359 @@
+{
+  "uid": "atlas-storage",
+  "title": "Atlas Storage",
+  "folderUid": "atlas-storage",
+  "editable": true,
+  "panels": [
+    {
+      "id": 1,
+      "type": "stat",
+      "title": "Astreae usage",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 6,
+        "x": 0,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 2,
+      "type": "stat",
+      "title": "Asteria usage",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 6,
+        "x": 6,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 3,
+      "type": "stat",
+      "title": "Astreae free",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 6,
+        "x": 12,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "bytesSI"
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 4,
+      "type": "stat",
+      "title": "Asteria free",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 6,
+        "x": 18,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "bytesSI"
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 5,
+      "type": "timeseries",
+      "title": "Root filesystem",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 24,
+        "x": 0,
+        "y": 5
+      },
+      "targets": [
+        {
+          "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)",
+          "refId": "A",
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "right"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      },
+      "timeFrom": "30d"
+    },
+    {
+      "id": 6,
+      "type": "table",
+      "title": "Astreae nodes",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 10,
+        "w": 12,
+        "x": 0,
+        "y": 14
+      },
+      "targets": [
+        {
+          "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "showHeader": true
+      },
+      "transformations": [
+        {
+          "id": "labelsToFields",
+          "options": {}
+        }
+      ]
+    },
+    {
+      "id": 7,
+      "type": "table",
+      "title": "Asteria nodes",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 10,
+        "w": 12,
+        "x": 12,
+        "y": 14
+      },
+      "targets": [
+        {
+          "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "showHeader": true
+      },
+      "transformations": [
+        {
+          "id": "labelsToFields",
+          "options": {}
+        }
+      ]
+    }
+  ],
+  "time": {
+    "from": "now-12h",
+    "to": "now"
+  },
+  "annotations": {
+    "list": []
+  },
+  "schemaVersion": 39,
+  "style": "dark",
+  "tags": [
+    "atlas",
+    "storage"
+  ]
+}
diff --git a/services/monitoring/grafana-dashboard-sre.yaml b/services/monitoring/grafana-dashboard-nodes.yaml
similarity index 53%
rename from services/monitoring/grafana-dashboard-sre.yaml
rename to services/monitoring/grafana-dashboard-nodes.yaml
index d5d8dca..516f207 100644
--- a/services/monitoring/grafana-dashboard-sre.yaml
+++ b/services/monitoring/grafana-dashboard-nodes.yaml
@@ -1,38 +1,22 @@
-# services/monitoring/grafana-dashboard-sre.yaml
+# services/monitoring/grafana-dashboard-nodes.yaml
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: grafana-dashboard-sre
+  name: grafana-dashboard-nodes
   labels:
     grafana_dashboard: "1"
 data:
-  atlas-sre-overview.json: |
+  atlas-nodes.json: |
     {
-      "annotations": {
-        "list": [
-          {
-            "builtIn": 1,
-            "datasource": {
-              "type": "datasource",
-              "uid": "grafana"
-            },
-            "enable": true,
-            "hide": true,
-            "iconColor": "rgba(0, 211, 255, 1)",
-            "name": "Annotations & Alerts",
-            "type": "dashboard"
-          }
-        ]
-      },
+      "uid": "atlas-nodes",
+      "title": "Atlas Nodes",
+      "folderUid": "atlas-nodes",
       "editable": true,
-      "folderUid": "atlas-sre",
-      "graphTooltip": 0,
-      "links": [],
       "panels": [
         {
           "id": 1,
           "type": "stat",
-          "title": "Ready nodes",
+          "title": "Node count",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -45,7 +29,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) / count(kube_node_info) * 100",
+              "expr": "count(kube_node_info)",
               "refId": "A"
             }
           ],
@@ -56,23 +40,19 @@ data:
               },
               "mappings": [],
               "thresholds": {
-                "mode": "percentage",
+                "mode": "absolute",
                 "steps": [
                   {
-                    "color": "red",
+                    "color": "rgba(115, 115, 115, 1)",
                     "value": null
                   },
-                  {
-                    "color": "yellow",
-                    "value": 95
-                  },
                   {
                     "color": "green",
-                    "value": 99
+                    "value": 1
                   }
                 ]
               },
-              "unit": "percent"
+              "unit": "none"
             },
             "overrides": []
           },
@@ -93,7 +73,7 @@ data:
         {
           "id": 2,
           "type": "stat",
-          "title": "Pending pods",
+          "title": "Ready nodes",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -106,7 +86,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(kube_pod_status_phase{phase=\"Pending\"})",
+              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
               "refId": "A"
             }
           ],
@@ -120,16 +100,12 @@ data:
                 "mode": "absolute",
                 "steps": [
                   {
-                    "color": "green",
+                    "color": "rgba(115, 115, 115, 1)",
                     "value": null
                   },
                   {
-                    "color": "yellow",
-                    "value": 3
-                  },
-                  {
-                    "color": "red",
-                    "value": 10
+                    "color": "green",
+                    "value": 1
                   }
                 ]
               },
@@ -154,7 +130,7 @@ data:
         {
           "id": 3,
           "type": "stat",
-          "title": "Unavailable deployment replicas",
+          "title": "Control plane CPU avg",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -167,8 +143,9 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(kube_deployment_status_replicas_unavailable)",
-              "refId": "A"
+              "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}",
+              "refId": "A",
+              "legendFormat": "{{node}}"
             }
           ],
           "fieldConfig": {
@@ -181,20 +158,16 @@ data:
                 "mode": "absolute",
                 "steps": [
                   {
-                    "color": "green",
+                    "color": "rgba(115, 115, 115, 1)",
                     "value": null
                   },
                   {
-                    "color": "yellow",
+                    "color": "green",
                     "value": 1
-                  },
-                  {
-                    "color": "red",
-                    "value": 3
                   }
                 ]
               },
-              "unit": "none"
+              "unit": "percent"
             },
             "overrides": []
           },
@@ -209,13 +182,13 @@ data:
               "fields": "",
               "values": false
             },
-            "textMode": "value"
+            "textMode": "value_and_name"
           }
         },
         {
           "id": 4,
           "type": "stat",
-          "title": "Active alerts",
+          "title": "Control plane RAM avg",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -228,8 +201,9 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(ALERTS{alertstate=\"firing\"})",
-              "refId": "A"
+              "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}",
+              "refId": "A",
+              "legendFormat": "{{node}}"
             }
           ],
           "fieldConfig": {
@@ -242,20 +216,16 @@ data:
                 "mode": "absolute",
                 "steps": [
                   {
-                    "color": "green",
+                    "color": "rgba(115, 115, 115, 1)",
                     "value": null
                   },
                   {
-                    "color": "yellow",
+                    "color": "green",
                     "value": 1
-                  },
-                  {
-                    "color": "red",
-                    "value": 3
                   }
                 ]
               },
-              "unit": "none"
+              "unit": "percent"
             },
             "overrides": []
           },
@@ -270,20 +240,20 @@ data:
               "fields": "",
               "values": false
             },
-            "textMode": "value"
+            "textMode": "value_and_name"
           }
         },
         {
           "id": 5,
           "type": "timeseries",
-          "title": "Node CPU usage",
+          "title": "Node CPU",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
             "h": 9,
-            "w": 12,
+            "w": 24,
             "x": 0,
             "y": 5
           },
@@ -303,7 +273,10 @@ data:
           "options": {
             "legend": {
               "displayMode": "table",
-              "placement": "bottom"
+              "placement": "right",
+              "calcs": [
+                "last"
+              ]
             },
             "tooltip": {
               "mode": "multi"
@@ -313,16 +286,16 @@ data:
         {
           "id": 6,
           "type": "timeseries",
-          "title": "Node memory usage",
+          "title": "Node RAM",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
             "h": 9,
-            "w": 12,
-            "x": 12,
-            "y": 5
+            "w": 24,
+            "x": 0,
+            "y": 14
           },
           "targets": [
             {
@@ -340,7 +313,10 @@ data:
           "options": {
             "legend": {
               "displayMode": "table",
-              "placement": "bottom"
+              "placement": "right",
+              "calcs": [
+                "last"
+              ]
             },
             "tooltip": {
               "mode": "multi"
@@ -350,201 +326,22 @@ data:
         {
           "id": 7,
           "type": "timeseries",
-          "title": "Top pod CPU (5m avg)",
+          "title": "Root filesystem",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
             "h": 9,
-            "w": 12,
-            "x": 0,
-            "y": 14
-          },
-          "targets": [
-            {
-              "expr": "topk(5, sum(rate(container_cpu_usage_seconds_total{pod!=\"\",container!=\"\"}[5m])) by (namespace,pod))",
-              "refId": "A",
-              "legendFormat": "{{namespace}}/{{pod}}"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "cores"
-            },
-            "overrides": []
-          },
-          "options": {
-            "legend": {
-              "displayMode": "table",
-              "placement": "bottom"
-            },
-            "tooltip": {
-              "mode": "multi"
-            }
-          }
-        },
-        {
-          "id": 8,
-          "type": "timeseries",
-          "title": "Top pod memory working set",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 9,
-            "w": 12,
-            "x": 12,
-            "y": 14
-          },
-          "targets": [
-            {
-              "expr": "topk(5, sum(container_memory_working_set_bytes{pod!=\"\",container!=\"\"}) by (namespace,pod))",
-              "refId": "A",
-              "legendFormat": "{{namespace}}/{{pod}}"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "bytes"
-            },
-            "overrides": []
-          },
-          "options": {
-            "legend": {
-              "displayMode": "table",
-              "placement": "bottom"
-            },
-            "tooltip": {
-              "mode": "multi"
-            }
-          }
-        },
-        {
-          "id": 9,
-          "type": "bargauge",
-          "title": "Namespace restart rate (6h)",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 8,
-            "w": 12,
+            "w": 24,
             "x": 0,
             "y": 23
           },
           "targets": [
             {
-              "expr": "topk(8, sum(increase(kube_pod_container_status_restarts_total{namespace!=\"\"}[6h])) by (namespace))",
-              "refId": "A"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "none"
-            },
-            "overrides": []
-          },
-          "options": {
-            "displayMode": "gradient",
-            "orientation": "horizontal",
-            "reduceOptions": {
-              "calcs": [
-                "lastNotNull"
-              ],
-              "fields": "",
-              "values": false
-            }
-          }
-        },
-        {
-          "id": 10,
-          "type": "table",
-          "title": "Deployments missing replicas",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 8,
-            "w": 12,
-            "x": 12,
-            "y": 23
-          },
-          "targets": [
-            {
-              "expr": "topk(10, sum by (namespace,deployment) (kube_deployment_status_replicas_unavailable))",
-              "refId": "A"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "none"
-            },
-            "overrides": []
-          },
-          "options": {
-            "showHeader": true
-          }
-        },
-        {
-          "id": 11,
-          "type": "timeseries",
-          "title": "Pod phase breakdown",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 9,
-            "w": 12,
-            "x": 0,
-            "y": 31
-          },
-          "targets": [
-            {
-              "expr": "sum(kube_pod_status_phase) by (phase)",
+              "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)",
               "refId": "A",
-              "legendFormat": "{{phase}}"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "none"
-            },
-            "overrides": []
-          },
-          "options": {
-            "legend": {
-              "displayMode": "table",
-              "placement": "bottom"
-            },
-            "tooltip": {
-              "mode": "multi"
-            }
-          }
-        },
-        {
-          "id": 12,
-          "type": "timeseries",
-          "title": "PVC usage (top 8)",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 9,
-            "w": 12,
-            "x": 12,
-            "y": 31
-          },
-          "targets": [
-            {
-              "expr": "topk(8, sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100))",
-              "refId": "A",
-              "legendFormat": "{{namespace}}/{{persistentvolumeclaim}}"
+              "legendFormat": "{{node}}"
             }
           ],
           "fieldConfig": {
@@ -556,28 +353,26 @@ data:
           "options": {
             "legend": {
               "displayMode": "table",
-              "placement": "bottom"
+              "placement": "right"
             },
             "tooltip": {
               "mode": "multi"
             }
-          }
+          },
+          "timeFrom": "7d"
         }
       ],
-      "schemaVersion": 39,
-      "style": "dark",
-      "tags": [
-        "atlas",
-        "sre"
-      ],
-      "templating": {
-        "list": []
-      },
       "time": {
         "from": "now-12h",
         "to": "now"
       },
-      "title": "Atlas SRE Overview",
-      "uid": "atlas-sre",
-      "version": 4
+      "annotations": {
+        "list": []
+      },
+      "schemaVersion": 39,
+      "style": "dark",
+      "tags": [
+        "atlas",
+        "nodes"
+      ]
     }
diff --git a/services/monitoring/grafana-dashboard-public.yaml b/services/monitoring/grafana-dashboard-overview.yaml
similarity index 67%
rename from services/monitoring/grafana-dashboard-public.yaml
rename to services/monitoring/grafana-dashboard-overview.yaml
index 35fa124..a20e05a 100644
--- a/services/monitoring/grafana-dashboard-public.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -1,13 +1,15 @@
-# services/monitoring/grafana-dashboard-public.yaml
+# services/monitoring/grafana-dashboard-overview.yaml
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: grafana-dashboard-public
+  name: grafana-dashboard-overview
   labels:
     grafana_dashboard: "1"
 data:
-  atlas-public-overview.json: |
+  atlas-overview.json: |
     {
+      "uid": "atlas-overview",
+      "title": "Atlas Overview",
       "annotations": {
         "list": [
           {
@@ -25,9 +27,28 @@ data:
         ]
       },
       "editable": false,
-      "folderUid": "atlas-public",
+      "folderUid": "atlas-overview",
       "graphTooltip": 0,
-      "links": [],
+      "links": [
+        {
+          "title": "Pods dashboard",
+          "type": "dashboard",
+          "dashboardUid": "atlas-pods",
+          "keepTime": false
+        },
+        {
+          "title": "Nodes dashboard",
+          "type": "dashboard",
+          "dashboardUid": "atlas-nodes",
+          "keepTime": false
+        },
+        {
+          "title": "Storage dashboard",
+          "type": "dashboard",
+          "dashboardUid": "atlas-storage",
+          "keepTime": false
+        }
+      ],
       "panels": [
         {
           "id": 1,
@@ -146,7 +167,7 @@ data:
         {
           "id": 3,
           "type": "stat",
-          "title": "Cluster nodes",
+          "title": "Control plane ready",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -159,7 +180,7 @@ data:
           },
           "targets": [
             {
-              "expr": "count(kube_node_info)",
+              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})",
               "refId": "A"
             }
           ],
@@ -203,7 +224,7 @@ data:
         {
           "id": 4,
           "type": "stat",
-          "title": "Hottest node CPU",
+          "title": "Control plane schedulable",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -216,10 +237,182 @@ data:
           },
           "targets": [
             {
-              "expr": "topk(1, avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)))",
+              "expr": "sum(kube_node_spec_unschedulable{node=~\"titan-0a|titan-0b|titan-0c\"} == bool 0)",
               "refId": "A"
             }
           ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "none"
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 5,
+          "type": "stat",
+          "title": "Problem pods",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 5,
+            "w": 4,
+            "x": 16,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "sum(kube_pod_status_phase{phase!~\"Running|Succeeded\"})",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "red",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "none"
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 6,
+          "type": "stat",
+          "title": "Stuck terminating",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 5,
+            "w": 4,
+            "x": 20,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "sum(((time() - kube_pod_deletion_timestamp) > bool 600))",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "red",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "none"
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 7,
+          "type": "stat",
+          "title": "Hottest node: CPU",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 5,
+            "w": 4,
+            "x": 24,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "topk(1, avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)))",
+              "refId": "A",
+              "legendFormat": "{{node}}"
+            }
+          ],
           "fieldConfig": {
             "defaults": {
               "color": {
@@ -262,9 +455,9 @@ data:
           }
         },
         {
-          "id": 5,
+          "id": 8,
           "type": "stat",
-          "title": "Hottest node memory",
+          "title": "Hottest node: RAM",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -272,13 +465,14 @@ data:
           "gridPos": {
             "h": 5,
             "w": 4,
-            "x": 16,
+            "x": 28,
             "y": 0
           },
           "targets": [
             {
-              "expr": "topk(1, avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)))",
-              "refId": "A"
+              "expr": "topk(1, avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)))",
+              "refId": "A",
+              "legendFormat": "{{node}}"
             }
           ],
           "fieldConfig": {
@@ -296,11 +490,11 @@ data:
                   },
                   {
                     "color": "yellow",
-                    "value": 75
+                    "value": 70
                   },
                   {
                     "color": "red",
-                    "value": 90
+                    "value": 85
                   }
                 ]
               },
@@ -323,68 +517,7 @@ data:
           }
         },
         {
-          "id": 6,
-          "type": "stat",
-          "title": "Failed pods (24h)",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 5,
-            "w": 4,
-            "x": 20,
-            "y": 0
-          },
-          "targets": [
-            {
-              "expr": "sum(increase(kube_pod_status_phase{phase=\"Failed\"}[24h]))",
-              "refId": "A"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "color": {
-                "mode": "palette-classic"
-              },
-              "mappings": [],
-              "thresholds": {
-                "mode": "absolute",
-                "steps": [
-                  {
-                    "color": "green",
-                    "value": null
-                  },
-                  {
-                    "color": "yellow",
-                    "value": 1
-                  },
-                  {
-                    "color": "red",
-                    "value": 3
-                  }
-                ]
-              },
-              "unit": "none"
-            },
-            "overrides": []
-          },
-          "options": {
-            "colorMode": "value",
-            "graphMode": "area",
-            "justifyMode": "center",
-            "reduceOptions": {
-              "calcs": [
-                "lastNotNull"
-              ],
-              "fields": "",
-              "values": false
-            },
-            "textMode": "value"
-          }
-        },
-        {
-          "id": 7,
+          "id": 9,
           "type": "piechart",
           "title": "Namespace CPU share",
           "datasource": {
@@ -400,12 +533,13 @@ data:
           "targets": [
             {
               "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace))",
-              "refId": "A"
+              "refId": "A",
+              "legendFormat": "{{namespace}}"
             }
           ],
           "fieldConfig": {
             "defaults": {
-              "unit": "cores"
+              "unit": "cores"
             },
             "overrides": []
           },
@@ -425,9 +559,9 @@ data:
           }
         },
         {
-          "id": 8,
+          "id": 10,
           "type": "piechart",
-          "title": "Namespace memory share",
+          "title": "Namespace RAM share",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -441,12 +575,13 @@ data:
           "targets": [
             {
               "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace))",
-              "refId": "A"
+              "refId": "A",
+              "legendFormat": "{{namespace}}"
             }
           ],
           "fieldConfig": {
             "defaults": {
-              "unit": "bytes"
+              "unit": "bytes"
             },
             "overrides": []
           },
@@ -455,7 +590,7 @@ data:
               "displayMode": "list",
               "placement": "right"
             },
-            "pieType": "donut",
+            "pieType": "pie",
             "reduceOptions": {
               "calcs": [
                 "lastNotNull"
@@ -466,9 +601,9 @@ data:
           }
         },
         {
-          "id": 9,
+          "id": 11,
           "type": "timeseries",
-          "title": "Node CPU usage (per node)",
+          "title": "Cluster node CPU",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -495,7 +630,10 @@ data:
           "options": {
             "legend": {
               "displayMode": "table",
-              "placement": "bottom"
+              "placement": "right",
+              "calcs": [
+                "last"
+              ]
             },
             "tooltip": {
               "mode": "multi"
@@ -503,9 +641,9 @@ data:
           }
         },
         {
-          "id": 10,
+          "id": 12,
           "type": "timeseries",
-          "title": "Node memory usage (per node)",
+          "title": "Cluster node RAM",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -532,80 +670,20 @@ data:
           "options": {
             "legend": {
               "displayMode": "table",
-              "placement": "bottom"
+              "placement": "right",
+              "calcs": [
+                "last"
+              ]
             },
             "tooltip": {
               "mode": "multi"
             }
           }
         },
-        {
-          "id": 11,
-          "type": "table",
-          "title": "Key service availability",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 7,
-            "w": 12,
-            "x": 0,
-            "y": 22
-          },
-          "targets": [
-            {
-              "expr": "label_replace((sum by (deployment,namespace) (kube_deployment_status_replicas_available{deployment=~\"traefik|gitea|grafana\",namespace=~\"traefik|gitea|monitoring\"}) / sum by (deployment,namespace) (kube_deployment_spec_replicas{deployment=~\"traefik|gitea|grafana\",namespace=~\"traefik|gitea|monitoring\"})), \"service\", \"$1\", \"deployment\", \"(.*)\") or label_replace((sum by (statefulset,namespace) (kube_statefulset_status_replicas_ready{statefulset=~\"vault|alertmanager|victoria-metrics-single-server\",namespace=~\"vault|monitoring\"}) / sum by (statefulset,namespace) (kube_statefulset_status_replicas{statefulset=~\"vault|alertmanager|victoria-metrics-single-server\",namespace=~\"vault|monitoring\"})), \"service\", \"$1\", \"statefulset\", \"(.*)\")",
-              "refId": "A"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "custom": {
-                "align": "auto"
-              },
-              "unit": "percent"
-            },
-            "overrides": []
-          },
-          "options": {
-            "showHeader": true
-          }
-        },
-        {
-          "id": 12,
-          "type": "table",
-          "title": "Failed pods by namespace (24h)",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 7,
-            "w": 12,
-            "x": 12,
-            "y": 22
-          },
-          "targets": [
-            {
-              "expr": "topk(10, sum by (namespace) (increase(kube_pod_status_phase{phase=\"Failed\"}[24h])))",
-              "refId": "A"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "none"
-            },
-            "overrides": []
-          },
-          "options": {
-            "showHeader": true
-          }
-        },
         {
           "id": 13,
-          "type": "timeseries",
-          "title": "Root filesystem usage per node",
+          "type": "table",
+          "title": "Problem pods (details)",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -614,11 +692,91 @@ data:
             "h": 8,
             "w": 12,
             "x": 0,
-            "y": 29
+            "y": 22
           },
           "targets": [
             {
-              "expr": "avg by (node) (((label_replace(1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))",
+              "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"} > 0)",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "s"
+            },
+            "overrides": []
+          },
+          "options": {
+            "showHeader": true
+          },
+          "transformations": [
+            {
+              "id": "labelsToFields",
+              "options": {}
+            }
+          ]
+        },
+        {
+          "id": 14,
+          "type": "table",
+          "title": "Terminating >10m",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 22
+          },
+          "targets": [
+            {
+              "expr": "((time() - kube_pod_deletion_timestamp) > 600) * on(namespace,pod) group_left(node) kube_pod_info",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "s"
+            },
+            "overrides": []
+          },
+          "options": {
+            "showHeader": true
+          },
+          "transformations": [
+            {
+              "id": "labelsToFields",
+              "options": {}
+            },
+            {
+              "id": "filterByValue",
+              "options": {
+                "match": "Value",
+                "operator": "gt",
+                "value": 600
+              }
+            }
+          ]
+        },
+        {
+          "id": 15,
+          "type": "timeseries",
+          "title": "Control plane CPU",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 7,
+            "w": 12,
+            "x": 0,
+            "y": 30
+          },
+          "targets": [
+            {
+              "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}",
               "refId": "A",
               "legendFormat": "{{node}}"
             }
@@ -640,7 +798,85 @@ data:
           }
         },
         {
-          "id": 14,
+          "id": 16,
+          "type": "timeseries",
+          "title": "Control plane RAM",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 7,
+            "w": 12,
+            "x": 12,
+            "y": 30
+          },
+          "targets": [
+            {
+              "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}",
+              "refId": "A",
+              "legendFormat": "{{node}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 17,
+          "type": "timeseries",
+          "title": "Root filesystem usage",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 37
+          },
+          "targets": [
+            {
+              "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)",
+              "refId": "A",
+              "legendFormat": "{{node}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "right",
+              "calcs": [
+                "last"
+              ]
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          },
+          "timeFrom": "7d"
+        },
+        {
+          "id": 18,
           "type": "bargauge",
           "title": "Nodes closest to full root disks",
           "datasource": {
@@ -651,19 +887,41 @@ data:
             "h": 8,
             "w": 12,
             "x": 12,
-            "y": 29
+            "y": 37
           },
           "targets": [
             {
-              "expr": "topk(8, avg by (node) (((label_replace(1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)))",
-              "refId": "A"
+              "expr": "topk(8, avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info))",
+              "refId": "A",
+              "legendFormat": "{{node}}"
             }
           ],
           "fieldConfig": {
             "defaults": {
               "unit": "percent",
               "min": 0,
-              "max": 100
+              "max": 100,
+              "thresholds": {
+                "mode": "percentage",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 50
+                  },
+                  {
+                    "color": "orange",
+                    "value": 70
+                  },
+                  {
+                    "color": "red",
+                    "value": 85
+                  }
+                ]
+              }
             },
             "overrides": []
           },
@@ -680,7 +938,7 @@ data:
           }
         },
         {
-          "id": 15,
+          "id": 19,
           "type": "stat",
           "title": "Astreae usage",
           "datasource": {
@@ -688,14 +946,14 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 7,
+            "h": 6,
             "w": 6,
             "x": 0,
-            "y": 37
+            "y": 45
           },
           "targets": [
             {
-              "expr": "(sum(longhorn_disk_usage_bytes{disk=~\"astreae-.*\"}) / sum(longhorn_disk_capacity_bytes{disk=~\"astreae-.*\"})) * 100",
+              "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)",
               "refId": "A"
             }
           ],
@@ -741,7 +999,7 @@ data:
           }
         },
         {
-          "id": 16,
+          "id": 20,
           "type": "stat",
           "title": "Asteria usage",
           "datasource": {
@@ -749,14 +1007,14 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 7,
+            "h": 6,
             "w": 6,
             "x": 6,
-            "y": 37
+            "y": 45
           },
           "targets": [
             {
-              "expr": "(sum(longhorn_disk_usage_bytes{disk=~\"asteria-.*\"}) / sum(longhorn_disk_capacity_bytes{disk=~\"asteria-.*\"})) * 100",
+              "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)",
               "refId": "A"
             }
           ],
@@ -802,22 +1060,22 @@ data:
           }
         },
         {
-          "id": 17,
+          "id": 21,
           "type": "stat",
-          "title": "Astreae schedulable",
+          "title": "Astreae free",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 7,
+            "h": 6,
             "w": 6,
             "x": 12,
-            "y": 37
+            "y": 45
           },
           "targets": [
             {
-              "expr": "(sum(longhorn_disk_capacity_bytes{disk=~\"astreae-.*\"}) - sum(longhorn_disk_usage_bytes{disk=~\"astreae-.*\"}) - sum(longhorn_disk_reservation_bytes{disk=~\"astreae-.*\"}))",
+              "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})",
               "refId": "A"
             }
           ],
@@ -859,22 +1117,22 @@ data:
           }
         },
         {
-          "id": 18,
+          "id": 22,
           "type": "stat",
-          "title": "Asteria schedulable",
+          "title": "Asteria free",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 7,
+            "h": 6,
             "w": 6,
             "x": 18,
-            "y": 37
+            "y": 45
           },
           "targets": [
             {
-              "expr": "(sum(longhorn_disk_capacity_bytes{disk=~\"asteria-.*\"}) - sum(longhorn_disk_usage_bytes{disk=~\"asteria-.*\"}) - sum(longhorn_disk_reservation_bytes{disk=~\"asteria-.*\"}))",
+              "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})",
               "refId": "A"
             }
           ],
@@ -916,112 +1174,91 @@ data:
           }
         },
         {
-          "id": 19,
-          "type": "piechart",
-          "title": "Longhorn node readiness",
+          "id": 23,
+          "type": "table",
+          "title": "Astreae per-node usage",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 7,
+            "h": 8,
             "w": 12,
             "x": 0,
-            "y": 44
+            "y": 51
           },
           "targets": [
             {
-              "expr": "sum(longhorn_node_status{condition=\"ready\"})",
-              "refId": "A",
-              "legendFormat": "Ready"
-            },
-            {
-              "expr": "(longhorn_node_count_total - sum(longhorn_node_status{condition=\"ready\"}))",
-              "refId": "B",
-              "legendFormat": "Offline"
+              "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)",
+              "refId": "A"
             }
           ],
           "fieldConfig": {
             "defaults": {
-              "unit": "none"
+              "unit": "percent"
             },
             "overrides": []
           },
           "options": {
-            "legend": {
-              "displayMode": "list",
-              "placement": "right"
-            },
-            "pieType": "donut",
-            "reduceOptions": {
-              "calcs": [
-                "lastNotNull"
-              ],
-              "fields": "",
-              "values": false
+            "showHeader": true
+          },
+          "transformations": [
+            {
+              "id": "labelsToFields",
+              "options": {}
             }
-          }
+          ]
         },
         {
-          "id": 20,
-          "type": "piechart",
-          "title": "Longhorn disk schedulability",
+          "id": 24,
+          "type": "table",
+          "title": "Asteria per-node usage",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 7,
+            "h": 8,
             "w": 12,
             "x": 12,
-            "y": 44
+            "y": 51
           },
           "targets": [
             {
-              "expr": "sum(sum by (node,disk) (longhorn_disk_status{condition=\"schedulable\"}))",
-              "refId": "A",
-              "legendFormat": "Schedulable"
-            },
-            {
-              "expr": "(count(sum by (node,disk) (longhorn_disk_status{condition=\"ready\"})) - sum(sum by (node,disk) (longhorn_disk_status{condition=\"schedulable\"})))",
-              "refId": "B",
-              "legendFormat": "Blocked"
+              "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)",
+              "refId": "A"
             }
           ],
           "fieldConfig": {
             "defaults": {
-              "unit": "none"
+              "unit": "percent"
             },
             "overrides": []
           },
           "options": {
-            "legend": {
-              "displayMode": "list",
-              "placement": "right"
-            },
-            "pieType": "donut",
-            "reduceOptions": {
-              "calcs": [
-                "lastNotNull"
-              ],
-              "fields": "",
-              "values": false
+            "showHeader": true
+          },
+          "transformations": [
+            {
+              "id": "labelsToFields",
+              "options": {}
             }
-          }
+          ]
         },
         {
-          "id": 21,
+          "id": 25,
           "type": "text",
           "title": "About this dashboard",
           "gridPos": {
             "h": 5,
             "w": 24,
             "x": 0,
-            "y": 51
+            "y": 59
           },
+          "datasource": null,
           "options": {
             "mode": "markdown",
-            "content": "### Atlas at a glance\n- Flux-managed Pi + Jetson cluster with 20+ active nodes\n- Longhorn tiers: Astreae (3x replicas) & Asteria (2x replicas) tracked separately\n- Login for the SRE view with alert routing, Longhorn drilldowns, and workload burn rates"
+            "content": "### Atlas at a glance\n- Summary metrics above pull from dedicated Pods/Nodes/Storage dashboards\n- Click the dashboard links in the header to drill into details\n- Anonymous users see this overview; login unlocks the detailed folders"
           }
         }
       ],
@@ -1030,7 +1267,7 @@ data:
       "style": "dark",
       "tags": [
         "atlas",
-        "public"
+        "overview"
       ],
       "templating": {
         "list": []
@@ -1038,8 +1275,5 @@ data:
       "time": {
         "from": "now-12h",
         "to": "now"
-      },
-      "title": "Atlas Public Overview",
-      "uid": "atlas-public",
-      "version": 5
+      }
     }
diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml
new file mode 100644
index 0000000..3b1f5da
--- /dev/null
+++ b/services/monitoring/grafana-dashboard-pods.yaml
@@ -0,0 +1,146 @@
+# services/monitoring/grafana-dashboard-pods.yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-pods
+  labels:
+    grafana_dashboard: "1"
+data:
+  atlas-pods.json: |
+    {
+      "uid": "atlas-pods",
+      "title": "Atlas Pods",
+      "folderUid": "atlas-pods",
+      "editable": true,
+      "panels": [
+        {
+          "id": 1,
+          "type": "table",
+          "title": "Pods not running",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 10,
+            "w": 24,
+            "x": 0,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "s"
+            },
+            "overrides": []
+          },
+          "options": {
+            "showHeader": true
+          },
+          "transformations": [
+            {
+              "id": "labelsToFields",
+              "options": {}
+            }
+          ]
+        },
+        {
+          "id": 2,
+          "type": "table",
+          "title": "CrashLoop / ImagePull",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 10,
+            "w": 24,
+            "x": 0,
+            "y": 10
+          },
+          "targets": [
+            {
+              "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) sum by (namespace,pod,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "s"
+            },
+            "overrides": []
+          },
+          "options": {
+            "showHeader": true
+          },
+          "transformations": [
+            {
+              "id": "labelsToFields",
+              "options": {}
+            }
+          ]
+        },
+        {
+          "id": 3,
+          "type": "table",
+          "title": "Terminating pods",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 10,
+            "w": 24,
+            "x": 0,
+            "y": 20
+          },
+          "targets": [
+            {
+              "expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "s"
+            },
+            "overrides": []
+          },
+          "options": {
+            "showHeader": true
+          },
+          "transformations": [
+            {
+              "id": "labelsToFields",
+              "options": {}
+            },
+            {
+              "id": "filterByValue",
+              "options": {
+                "match": "Value",
+                "operator": "gt",
+                "value": 600
+              }
+            }
+          ]
+        }
+      ],
+      "time": {
+        "from": "now-12h",
+        "to": "now"
+      },
+      "annotations": {
+        "list": []
+      },
+      "schemaVersion": 39,
+      "style": "dark",
+      "tags": [
+        "atlas",
+        "pods"
+      ]
+    }
diff --git a/services/monitoring/grafana-dashboard-storage.yaml b/services/monitoring/grafana-dashboard-storage.yaml
new file mode 100644
index 0000000..5b22804
--- /dev/null
+++ b/services/monitoring/grafana-dashboard-storage.yaml
@@ -0,0 +1,368 @@
+# services/monitoring/grafana-dashboard-storage.yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-storage
+  labels:
+    grafana_dashboard: "1"
+data:
+  atlas-storage.json: |
+    {
+      "uid": "atlas-storage",
+      "title": "Atlas Storage",
+      "folderUid": "atlas-storage",
+      "editable": true,
+      "panels": [
+        {
+          "id": 1,
+          "type": "stat",
+          "title": "Astreae usage",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 5,
+            "w": 6,
+            "x": 0,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 2,
+          "type": "stat",
+          "title": "Asteria usage",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 5,
+            "w": 6,
+            "x": 6,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 3,
+          "type": "stat",
+          "title": "Astreae free",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 5,
+            "w": 6,
+            "x": 12,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "bytesSI"
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 4,
+          "type": "stat",
+          "title": "Asteria free",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 5,
+            "w": 6,
+            "x": 18,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "bytesSI"
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 5,
+          "type": "timeseries",
+          "title": "Root filesystem",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 24,
+            "x": 0,
+            "y": 5
+          },
+          "targets": [
+            {
+              "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)",
+              "refId": "A",
+              "legendFormat": "{{node}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "right"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          },
+          "timeFrom": "30d"
+        },
+        {
+          "id": 6,
+          "type": "table",
+          "title": "Astreae nodes",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 10,
+            "w": 12,
+            "x": 0,
+            "y": 14
+          },
+          "targets": [
+            {
+              "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "showHeader": true
+          },
+          "transformations": [
+            {
+              "id": "labelsToFields",
+              "options": {}
+            }
+          ]
+        },
+        {
+          "id": 7,
+          "type": "table",
+          "title": "Asteria nodes",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 10,
+            "w": 12,
+            "x": 12,
+            "y": 14
+          },
+          "targets": [
+            {
+              "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "showHeader": true
+          },
+          "transformations": [
+            {
+              "id": "labelsToFields",
+              "options": {}
+            }
+          ]
+        }
+      ],
+      "time": {
+        "from": "now-12h",
+        "to": "now"
+      },
+      "annotations": {
+        "list": []
+      },
+      "schemaVersion": 39,
+      "style": "dark",
+      "tags": [
+        "atlas",
+        "storage"
+      ]
+    }
diff --git a/services/monitoring/grafana-folders.yaml b/services/monitoring/grafana-folders.yaml
index 503aaee..d390679 100644
--- a/services/monitoring/grafana-folders.yaml
+++ b/services/monitoring/grafana-folders.yaml
@@ -10,8 +10,8 @@ data:
   folders.yaml: |
     apiVersion: 1
     folders:
-      - uid: atlas-public
-        title: Atlas Public
+      - uid: atlas-overview
+        title: Atlas Overview
         permissions:
           - role: Viewer
             permission: View
@@ -19,8 +19,22 @@ data:
             permission: Edit
           - role: Admin
             permission: Admin
-      - uid: atlas-sre
-        title: Atlas SRE
+      - uid: atlas-pods
+        title: Atlas Pods
+        permissions:
+          - role: Editor
+            permission: View
+          - role: Admin
+            permission: Admin
+      - uid: atlas-nodes
+        title: Atlas Nodes
+        permissions:
+          - role: Editor
+            permission: View
+          - role: Admin
+            permission: Admin
+      - uid: atlas-storage
+        title: Atlas Storage
         permissions:
           - role: Editor
             permission: View
diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index 4efae70..e23f903 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -244,8 +244,8 @@ spec:
       GF_SECURITY_ALLOW_EMBEDDING: "true"
     grafana.ini:
       server:
-        domain: atlas.metrics.bstein.dev
-        root_url: https://atlas.metrics.bstein.dev/
+        domain: metrics.bstein.dev
+        root_url: https://metrics.bstein.dev/
       auth.anonymous:
         hide_version: true
       users:
@@ -256,12 +256,12 @@ spec:
       annotations:
         cert-manager.io/cluster-issuer: letsencrypt
       hosts:
-        - atlas.metrics.bstein.dev
+        - metrics.bstein.dev
       path: /
       tls:
-        - secretName: grafana-atlas-metrics-tls
+        - secretName: grafana-metrics-tls
           hosts:
-            - atlas.metrics.bstein.dev
+            - metrics.bstein.dev
     datasources:
       datasources.yaml:
         apiVersion: 1
@@ -278,25 +278,43 @@ spec:
       dashboardproviders.yaml:
         apiVersion: 1
         providers:
-          - name: public
+          - name: overview
             orgId: 1
-            folder: Atlas Public
+            folder: Atlas Overview
             type: file
             disableDeletion: false
             editable: false
             options:
-              path: /var/lib/grafana/dashboards/public
-          - name: sre
+              path: /var/lib/grafana/dashboards/overview
+          - name: pods
             orgId: 1
-            folder: Atlas SRE
+            folder: Atlas Pods
             type: file
             disableDeletion: false
             editable: true
             options:
-              path: /var/lib/grafana/dashboards/sre
+              path: /var/lib/grafana/dashboards/pods
+          - name: nodes
+            orgId: 1
+            folder: Atlas Nodes
+            type: file
+            disableDeletion: false
+            editable: true
+            options:
+              path: /var/lib/grafana/dashboards/nodes
+          - name: storage
+            orgId: 1
+            folder: Atlas Storage
+            type: file
+            disableDeletion: false
+            editable: true
+            options:
+              path: /var/lib/grafana/dashboards/storage
     dashboardsConfigMaps:
-      public: grafana-dashboard-public
-      sre: grafana-dashboard-sre
+      overview: grafana-dashboard-overview
+      pods: grafana-dashboard-pods
+      nodes: grafana-dashboard-nodes
+      storage: grafana-dashboard-storage
     extraConfigmapMounts:
       - name: grafana-folders
         mountPath: /etc/grafana/provisioning/folders
@@ -327,14 +345,14 @@ spec:
       annotations:
         cert-manager.io/cluster-issuer: letsencrypt
       hosts:
-        - host: atlas.alerts.bstein.dev
+        - host: alerts.bstein.dev
           paths:
             - path: /
               pathType: Prefix
       tls:
         - secretName: alerts-bstein-dev-tls
           hosts:
-            - atlas.alerts.bstein.dev
+            - alerts.bstein.dev
     config:
       global:
         resolve_timeout: 5m
diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml
index 73e7d23..282ee4f 100644
--- a/services/monitoring/kustomization.yaml
+++ b/services/monitoring/kustomization.yaml
@@ -5,7 +5,9 @@ namespace: monitoring
 resources:
   - namespace.yaml
   - rbac.yaml
-  - grafana-dashboard-public.yaml
-  - grafana-dashboard-sre.yaml
+  - grafana-dashboard-overview.yaml
+  - grafana-dashboard-pods.yaml
+  - grafana-dashboard-nodes.yaml
+  - grafana-dashboard-storage.yaml
   - grafana-folders.yaml
   - helmrelease.yaml

From 8f5781d3cf22aef5018d84af6b1e592c26c830a9 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 17 Nov 2025 16:27:38 -0300
Subject: [PATCH 14/71] monitoring: rebuild atlas dashboards

---
 scripts/render_dashboards.py                  | 1009 +++++++++++++----
 .../monitoring/dashboards/atlas-network.json  |  384 +++++++
 .../monitoring/dashboards/atlas-nodes.json    |  212 ++--
 .../monitoring/dashboards/atlas-overview.json |  872 ++++++++------
 .../monitoring/dashboards/atlas-pods.json     |  260 ++++-
 .../monitoring/dashboards/atlas-storage.json  |  138 ++-
 .../monitoring/grafana-dashboard-network.yaml |  393 +++++++
 .../monitoring/grafana-dashboard-nodes.yaml   |  212 ++--
 .../grafana-dashboard-overview.yaml           |  872 ++++++++------
 .../monitoring/grafana-dashboard-pods.yaml    |  260 ++++-
 .../monitoring/grafana-dashboard-storage.yaml |  138 ++-
 services/monitoring/grafana-folders.yaml      |   18 +-
 services/monitoring/helmrelease.yaml          |   15 +-
 services/monitoring/kustomization.yaml        |    1 +
 14 files changed, 3559 insertions(+), 1225 deletions(-)
 mode change 100755 => 100644 scripts/render_dashboards.py
 create mode 100644 services/monitoring/dashboards/atlas-network.json
 create mode 100644 services/monitoring/grafana-dashboard-network.yaml

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
old mode 100755
new mode 100644
index fa9ef58..67e486a
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -1,15 +1,20 @@
 #!/usr/bin/env python3
-"""Generate Grafana dashboards and render them into ConfigMaps.
+"""Generate Atlas Grafana dashboards and render them into ConfigMaps.
 
 Usage:
-    python scripts/render_dashboards.py --build   # rebuild JSON + ConfigMaps
-    python scripts/render_dashboards.py           # just render ConfigMaps
+  scripts/render_dashboards.py --build   # rebuild JSON + ConfigMaps
+  scripts/render_dashboards.py           # re-render ConfigMaps from JSON
 """
+
 import argparse
 import json
 import textwrap
 from pathlib import Path
 
+# ---------------------------------------------------------------------------
+# Paths, folders, and shared metadata
+# ---------------------------------------------------------------------------
+
 ROOT = Path(__file__).resolve().parents[1]
 DASHBOARD_DIR = ROOT / "services" / "monitoring" / "dashboards"
 CONFIG_TEMPLATE = textwrap.dedent(
@@ -27,15 +32,194 @@ data:
 )
 
 PROM_DS = {"type": "prometheus", "uid": "atlas-vm"}
+PUBLIC_FOLDER = "atlas-overview"
+PRIVATE_FOLDER = "atlas-internal"
+
+PERCENT_THRESHOLDS = {
+    "mode": "percentage",
+    "steps": [
+        {"color": "green", "value": None},
+        {"color": "yellow", "value": 70},
+        {"color": "red", "value": 85},
+    ],
+}
+
+# ---------------------------------------------------------------------------
+# Cluster metadata
+# ---------------------------------------------------------------------------
+
+CONTROL_PLANE_NODES = ["titan-0a", "titan-0b", "titan-0c"]
+CONTROL_DEPENDENCIES = ["titan-db"]
+CONTROL_ALL = CONTROL_PLANE_NODES + CONTROL_DEPENDENCIES
+WORKER_NODES = [
+    "titan-04",
+    "titan-05",
+    "titan-06",
+    "titan-07",
+    "titan-08",
+    "titan-09",
+    "titan-10",
+    "titan-11",
+    "titan-12",
+    "titan-13",
+    "titan-14",
+    "titan-15",
+    "titan-16",
+    "titan-17",
+    "titan-18",
+    "titan-19",
+    "titan-22",
+    "titan-24",
+]
+
+CONTROL_REGEX = "|".join(CONTROL_PLANE_NODES)
+CONTROL_ALL_REGEX = "|".join(CONTROL_ALL)
+WORKER_REGEX = "|".join(WORKER_NODES)
+CONTROL_TOTAL = len(CONTROL_PLANE_NODES)
+WORKER_TOTAL = len(WORKER_NODES)
+CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
+WORKER_SUFFIX = f"/{WORKER_TOTAL}"
+CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring"
+
+# ---------------------------------------------------------------------------
+# PromQL helpers
+# ---------------------------------------------------------------------------
+
+NODE_INFO = 'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")'
 
 
-# --------------------------------------------------------------------------- #
-# Panel helper factories
-# --------------------------------------------------------------------------- #
+def node_filter(regex):
+    """Return a selector that evaluates to 1 for nodes matching the regex."""
+    return (
+        f'label_replace(node_uname_info{{nodename=~"{regex}"}}, '
+        '"node", "$1", "nodename", "(.*)")'
+    )
 
 
-def stat_panel(panel_id, title, expr, grid, *, unit="none", thresholds=None,
-               text_mode="value", legend=None):
+def scoped_node_expr(base, scope=""):
+    """Attach nodename metadata and optionally filter to a scope regex."""
+    expr = f"avg by (node) (({base}) * on(instance) group_left(node) {NODE_INFO})"
+    if scope:
+        expr = f"({expr}) * on(node) group_left() {node_filter(scope)}"
+    return expr
+
+
+def node_cpu_expr(scope=""):
+    idle = 'avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))'
+    base = f"(1 - {idle}) * 100"
+    return scoped_node_expr(base, scope)
+
+
+def node_mem_expr(scope=""):
+    usage = (
+        "avg by (instance) ("
+        "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) "
+        "/ node_memory_MemTotal_bytes * 100)"
+    )
+    return scoped_node_expr(usage, scope)
+
+
+def filesystem_usage_expr(mount, scope=""):
+    base = (
+        f'avg by (instance) ('
+        f'(1 - (node_filesystem_avail_bytes{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}} '
+        f'/ node_filesystem_size_bytes{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}})) * 100)'
+    )
+    return scoped_node_expr(base, scope)
+
+
+def root_usage_expr(scope=""):
+    return filesystem_usage_expr("/", scope)
+
+
+def astreae_usage_expr(mount):
+    return (
+        f"100 - (sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) / "
+        f"sum(node_filesystem_size_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) * 100)"
+    )
+
+
+def astreae_free_expr(mount):
+    return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})"
+
+
+PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
+CRASHLOOP_EXPR = (
+    'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
+    '{reason=~"CrashLoopBackOff|ImagePullBackOff"}))'
+)
+STUCK_TERMINATING_EXPR = (
+    'sum(max by (namespace,pod) (('
+    '(time() - kube_pod_deletion_timestamp{pod!=""}) > 600'
+    ') and on(namespace,pod) kube_pod_deletion_timestamp{pod!=""} > 0))'
+)
+
+PROBLEM_TABLE_EXPR = (
+    "(time() - kube_pod_created{pod!=\"\"}) "
+    "* on(namespace,pod) group_left(node) kube_pod_info "
+    "* on(namespace,pod) group_left(phase) "
+    "max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})"
+)
+CRASHLOOP_TABLE_EXPR = (
+    "(time() - kube_pod_created{pod!=\"\"}) "
+    "* on(namespace,pod) group_left(node) kube_pod_info "
+    "* on(namespace,pod,container) group_left(reason) "
+    "max by (namespace,pod,container,reason) "
+    "(kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})"
+)
+STUCK_TABLE_EXPR = (
+    "((time() - kube_pod_deletion_timestamp{pod!=\"\"}) "
+    "* on(namespace,pod) group_left(node) kube_pod_info) "
+    "and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0"
+)
+
+NAMESPACE_CPU_EXPR = (
+    'topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=""'
+    ',pod!=""}[5m])) by (namespace))'
+)
+NAMESPACE_RAM_EXPR = (
+    'topk(10, sum(container_memory_working_set_bytes{namespace!=""'
+    ',pod!=""}) by (namespace))'
+)
+NET_SERIES_EXPR = (
+    'avg by (node) ('
+    'rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m]) '
+    '+ rate(container_network_transmit_bytes_total{namespace!="" ,pod!=""}[5m]))'
+)
+NET_TOP_EXPR = f"topk(1, {NET_SERIES_EXPR})"
+IO_SERIES_EXPR = (
+    "avg by (node) (rate(node_disk_read_bytes_total[5m]) "
+    "+ rate(node_disk_written_bytes_total[5m]))"
+)
+IO_TOP_EXPR = f"topk(1, {IO_SERIES_EXPR})"
+NET_INGRESS_EXPR = (
+    'sum(rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m])) '
+    "or on() vector(0)"
+)
+NET_EGRESS_EXPR = (
+    'sum(rate(container_network_transmit_bytes_total{namespace!="" ,pod!=""}[5m])) '
+    "or on() vector(0)"
+)
+
+# ---------------------------------------------------------------------------
+# Panel factories
+# ---------------------------------------------------------------------------
+
+
+def stat_panel(
+    panel_id,
+    title,
+    expr,
+    grid,
+    *,
+    unit="none",
+    thresholds=None,
+    text_mode="value",
+    legend=None,
+    value_suffix=None,
+    links=None,
+):
+    """Return a Grafana stat panel definition."""
     defaults = {
         "color": {"mode": "palette-classic"},
         "mappings": [],
@@ -48,7 +232,10 @@ def stat_panel(panel_id, title, expr, grid, *, unit="none", thresholds=None,
             ],
         },
         "unit": unit,
+        "custom": {"displayMode": "auto"},
     }
+    if value_suffix:
+        defaults["custom"]["valueSuffix"] = value_suffix
     panel = {
         "id": panel_id,
         "type": "stat",
@@ -67,12 +254,26 @@ def stat_panel(panel_id, title, expr, grid, *, unit="none", thresholds=None,
     }
     if legend:
         panel["targets"][0]["legendFormat"] = legend
+    if links:
+        panel["links"] = links
     return panel
 
 
-def timeseries_panel(panel_id, title, expr, grid, *, unit="none", legend=None,
-                     legend_display="table", legend_placement="bottom",
-                     legend_calcs=None, time_from=None):
+def timeseries_panel(
+    panel_id,
+    title,
+    expr,
+    grid,
+    *,
+    unit="none",
+    legend=None,
+    legend_display="table",
+    legend_placement="bottom",
+    legend_calcs=None,
+    time_from=None,
+    links=None,
+):
+    """Return a Grafana time-series panel definition."""
     panel = {
         "id": panel_id,
         "type": "timeseries",
@@ -95,11 +296,21 @@ def timeseries_panel(panel_id, title, expr, grid, *, unit="none", legend=None,
         panel["options"]["legend"]["calcs"] = legend_calcs
     if time_from:
         panel["timeFrom"] = time_from
+    if links:
+        panel["links"] = links
     return panel
 
 
-def table_panel(panel_id, title, expr, grid, *, unit="none", transformations=None,
-                description=None):
+def table_panel(
+    panel_id,
+    title,
+    expr,
+    grid,
+    *,
+    unit="none",
+    transformations=None,
+):
+    """Return a Grafana table panel definition."""
     panel = {
         "id": panel_id,
         "type": "table",
@@ -112,20 +323,25 @@ def table_panel(panel_id, title, expr, grid, *, unit="none", transformations=Non
     }
     if transformations:
         panel["transformations"] = transformations
-    if description:
-        panel["description"] = description
     return panel
 
 
 def pie_panel(panel_id, title, expr, grid):
+    """Return a pie chart panel with readable namespace labels."""
     return {
         "id": panel_id,
         "type": "piechart",
         "title": title,
         "datasource": PROM_DS,
         "gridPos": grid,
-        "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}],
-        "fieldConfig": {"defaults": {"unit": "percent"}, "overrides": []},
+        "targets": [{"expr": expr, "refId": "A"}],
+        "fieldConfig": {
+            "defaults": {
+                "unit": "percent",
+                "displayName": "{{namespace}}",
+            },
+            "overrides": [],
+        },
         "options": {
             "legend": {"displayMode": "list", "placement": "right"},
             "pieType": "pie",
@@ -145,192 +361,238 @@ def text_panel(panel_id, title, content, grid):
     }
 
 
-def node_cpu_expr(scope=""):
-    expr = "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))"
-    if scope:
-        expr = f"{expr} * on(node) group_left() kube_node_info{{node=~\"{scope}\"}}"
-    return expr
+def link_to(uid):
+    return [{"title": f"Open {uid} dashboard", "url": f"/d/{uid}", "targetBlank": True}]
 
 
-def node_mem_expr(scope=""):
-    expr = "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))"
-    if scope:
-        expr = f"{expr} * on(node) group_left() kube_node_info{{node=~\"{scope}\"}}"
-    return expr
-
-
-def root_usage_expr():
-    return "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)"
-
-
-def astreae_usage_expr(mount):
-    return (
-        f"100 - (sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) / "
-        f"sum(node_filesystem_size_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) * 100)"
-    )
-
-
-def astreae_free_expr(mount):
-    return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})"
+# ---------------------------------------------------------------------------
+# Dashboard builders
+# ---------------------------------------------------------------------------
 
 
 def build_overview():
-    thresholds_percent = {
-        "mode": "percentage",
-        "steps": [
-            {"color": "green", "value": None},
-            {"color": "yellow", "value": 70},
-            {"color": "red", "value": 85},
-        ],
-    }
     panels = []
-    stats = [
-        (1, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})'),
-        (2, "Ready nodes", 'sum(kube_node_status_condition{condition="Ready",status="true"})'),
-        (3, "Control plane ready", 'sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"})'),
-        (4, "Control plane schedulable", 'sum(kube_node_spec_unschedulable{node=~"titan-0a|titan-0b|titan-0c"} == 0)'),
-        (5, "Problem pods", 'sum(kube_pod_status_phase{phase!~"Running|Succeeded"})'),
-        (6, "Stuck terminating", 'sum(((time() - kube_pod_deletion_timestamp) > 600))'),
+
+    row1_stats = [
+        (1, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})', None, None, None),
+        (
+            2,
+            "Ready nodes",
+            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
+            WORKER_SUFFIX,
+            WORKER_TOTAL,
+            None,
+        ),
+        (
+            3,
+            "Control plane ready",
+            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
+            CONTROL_SUFFIX,
+            CONTROL_TOTAL,
+            None,
+        ),
+        (
+            4,
+            "Control plane workloads",
+            f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
+            None,
+            1,
+            link_to("atlas-pods"),
+        ),
+        (
+            5,
+            "Problem pods",
+            PROBLEM_PODS_EXPR,
+            None,
+            1,
+            link_to("atlas-pods"),
+        ),
+        (
+            6,
+            "Stuck terminating",
+            STUCK_TERMINATING_EXPR,
+            None,
+            1,
+            link_to("atlas-pods"),
+        ),
     ]
-    for idx, (panel_id, title, expr) in enumerate(stats):
+    for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats):
+        thresholds = None
+        if panel_id in (2, 3):
+            thresholds = {
+                "mode": "absolute",
+                "steps": [
+                    {"color": "red", "value": None},
+                    {"color": "green", "value": ok_value},
+                ],
+            }
+        elif panel_id >= 4:
+            thresholds = {
+                "mode": "absolute",
+                "steps": [
+                    {"color": "green", "value": None},
+                    {"color": "red", "value": 1},
+                ],
+            }
         panels.append(
             stat_panel(
                 panel_id,
                 title,
                 expr,
                 {"h": 5, "w": 4, "x": 4 * idx, "y": 0},
+                value_suffix=suffix,
+                thresholds=thresholds,
+                links=links,
             )
         )
-    panels.append(
-        stat_panel(
-            7,
-            "Hottest node: CPU",
-            node_cpu_expr(),
-            {"h": 5, "w": 4, "x": 24, "y": 0},
-            unit="percent",
-            thresholds=thresholds_percent,
-            text_mode="value_and_name",
-            legend="{{node}}",
-        )
-    )
-    panels.append(
-        stat_panel(
-            8,
-            "Hottest node: RAM",
-            node_mem_expr(),
-            {"h": 5, "w": 4, "x": 28, "y": 0},
-            unit="percent",
-            thresholds=thresholds_percent,
-            text_mode="value_and_name",
-            legend="{{node}}",
-        )
-    )
 
-    panels.append(pie_panel(9, "Namespace CPU share", 'topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace))', {"h": 9, "w": 12, "x": 0, "y": 5}))
-    panels.append(pie_panel(10, "Namespace RAM share", 'topk(10, sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace))', {"h": 9, "w": 12, "x": 12, "y": 5}))
+    hottest = [
+        (7, "Hottest node: CPU", f"topk(1, {node_cpu_expr()})", "percent"),
+        (8, "Hottest node: RAM", f"topk(1, {node_mem_expr()})", "percent"),
+        (9, "Hottest node: NET", NET_TOP_EXPR, "bytes/sec"),
+        (10, "Hottest node: I/O", IO_TOP_EXPR, "bytes/sec"),
+    ]
+    for idx, (panel_id, title, expr, unit) in enumerate(hottest):
+        panels.append(
+            stat_panel(
+                panel_id,
+                title,
+                expr,
+                {"h": 5, "w": 6, "x": 6 * idx, "y": 5},
+                unit=unit,
+                thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
+                text_mode="value_and_name",
+                legend="{{node}}",
+                links=link_to("atlas-nodes"),
+            )
+        )
+
+    panels.append(
+        pie_panel(
+            11,
+            "Namespace CPU share",
+            NAMESPACE_CPU_EXPR,
+            {"h": 9, "w": 12, "x": 0, "y": 10},
+        )
+    )
+    panels.append(
+        pie_panel(
+            12,
+            "Namespace RAM share",
+            NAMESPACE_RAM_EXPR,
+            {"h": 9, "w": 12, "x": 12, "y": 10},
+        )
+    )
 
     panels.append(
         timeseries_panel(
-            11,
+            13,
             "Cluster node CPU",
             node_cpu_expr(),
-            {"h": 8, "w": 12, "x": 0, "y": 14},
+            {"h": 8, "w": 12, "x": 0, "y": 19},
             unit="percent",
             legend="{{node}}",
             legend_calcs=["last"],
             legend_display="table",
             legend_placement="right",
+            links=link_to("atlas-nodes"),
         )
     )
     panels.append(
         timeseries_panel(
-            12,
+            14,
             "Cluster node RAM",
             node_mem_expr(),
-            {"h": 8, "w": 12, "x": 12, "y": 14},
+            {"h": 8, "w": 12, "x": 12, "y": 19},
             unit="percent",
             legend="{{node}}",
             legend_calcs=["last"],
             legend_display="table",
             legend_placement="right",
-        )
-    )
-
-    panels.append(
-        table_panel(
-            13,
-            "Problem pods (details)",
-            "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})",
-            {"h": 8, "w": 12, "x": 0, "y": 22},
-            unit="s",
-            transformations=[{"id": "labelsToFields", "options": {}}],
-        )
-    )
-    panels.append(
-        table_panel(
-            14,
-            "Terminating >10m",
-            "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info",
-            {"h": 8, "w": 12, "x": 12, "y": 22},
-            unit="s",
-            transformations=[
-                {"id": "labelsToFields", "options": {}} ,
-                {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}},
-            ],
+            links=link_to("atlas-nodes"),
         )
     )
 
     panels.append(
         timeseries_panel(
             15,
-            "Control plane CPU",
-            node_cpu_expr("titan-0a|titan-0b|titan-0c"),
-            {"h": 7, "w": 12, "x": 0, "y": 30},
+            "Control plane CPU (incl. titan-db)",
+            node_cpu_expr(CONTROL_ALL_REGEX),
+            {"h": 7, "w": 12, "x": 0, "y": 27},
             unit="percent",
             legend="{{node}}",
+            legend_display="table",
+            legend_placement="right",
         )
     )
     panels.append(
         timeseries_panel(
             16,
-            "Control plane RAM",
-            node_mem_expr("titan-0a|titan-0b|titan-0c"),
-            {"h": 7, "w": 12, "x": 12, "y": 30},
+            "Control plane RAM (incl. titan-db)",
+            node_mem_expr(CONTROL_ALL_REGEX),
+            {"h": 7, "w": 12, "x": 12, "y": 27},
             unit="percent",
             legend="{{node}}",
+            legend_display="table",
+            legend_placement="right",
         )
     )
 
     panels.append(
         timeseries_panel(
             17,
+            "Cluster ingress throughput",
+            NET_INGRESS_EXPR,
+            {"h": 7, "w": 12, "x": 0, "y": 34},
+            unit="bytes/sec",
+            legend_display="list",
+            legend_placement="bottom",
+            links=link_to("atlas-network"),
+        )
+    )
+    panels.append(
+        timeseries_panel(
+            18,
+            "Cluster egress throughput",
+            NET_EGRESS_EXPR,
+            {"h": 7, "w": 12, "x": 12, "y": 34},
+            unit="bytes/sec",
+            legend_display="list",
+            legend_placement="bottom",
+            links=link_to("atlas-network"),
+        )
+    )
+
+    panels.append(
+        timeseries_panel(
+            19,
             "Root filesystem usage",
             root_usage_expr(),
-            {"h": 8, "w": 12, "x": 0, "y": 37},
+            {"h": 8, "w": 12, "x": 0, "y": 41},
             unit="percent",
             legend="{{node}}",
             legend_calcs=["last"],
             legend_display="table",
             legend_placement="right",
-            time_from="7d",
+            time_from="30d",
+            links=link_to("atlas-storage"),
         )
     )
-
     panels.append(
         {
-            "id": 18,
+            "id": 20,
             "type": "bargauge",
             "title": "Nodes closest to full root disks",
             "datasource": PROM_DS,
-            "gridPos": {"h": 8, "w": 12, "x": 12, "y": 37},
-            "targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A", "legendFormat": "{{node}}"}],
+            "gridPos": {"h": 8, "w": 12, "x": 12, "y": 41},
+            "targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A"}],
             "fieldConfig": {
                 "defaults": {
                     "unit": "percent",
                     "min": 0,
                     "max": 100,
                     "thresholds": {
-                        "mode": "percentage",
+                        "mode": "absolute",
                         "steps": [
                             {"color": "green", "value": None},
                             {"color": "yellow", "value": 50},
@@ -338,6 +600,7 @@ def build_overview():
                             {"color": "red", "value": 85},
                         ],
                     },
+                    "displayName": "{{node}}",
                 },
                 "overrides": [],
             },
@@ -346,143 +609,157 @@ def build_overview():
                 "orientation": "horizontal",
                 "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
             },
+            "links": link_to("atlas-storage"),
         }
     )
 
-    panels.append(
-        stat_panel(
-            19,
-            "Astreae usage",
-            astreae_usage_expr("/mnt/astreae"),
-            {"h": 6, "w": 6, "x": 0, "y": 45},
-            unit="percent",
-            thresholds=thresholds_percent,
+    storage_panels = [
+        (21, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"),
+        (22, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"),
+        (23, "Astreae free", astreae_free_expr("/mnt/astreae"), "bytesSI"),
+        (24, "Asteria free", astreae_free_expr("/mnt/asteria"), "bytesSI"),
+    ]
+    for idx, (panel_id, title, expr, unit) in enumerate(storage_panels):
+        panels.append(
+            stat_panel(
+                panel_id,
+                title,
+                expr,
+                {"h": 6, "w": 6, "x": 6 * idx, "y": 49},
+                unit=unit,
+                thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
+                links=link_to("atlas-storage"),
+            )
         )
-    )
-    panels.append(
-        stat_panel(
-            20,
-            "Asteria usage",
-            astreae_usage_expr("/mnt/asteria"),
-            {"h": 6, "w": 6, "x": 6, "y": 45},
-            unit="percent",
-            thresholds=thresholds_percent,
-        )
-    )
-    panels.append(
-        stat_panel(
-            21,
-            "Astreae free",
-            astreae_free_expr("/mnt/astreae"),
-            {"h": 6, "w": 6, "x": 12, "y": 45},
-            unit="bytesSI",
-        )
-    )
-    panels.append(
-        stat_panel(
-            22,
-            "Asteria free",
-            astreae_free_expr("/mnt/asteria"),
-            {"h": 6, "w": 6, "x": 18, "y": 45},
-            unit="bytesSI",
-        )
-    )
-
-    panels.append(
-        table_panel(
-            23,
-            "Astreae per-node usage",
-            '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} * 100)',
-            {"h": 8, "w": 12, "x": 0, "y": 51},
-            unit="percent",
-            transformations=[{"id": "labelsToFields", "options": {}}],
-        )
-    )
-    panels.append(
-        table_panel(
-            24,
-            "Asteria per-node usage",
-            '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} * 100)',
-            {"h": 8, "w": 12, "x": 12, "y": 51},
-            unit="percent",
-            transformations=[{"id": "labelsToFields", "options": {}}],
-        )
-    )
 
     panels.append(
         text_panel(
             25,
             "About this dashboard",
-            "### Atlas at a glance\n- Summary metrics above pull from dedicated Pods/Nodes/Storage dashboards\n- Click the dashboard links in the header to drill into details\n- Anonymous users see this overview; login unlocks the detailed folders",
-            {"h": 5, "w": 24, "x": 0, "y": 59},
+            textwrap.dedent(
+                """\
+### Atlas Overview
+- Anonymous users land here; follow the panel links for pod/node/storage/network drill-downs.
+- Control plane workload count flags any non-system pods that slipped onto the HA nodes.
+- Problem and stuck pods use kube-state-metrics so counts and detail tables match exactly."""
+            ),
+            {"h": 5, "w": 24, "x": 0, "y": 55},
         )
     )
 
     return {
         "uid": "atlas-overview",
         "title": "Atlas Overview",
-        "annotations": {
-            "list": [
-                {
-                    "builtIn": 1,
-                    "datasource": {"type": "datasource", "uid": "grafana"},
-                    "enable": True,
-                    "hide": True,
-                    "iconColor": "rgba(0, 211, 255, 1)",
-                    "name": "Annotations & Alerts",
-                    "type": "dashboard",
-                }
-            ]
-        },
+        "folderUid": PUBLIC_FOLDER,
         "editable": False,
-        "folderUid": "atlas-overview",
-        "graphTooltip": 0,
-        "links": [
-            {"title": "Pods dashboard", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": False},
-            {"title": "Nodes dashboard", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False},
-            {"title": "Storage dashboard", "type": "dashboard", "dashboardUid": "atlas-storage", "keepTime": False},
-        ],
+        "annotations": {"list": []},
         "panels": panels,
-        "refresh": "30s",
         "schemaVersion": 39,
         "style": "dark",
         "tags": ["atlas", "overview"],
         "templating": {"list": []},
         "time": {"from": "now-12h", "to": "now"},
+        "links": [
+            {"title": "Atlas Pods", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": False},
+            {"title": "Atlas Nodes", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False},
+            {"title": "Atlas Storage", "type": "dashboard", "dashboardUid": "atlas-storage", "keepTime": False},
+            {"title": "Atlas Network", "type": "dashboard", "dashboardUid": "atlas-network", "keepTime": False},
+        ],
     }
 
 
 def build_pods_dashboard():
     panels = []
     panels.append(
-        table_panel(
+        stat_panel(
             1,
-            "Pods not running",
-            "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})",
-            {"h": 10, "w": 24, "x": 0, "y": 0},
-            unit="s",
-            transformations=[{"id": "labelsToFields", "options": {}}],
+            "Problem pods",
+            PROBLEM_PODS_EXPR,
+            {"h": 4, "w": 6, "x": 0, "y": 0},
+            thresholds={
+                "mode": "absolute",
+                "steps": [
+                    {"color": "green", "value": None},
+                    {"color": "red", "value": 1},
+                ],
+            },
         )
     )
     panels.append(
-        table_panel(
+        stat_panel(
             2,
             "CrashLoop / ImagePull",
-            "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) sum by (namespace,pod,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})",
-            {"h": 10, "w": 24, "x": 0, "y": 10},
+            CRASHLOOP_EXPR,
+            {"h": 4, "w": 6, "x": 6, "y": 0},
+            thresholds={
+                "mode": "absolute",
+                "steps": [
+                    {"color": "green", "value": None},
+                    {"color": "red", "value": 1},
+                ],
+            },
+        )
+    )
+    panels.append(
+        stat_panel(
+            3,
+            "Stuck terminating (>10m)",
+            STUCK_TERMINATING_EXPR,
+            {"h": 4, "w": 6, "x": 12, "y": 0},
+            thresholds={
+                "mode": "absolute",
+                "steps": [
+                    {"color": "green", "value": None},
+                    {"color": "red", "value": 1},
+                ],
+            },
+        )
+    )
+    panels.append(
+        stat_panel(
+            4,
+            "Control plane workloads",
+            f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
+            {"h": 4, "w": 6, "x": 18, "y": 0},
+            thresholds={
+                "mode": "absolute",
+                "steps": [
+                    {"color": "green", "value": None},
+                    {"color": "red", "value": 1},
+                ],
+            },
+        )
+    )
+
+    panels.append(
+        table_panel(
+            5,
+            "Pods not running",
+            PROBLEM_TABLE_EXPR,
+            {"h": 10, "w": 24, "x": 0, "y": 4},
             unit="s",
             transformations=[{"id": "labelsToFields", "options": {}}],
         )
     )
     panels.append(
         table_panel(
-            3,
-            "Terminating pods",
-            "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info",
-            {"h": 10, "w": 24, "x": 0, "y": 20},
+            6,
+            "CrashLoop / ImagePull",
+            CRASHLOOP_TABLE_EXPR,
+            {"h": 10, "w": 24, "x": 0, "y": 14},
+            unit="s",
+            transformations=[{"id": "labelsToFields", "options": {}}],
+        )
+    )
+    panels.append(
+        table_panel(
+            7,
+            "Terminating >10m",
+            STUCK_TABLE_EXPR,
+            {"h": 10, "w": 24, "x": 0, "y": 24},
             unit="s",
             transformations=[
-                {"id": "labelsToFields", "options": {}} ,
+                {"id": "labelsToFields", "options": {}},
                 {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}},
             ],
         )
@@ -490,7 +767,7 @@ def build_pods_dashboard():
     return {
         "uid": "atlas-pods",
         "title": "Atlas Pods",
-        "folderUid": "atlas-pods",
+        "folderUid": PRIVATE_FOLDER,
         "editable": True,
         "panels": panels,
         "time": {"from": "now-12h", "to": "now"},
@@ -503,17 +780,99 @@ def build_pods_dashboard():
 
 def build_nodes_dashboard():
     panels = []
-    panels.append(stat_panel(1, "Node count", 'count(kube_node_info)', {"h": 5, "w": 6, "x": 0, "y": 0}))
-    panels.append(stat_panel(2, "Ready nodes", 'sum(kube_node_status_condition{condition="Ready",status="true"})', {"h": 5, "w": 6, "x": 6, "y": 0}))
-    panels.append(stat_panel(3, "Control plane CPU avg", node_cpu_expr("titan-0a|titan-0b|titan-0c"), {"h": 5, "w": 6, "x": 12, "y": 0}, unit="percent", legend="{{node}}", text_mode="value_and_name"))
-    panels.append(stat_panel(4, "Control plane RAM avg", node_mem_expr("titan-0a|titan-0b|titan-0c"), {"h": 5, "w": 6, "x": 18, "y": 0}, unit="percent", legend="{{node}}", text_mode="value_and_name"))
-    panels.append(timeseries_panel(5, "Node CPU", node_cpu_expr(), {"h": 9, "w": 24, "x": 0, "y": 5}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right"))
-    panels.append(timeseries_panel(6, "Node RAM", node_mem_expr(), {"h": 9, "w": 24, "x": 0, "y": 14}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right"))
-    panels.append(timeseries_panel(7, "Root filesystem", root_usage_expr(), {"h": 9, "w": 24, "x": 0, "y": 23}, unit="percent", legend="{{node}}", legend_display="table", legend_placement="right", time_from="7d"))
+    panels.append(
+        stat_panel(
+            1,
+            "Worker nodes ready",
+            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
+            {"h": 4, "w": 8, "x": 0, "y": 0},
+            value_suffix=WORKER_SUFFIX,
+        )
+    )
+    panels.append(
+        stat_panel(
+            2,
+            "Control plane ready",
+            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
+            {"h": 4, "w": 8, "x": 8, "y": 0},
+            value_suffix=CONTROL_SUFFIX,
+        )
+    )
+    panels.append(
+        stat_panel(
+            3,
+            "Control plane workloads",
+            f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
+            {"h": 4, "w": 8, "x": 16, "y": 0},
+        )
+    )
+    panels.append(
+        timeseries_panel(
+            4,
+            "Node CPU",
+            node_cpu_expr(),
+            {"h": 9, "w": 24, "x": 0, "y": 4},
+            unit="percent",
+            legend="{{node}}",
+            legend_calcs=["last"],
+            legend_display="table",
+            legend_placement="right",
+        )
+    )
+    panels.append(
+        timeseries_panel(
+            5,
+            "Node RAM",
+            node_mem_expr(),
+            {"h": 9, "w": 24, "x": 0, "y": 13},
+            unit="percent",
+            legend="{{node}}",
+            legend_calcs=["last"],
+            legend_display="table",
+            legend_placement="right",
+        )
+    )
+    panels.append(
+        timeseries_panel(
+            6,
+            "Control plane (incl. titan-db) CPU",
+            node_cpu_expr(CONTROL_ALL_REGEX),
+            {"h": 9, "w": 12, "x": 0, "y": 22},
+            unit="percent",
+            legend="{{node}}",
+            legend_display="table",
+            legend_placement="right",
+        )
+    )
+    panels.append(
+        timeseries_panel(
+            7,
+            "Control plane (incl. titan-db) RAM",
+            node_mem_expr(CONTROL_ALL_REGEX),
+            {"h": 9, "w": 12, "x": 12, "y": 22},
+            unit="percent",
+            legend="{{node}}",
+            legend_display="table",
+            legend_placement="right",
+        )
+    )
+    panels.append(
+        timeseries_panel(
+            8,
+            "Root filesystem usage",
+            root_usage_expr(),
+            {"h": 9, "w": 24, "x": 0, "y": 31},
+            unit="percent",
+            legend="{{node}}",
+            legend_display="table",
+            legend_placement="right",
+            time_from="30d",
+        )
+    )
     return {
         "uid": "atlas-nodes",
         "title": "Atlas Nodes",
-        "folderUid": "atlas-nodes",
+        "folderUid": PRIVATE_FOLDER,
         "editable": True,
         "panels": panels,
         "time": {"from": "now-12h", "to": "now"},
@@ -526,17 +885,94 @@ def build_nodes_dashboard():
 
 def build_storage_dashboard():
     panels = []
-    panels.append(stat_panel(1, "Astreae usage", astreae_usage_expr("/mnt/astreae"), {"h": 5, "w": 6, "x": 0, "y": 0}, unit="percent"))
-    panels.append(stat_panel(2, "Asteria usage", astreae_usage_expr("/mnt/asteria"), {"h": 5, "w": 6, "x": 6, "y": 0}, unit="percent"))
-    panels.append(stat_panel(3, "Astreae free", astreae_free_expr("/mnt/astreae"), {"h": 5, "w": 6, "x": 12, "y": 0}, unit="bytesSI"))
-    panels.append(stat_panel(4, "Asteria free", astreae_free_expr("/mnt/asteria"), {"h": 5, "w": 6, "x": 18, "y": 0}, unit="bytesSI"))
-    panels.append(timeseries_panel(5, "Root filesystem", root_usage_expr(), {"h": 9, "w": 24, "x": 0, "y": 5}, unit="percent", legend="{{node}}", legend_display="table", legend_placement="right", time_from="30d"))
-    panels.append(table_panel(6, "Astreae nodes", '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} * 100)', {"h": 10, "w": 12, "x": 0, "y": 14}, unit="percent", transformations=[{"id": "labelsToFields", "options": {}}]))
-    panels.append(table_panel(7, "Asteria nodes", '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} * 100)', {"h": 10, "w": 12, "x": 12, "y": 14}, unit="percent", transformations=[{"id": "labelsToFields", "options": {}}]))
+    panels.append(
+        stat_panel(
+            1,
+            "Astreae usage",
+            astreae_usage_expr("/mnt/astreae"),
+            {"h": 5, "w": 6, "x": 0, "y": 0},
+            unit="percent",
+            thresholds=PERCENT_THRESHOLDS,
+        )
+    )
+    panels.append(
+        stat_panel(
+            2,
+            "Asteria usage",
+            astreae_usage_expr("/mnt/asteria"),
+            {"h": 5, "w": 6, "x": 6, "y": 0},
+            unit="percent",
+            thresholds=PERCENT_THRESHOLDS,
+        )
+    )
+    panels.append(
+        stat_panel(
+            3,
+            "Astreae free",
+            astreae_free_expr("/mnt/astreae"),
+            {"h": 5, "w": 6, "x": 12, "y": 0},
+            unit="bytesSI",
+        )
+    )
+    panels.append(
+        stat_panel(
+            4,
+            "Asteria free",
+            astreae_free_expr("/mnt/asteria"),
+            {"h": 5, "w": 6, "x": 18, "y": 0},
+            unit="bytesSI",
+        )
+    )
+    panels.append(
+        timeseries_panel(
+            5,
+            "Astreae per-node usage",
+            filesystem_usage_expr("/mnt/astreae"),
+            {"h": 9, "w": 12, "x": 0, "y": 5},
+            unit="percent",
+            legend="{{node}}",
+            legend_display="table",
+            legend_placement="right",
+            time_from="30d",
+        )
+    )
+    panels.append(
+        timeseries_panel(
+            6,
+            "Asteria per-node usage",
+            filesystem_usage_expr("/mnt/asteria"),
+            {"h": 9, "w": 12, "x": 12, "y": 5},
+            unit="percent",
+            legend="{{node}}",
+            legend_display="table",
+            legend_placement="right",
+            time_from="30d",
+        )
+    )
+    panels.append(
+        timeseries_panel(
+            7,
+            "Astreae usage history",
+            astreae_usage_expr("/mnt/astreae"),
+            {"h": 9, "w": 12, "x": 0, "y": 14},
+            unit="percent",
+            time_from="90d",
+        )
+    )
+    panels.append(
+        timeseries_panel(
+            8,
+            "Asteria usage history",
+            astreae_usage_expr("/mnt/asteria"),
+            {"h": 9, "w": 12, "x": 12, "y": 14},
+            unit="percent",
+            time_from="90d",
+        )
+    )
     return {
         "uid": "atlas-storage",
         "title": "Atlas Storage",
-        "folderUid": "atlas-storage",
+        "folderUid": PRIVATE_FOLDER,
         "editable": True,
         "panels": panels,
         "time": {"from": "now-12h", "to": "now"},
@@ -547,6 +983,95 @@ def build_storage_dashboard():
     }
 
 
+def build_network_dashboard():
+    panels = []
+    panels.append(
+        stat_panel(1, "Ingress bytes/s", NET_INGRESS_EXPR, {"h": 4, "w": 8, "x": 0, "y": 0}, unit="bytes/sec")
+    )
+    panels.append(
+        stat_panel(2, "Egress bytes/s", NET_EGRESS_EXPR, {"h": 4, "w": 8, "x": 8, "y": 0}, unit="bytes/sec")
+    )
+    panels.append(
+        stat_panel(
+            3,
+            "Top router req/s",
+            'max(topk(1, rate(traefik_router_requests_total[5m])))',
+            {"h": 4, "w": 8, "x": 16, "y": 0},
+            unit="req/s",
+        )
+    )
+    panels.append(
+        timeseries_panel(
+            4,
+            "Per-node throughput",
+            NET_SERIES_EXPR,
+            {"h": 8, "w": 24, "x": 0, "y": 4},
+            unit="bytes/sec",
+            legend="{{node}}",
+            legend_display="table",
+            legend_placement="right",
+        )
+    )
+    panels.append(
+        table_panel(
+            5,
+            "Top namespaces",
+            'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) '
+            '+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))',
+            {"h": 9, "w": 12, "x": 0, "y": 12},
+            unit="bytes/sec",
+            transformations=[{"id": "labelsToFields", "options": {}}],
+        )
+    )
+    panels.append(
+        table_panel(
+            6,
+            "Top pods",
+            'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) '
+            '+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))',
+            {"h": 9, "w": 12, "x": 12, "y": 12},
+            unit="bytes/sec",
+            transformations=[{"id": "labelsToFields", "options": {}}],
+        )
+    )
+    panels.append(
+        timeseries_panel(
+            7,
+            "Traefik routers (req/s)",
+            'topk(10, rate(traefik_router_requests_total[5m]))',
+            {"h": 9, "w": 12, "x": 0, "y": 21},
+            unit="req/s",
+            legend="{{router}}",
+            legend_display="table",
+            legend_placement="right",
+        )
+    )
+    panels.append(
+        timeseries_panel(
+            8,
+            "Traefik entrypoints (req/s)",
+            'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))',
+            {"h": 9, "w": 12, "x": 12, "y": 21},
+            unit="req/s",
+            legend="{{entrypoint}}",
+            legend_display="table",
+            legend_placement="right",
+        )
+    )
+    return {
+        "uid": "atlas-network",
+        "title": "Atlas Network",
+        "folderUid": PRIVATE_FOLDER,
+        "editable": True,
+        "panels": panels,
+        "time": {"from": "now-12h", "to": "now"},
+        "annotations": {"list": []},
+        "schemaVersion": 39,
+        "style": "dark",
+        "tags": ["atlas", "network"],
+    }
+
+
 DASHBOARDS = {
     "atlas-overview": {
         "builder": build_overview,
@@ -564,20 +1089,24 @@ DASHBOARDS = {
         "builder": build_storage_dashboard,
         "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-storage.yaml",
     },
+    "atlas-network": {
+        "builder": build_network_dashboard,
+        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-network.yaml",
+    },
 }
 
 
-def write_json(uid: str, data: dict) -> None:
+def write_json(uid, data):
     DASHBOARD_DIR.mkdir(parents=True, exist_ok=True)
     path = DASHBOARD_DIR / f"{uid}.json"
     path.write_text(json.dumps(data, indent=2) + "\n")
 
 
-def render_configmap(uid: str, data: dict) -> None:
+def render_configmap(uid, info):
     json_path = DASHBOARD_DIR / f"{uid}.json"
     payload = json.dumps(json.loads(json_path.read_text()), indent=2)
     indented = "\n".join("    " + line for line in payload.splitlines())
-    output_path = data["configmap"]
+    output_path = info["configmap"]
     content = CONFIG_TEMPLATE.format(
         relative_path=output_path.relative_to(ROOT),
         name=output_path.stem,
diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json
new file mode 100644
index 0000000..3846d2a
--- /dev/null
+++ b/services/monitoring/dashboards/atlas-network.json
@@ -0,0 +1,384 @@
+{
+  "uid": "atlas-network",
+  "title": "Atlas Network",
+  "folderUid": "atlas-internal",
+  "editable": true,
+  "panels": [
+    {
+      "id": 1,
+      "type": "stat",
+      "title": "Ingress bytes/s",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 8,
+        "x": 0,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "bytes/sec",
+          "custom": {
+            "displayMode": "auto"
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 2,
+      "type": "stat",
+      "title": "Egress bytes/s",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 8,
+        "x": 8,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "bytes/sec",
+          "custom": {
+            "displayMode": "auto"
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 3,
+      "type": "stat",
+      "title": "Top router req/s",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 8,
+        "x": 16,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "max(topk(1, rate(traefik_router_requests_total[5m])))",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "req/s",
+          "custom": {
+            "displayMode": "auto"
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 4,
+      "type": "timeseries",
+      "title": "Per-node throughput",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 24,
+        "x": 0,
+        "y": 4
+      },
+      "targets": [
+        {
+          "expr": "avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))",
+          "refId": "A",
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "bytes/sec"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "right"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      }
+    },
+    {
+      "id": 5,
+      "type": "table",
+      "title": "Top namespaces",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 0,
+        "y": 12
+      },
+      "targets": [
+        {
+          "expr": "topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]) + rate(container_network_receive_bytes_total{namespace!=\"\"}[5m])) by (namespace))",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "bytes/sec"
+        },
+        "overrides": []
+      },
+      "options": {
+        "showHeader": true
+      },
+      "transformations": [
+        {
+          "id": "labelsToFields",
+          "options": {}
+        }
+      ]
+    },
+    {
+      "id": 6,
+      "type": "table",
+      "title": "Top pods",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 12,
+        "y": 12
+      },
+      "targets": [
+        {
+          "expr": "topk(10, sum(rate(container_network_transmit_bytes_total{pod!=\"\"}[5m]) + rate(container_network_receive_bytes_total{pod!=\"\"}[5m])) by (namespace,pod))",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "bytes/sec"
+        },
+        "overrides": []
+      },
+      "options": {
+        "showHeader": true
+      },
+      "transformations": [
+        {
+          "id": "labelsToFields",
+          "options": {}
+        }
+      ]
+    },
+    {
+      "id": 7,
+      "type": "timeseries",
+      "title": "Traefik routers (req/s)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 0,
+        "y": 21
+      },
+      "targets": [
+        {
+          "expr": "topk(10, rate(traefik_router_requests_total[5m]))",
+          "refId": "A",
+          "legendFormat": "{{router}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "req/s"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "right"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      }
+    },
+    {
+      "id": 8,
+      "type": "timeseries",
+      "title": "Traefik entrypoints (req/s)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 12,
+        "y": 21
+      },
+      "targets": [
+        {
+          "expr": "sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))",
+          "refId": "A",
+          "legendFormat": "{{entrypoint}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "req/s"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "right"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      }
+    }
+  ],
+  "time": {
+    "from": "now-12h",
+    "to": "now"
+  },
+  "annotations": {
+    "list": []
+  },
+  "schemaVersion": 39,
+  "style": "dark",
+  "tags": [
+    "atlas",
+    "network"
+  ]
+}
diff --git a/services/monitoring/dashboards/atlas-nodes.json b/services/monitoring/dashboards/atlas-nodes.json
index d3393a9..e974d8a 100644
--- a/services/monitoring/dashboards/atlas-nodes.json
+++ b/services/monitoring/dashboards/atlas-nodes.json
@@ -1,26 +1,26 @@
 {
   "uid": "atlas-nodes",
   "title": "Atlas Nodes",
-  "folderUid": "atlas-nodes",
+  "folderUid": "atlas-internal",
   "editable": true,
   "panels": [
     {
       "id": 1,
       "type": "stat",
-      "title": "Node count",
+      "title": "Worker nodes ready",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 5,
-        "w": 6,
+        "h": 4,
+        "w": 8,
         "x": 0,
         "y": 0
       },
       "targets": [
         {
-          "expr": "count(kube_node_info)",
+          "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})",
           "refId": "A"
         }
       ],
@@ -43,7 +43,11 @@
               }
             ]
           },
-          "unit": "none"
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto",
+            "valueSuffix": "/18"
+          }
         },
         "overrides": []
       },
@@ -64,20 +68,20 @@
     {
       "id": 2,
       "type": "stat",
-      "title": "Ready nodes",
+      "title": "Control plane ready",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 5,
-        "w": 6,
-        "x": 6,
+        "h": 4,
+        "w": 8,
+        "x": 8,
         "y": 0
       },
       "targets": [
         {
-          "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
+          "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})",
           "refId": "A"
         }
       ],
@@ -100,7 +104,11 @@
               }
             ]
           },
-          "unit": "none"
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto",
+            "valueSuffix": "/3"
+          }
         },
         "overrides": []
       },
@@ -121,22 +129,21 @@
     {
       "id": 3,
       "type": "stat",
-      "title": "Control plane CPU avg",
+      "title": "Control plane workloads",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 5,
-        "w": 6,
-        "x": 12,
+        "h": 4,
+        "w": 8,
+        "x": 16,
         "y": 0
       },
       "targets": [
         {
-          "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}",
-          "refId": "A",
-          "legendFormat": "{{node}}"
+          "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})",
+          "refId": "A"
         }
       ],
       "fieldConfig": {
@@ -158,7 +165,10 @@
               }
             ]
           },
-          "unit": "percent"
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto"
+          }
         },
         "overrides": []
       },
@@ -173,69 +183,11 @@
           "fields": "",
           "values": false
         },
-        "textMode": "value_and_name"
+        "textMode": "value"
       }
     },
     {
       "id": 4,
-      "type": "stat",
-      "title": "Control plane RAM avg",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 5,
-        "w": 6,
-        "x": 18,
-        "y": 0
-      },
-      "targets": [
-        {
-          "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}",
-          "refId": "A",
-          "legendFormat": "{{node}}"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "rgba(115, 115, 115, 1)",
-                "value": null
-              },
-              {
-                "color": "green",
-                "value": 1
-              }
-            ]
-          },
-          "unit": "percent"
-        },
-        "overrides": []
-      },
-      "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "justifyMode": "center",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "textMode": "value_and_name"
-      }
-    },
-    {
-      "id": 5,
       "type": "timeseries",
       "title": "Node CPU",
       "datasource": {
@@ -246,11 +198,51 @@
         "h": 9,
         "w": 24,
         "x": 0,
-        "y": 5
+        "y": 4
       },
       "targets": [
         {
-          "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))",
+          "expr": "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
+          "refId": "A",
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "right",
+          "calcs": [
+            "last"
+          ]
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      }
+    },
+    {
+      "id": 5,
+      "type": "timeseries",
+      "title": "Node RAM",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 24,
+        "x": 0,
+        "y": 13
+      },
+      "targets": [
+        {
+          "expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
           "refId": "A",
           "legendFormat": "{{node}}"
         }
@@ -277,20 +269,20 @@
     {
       "id": 6,
       "type": "timeseries",
-      "title": "Node RAM",
+      "title": "Control plane (incl. titan-db) CPU",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
       },
       "gridPos": {
         "h": 9,
-        "w": 24,
+        "w": 12,
         "x": 0,
-        "y": 14
+        "y": 22
       },
       "targets": [
         {
-          "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))",
+          "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
           "refId": "A",
           "legendFormat": "{{node}}"
         }
@@ -304,10 +296,7 @@
       "options": {
         "legend": {
           "displayMode": "table",
-          "placement": "right",
-          "calcs": [
-            "last"
-          ]
+          "placement": "right"
         },
         "tooltip": {
           "mode": "multi"
@@ -317,7 +306,44 @@
     {
       "id": 7,
       "type": "timeseries",
-      "title": "Root filesystem",
+      "title": "Control plane (incl. titan-db) RAM",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 12,
+        "y": 22
+      },
+      "targets": [
+        {
+          "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
+          "refId": "A",
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "right"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      }
+    },
+    {
+      "id": 8,
+      "type": "timeseries",
+      "title": "Root filesystem usage",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -326,11 +352,11 @@
         "h": 9,
         "w": 24,
         "x": 0,
-        "y": 23
+        "y": 31
       },
       "targets": [
         {
-          "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)",
+          "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
           "refId": "A",
           "legendFormat": "{{node}}"
         }
@@ -350,7 +376,7 @@
           "mode": "multi"
         }
       },
-      "timeFrom": "7d"
+      "timeFrom": "30d"
     }
   ],
   "time": {
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index d7a0d27..3377a13 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -1,45 +1,11 @@
 {
   "uid": "atlas-overview",
   "title": "Atlas Overview",
-  "annotations": {
-    "list": [
-      {
-        "builtIn": 1,
-        "datasource": {
-          "type": "datasource",
-          "uid": "grafana"
-        },
-        "enable": true,
-        "hide": true,
-        "iconColor": "rgba(0, 211, 255, 1)",
-        "name": "Annotations & Alerts",
-        "type": "dashboard"
-      }
-    ]
-  },
-  "editable": false,
   "folderUid": "atlas-overview",
-  "graphTooltip": 0,
-  "links": [
-    {
-      "title": "Pods dashboard",
-      "type": "dashboard",
-      "dashboardUid": "atlas-pods",
-      "keepTime": false
-    },
-    {
-      "title": "Nodes dashboard",
-      "type": "dashboard",
-      "dashboardUid": "atlas-nodes",
-      "keepTime": false
-    },
-    {
-      "title": "Storage dashboard",
-      "type": "dashboard",
-      "dashboardUid": "atlas-storage",
-      "keepTime": false
-    }
-  ],
+  "editable": false,
+  "annotations": {
+    "list": []
+  },
   "panels": [
     {
       "id": 1,
@@ -80,7 +46,10 @@
               }
             ]
           },
-          "unit": "none"
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto"
+          }
         },
         "overrides": []
       },
@@ -114,7 +83,7 @@
       },
       "targets": [
         {
-          "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
+          "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})",
           "refId": "A"
         }
       ],
@@ -128,16 +97,20 @@
             "mode": "absolute",
             "steps": [
               {
-                "color": "rgba(115, 115, 115, 1)",
+                "color": "red",
                 "value": null
               },
               {
                 "color": "green",
-                "value": 1
+                "value": 18
               }
             ]
           },
-          "unit": "none"
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto",
+            "valueSuffix": "/18"
+          }
         },
         "overrides": []
       },
@@ -185,16 +158,20 @@
             "mode": "absolute",
             "steps": [
               {
-                "color": "rgba(115, 115, 115, 1)",
+                "color": "red",
                 "value": null
               },
               {
                 "color": "green",
-                "value": 1
+                "value": 3
               }
             ]
           },
-          "unit": "none"
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto",
+            "valueSuffix": "/3"
+          }
         },
         "overrides": []
       },
@@ -215,7 +192,7 @@
     {
       "id": 4,
       "type": "stat",
-      "title": "Control plane schedulable",
+      "title": "Control plane workloads",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -228,7 +205,7 @@
       },
       "targets": [
         {
-          "expr": "sum(kube_node_spec_unschedulable{node=~\"titan-0a|titan-0b|titan-0c\"} == 0)",
+          "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})",
           "refId": "A"
         }
       ],
@@ -242,16 +219,19 @@
             "mode": "absolute",
             "steps": [
               {
-                "color": "rgba(115, 115, 115, 1)",
+                "color": "green",
                 "value": null
               },
               {
-                "color": "green",
+                "color": "red",
                 "value": 1
               }
             ]
           },
-          "unit": "none"
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto"
+          }
         },
         "overrides": []
       },
@@ -267,7 +247,14 @@
           "values": false
         },
         "textMode": "value"
-      }
+      },
+      "links": [
+        {
+          "title": "Open atlas-pods dashboard",
+          "url": "/d/atlas-pods",
+          "targetBlank": true
+        }
+      ]
     },
     {
       "id": 5,
@@ -285,7 +272,7 @@
       },
       "targets": [
         {
-          "expr": "sum(kube_pod_status_phase{phase!~\"Running|Succeeded\"})",
+          "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))",
           "refId": "A"
         }
       ],
@@ -299,16 +286,19 @@
             "mode": "absolute",
             "steps": [
               {
-                "color": "rgba(115, 115, 115, 1)",
+                "color": "green",
                 "value": null
               },
               {
-                "color": "green",
+                "color": "red",
                 "value": 1
               }
             ]
           },
-          "unit": "none"
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto"
+          }
         },
         "overrides": []
       },
@@ -324,7 +314,14 @@
           "values": false
         },
         "textMode": "value"
-      }
+      },
+      "links": [
+        {
+          "title": "Open atlas-pods dashboard",
+          "url": "/d/atlas-pods",
+          "targetBlank": true
+        }
+      ]
     },
     {
       "id": 6,
@@ -342,10 +339,222 @@
       },
       "targets": [
         {
-          "expr": "sum(((time() - kube_pod_deletion_timestamp) > 600))",
+          "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > 600) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0))",
           "refId": "A"
         }
       ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto"
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      },
+      "links": [
+        {
+          "title": "Open atlas-pods dashboard",
+          "url": "/d/atlas-pods",
+          "targetBlank": true
+        }
+      ]
+    },
+    {
+      "id": 7,
+      "type": "stat",
+      "title": "Hottest node: CPU",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 6,
+        "x": 0,
+        "y": 5
+      },
+      "targets": [
+        {
+          "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+          "refId": "A",
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "percentage",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 70
+              },
+              {
+                "color": "red",
+                "value": 85
+              }
+            ]
+          },
+          "unit": "percent",
+          "custom": {
+            "displayMode": "auto"
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value_and_name"
+      },
+      "links": [
+        {
+          "title": "Open atlas-nodes dashboard",
+          "url": "/d/atlas-nodes",
+          "targetBlank": true
+        }
+      ]
+    },
+    {
+      "id": 8,
+      "type": "stat",
+      "title": "Hottest node: RAM",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 6,
+        "x": 6,
+        "y": 5
+      },
+      "targets": [
+        {
+          "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+          "refId": "A",
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "percentage",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 70
+              },
+              {
+                "color": "red",
+                "value": 85
+              }
+            ]
+          },
+          "unit": "percent",
+          "custom": {
+            "displayMode": "auto"
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value_and_name"
+      },
+      "links": [
+        {
+          "title": "Open atlas-nodes dashboard",
+          "url": "/d/atlas-nodes",
+          "targetBlank": true
+        }
+      ]
+    },
+    {
+      "id": 9,
+      "type": "stat",
+      "title": "Hottest node: NET",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 6,
+        "x": 12,
+        "y": 5
+      },
+      "targets": [
+        {
+          "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
+          "refId": "A",
+          "legendFormat": "{{node}}"
+        }
+      ],
       "fieldConfig": {
         "defaults": {
           "color": {
@@ -365,69 +574,10 @@
               }
             ]
           },
-          "unit": "none"
-        },
-        "overrides": []
-      },
-      "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "justifyMode": "center",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "textMode": "value"
-      }
-    },
-    {
-      "id": 7,
-      "type": "stat",
-      "title": "Hottest node: CPU",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 5,
-        "w": 4,
-        "x": 24,
-        "y": 0
-      },
-      "targets": [
-        {
-          "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))",
-          "refId": "A",
-          "legendFormat": "{{node}}"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "percentage",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "yellow",
-                "value": 70
-              },
-              {
-                "color": "red",
-                "value": 85
-              }
-            ]
-          },
-          "unit": "percent"
+          "unit": "bytes/sec",
+          "custom": {
+            "displayMode": "auto"
+          }
         },
         "overrides": []
       },
@@ -443,25 +593,32 @@
           "values": false
         },
         "textMode": "value_and_name"
-      }
+      },
+      "links": [
+        {
+          "title": "Open atlas-nodes dashboard",
+          "url": "/d/atlas-nodes",
+          "targetBlank": true
+        }
+      ]
     },
     {
-      "id": 8,
+      "id": 10,
       "type": "stat",
-      "title": "Hottest node: RAM",
+      "title": "Hottest node: I/O",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
       },
       "gridPos": {
         "h": 5,
-        "w": 4,
-        "x": 28,
-        "y": 0
+        "w": 6,
+        "x": 18,
+        "y": 5
       },
       "targets": [
         {
-          "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))",
+          "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
           "refId": "A",
           "legendFormat": "{{node}}"
         }
@@ -473,23 +630,22 @@
           },
           "mappings": [],
           "thresholds": {
-            "mode": "percentage",
+            "mode": "absolute",
             "steps": [
               {
-                "color": "green",
+                "color": "rgba(115, 115, 115, 1)",
                 "value": null
               },
               {
-                "color": "yellow",
-                "value": 70
-              },
-              {
-                "color": "red",
-                "value": 85
+                "color": "green",
+                "value": 1
               }
             ]
           },
-          "unit": "percent"
+          "unit": "bytes/sec",
+          "custom": {
+            "displayMode": "auto"
+          }
         },
         "overrides": []
       },
@@ -505,10 +661,17 @@
           "values": false
         },
         "textMode": "value_and_name"
-      }
+      },
+      "links": [
+        {
+          "title": "Open atlas-nodes dashboard",
+          "url": "/d/atlas-nodes",
+          "targetBlank": true
+        }
+      ]
     },
     {
-      "id": 9,
+      "id": 11,
       "type": "piechart",
       "title": "Namespace CPU share",
       "datasource": {
@@ -519,18 +682,18 @@
         "h": 9,
         "w": 12,
         "x": 0,
-        "y": 5
+        "y": 10
       },
       "targets": [
         {
-          "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace))",
-          "refId": "A",
-          "legendFormat": "{{namespace}}"
+          "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\"}[5m])) by (namespace))",
+          "refId": "A"
         }
       ],
       "fieldConfig": {
         "defaults": {
-          "unit": "percent"
+          "unit": "percent",
+          "displayName": "{{namespace}}"
         },
         "overrides": []
       },
@@ -550,7 +713,7 @@
       }
     },
     {
-      "id": 10,
+      "id": 12,
       "type": "piechart",
       "title": "Namespace RAM share",
       "datasource": {
@@ -561,18 +724,18 @@
         "h": 9,
         "w": 12,
         "x": 12,
-        "y": 5
+        "y": 10
       },
       "targets": [
         {
-          "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace))",
-          "refId": "A",
-          "legendFormat": "{{namespace}}"
+          "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\"}) by (namespace))",
+          "refId": "A"
         }
       ],
       "fieldConfig": {
         "defaults": {
-          "unit": "percent"
+          "unit": "percent",
+          "displayName": "{{namespace}}"
         },
         "overrides": []
       },
@@ -592,7 +755,7 @@
       }
     },
     {
-      "id": 11,
+      "id": 13,
       "type": "timeseries",
       "title": "Cluster node CPU",
       "datasource": {
@@ -603,11 +766,11 @@
         "h": 8,
         "w": 12,
         "x": 0,
-        "y": 14
+        "y": 19
       },
       "targets": [
         {
-          "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))",
+          "expr": "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
           "refId": "A",
           "legendFormat": "{{node}}"
         }
@@ -629,10 +792,17 @@
         "tooltip": {
           "mode": "multi"
         }
-      }
+      },
+      "links": [
+        {
+          "title": "Open atlas-nodes dashboard",
+          "url": "/d/atlas-nodes",
+          "targetBlank": true
+        }
+      ]
     },
     {
-      "id": 12,
+      "id": 14,
       "type": "timeseries",
       "title": "Cluster node RAM",
       "datasource": {
@@ -643,11 +813,11 @@
         "h": 8,
         "w": 12,
         "x": 12,
-        "y": 14
+        "y": 19
       },
       "targets": [
         {
-          "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))",
+          "expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
           "refId": "A",
           "legendFormat": "{{node}}"
         }
@@ -669,92 +839,19 @@
         "tooltip": {
           "mode": "multi"
         }
-      }
-    },
-    {
-      "id": 13,
-      "type": "table",
-      "title": "Problem pods (details)",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
       },
-      "gridPos": {
-        "h": 8,
-        "w": 12,
-        "x": 0,
-        "y": 22
-      },
-      "targets": [
+      "links": [
         {
-          "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})",
-          "refId": "A"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "unit": "s"
-        },
-        "overrides": []
-      },
-      "options": {
-        "showHeader": true
-      },
-      "transformations": [
-        {
-          "id": "labelsToFields",
-          "options": {}
-        }
-      ]
-    },
-    {
-      "id": 14,
-      "type": "table",
-      "title": "Terminating >10m",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 8,
-        "w": 12,
-        "x": 12,
-        "y": 22
-      },
-      "targets": [
-        {
-          "expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info",
-          "refId": "A"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "unit": "s"
-        },
-        "overrides": []
-      },
-      "options": {
-        "showHeader": true
-      },
-      "transformations": [
-        {
-          "id": "labelsToFields",
-          "options": {}
-        },
-        {
-          "id": "filterByValue",
-          "options": {
-            "match": "Value",
-            "operator": "gt",
-            "value": 600
-          }
+          "title": "Open atlas-nodes dashboard",
+          "url": "/d/atlas-nodes",
+          "targetBlank": true
         }
       ]
     },
     {
       "id": 15,
       "type": "timeseries",
-      "title": "Control plane CPU",
+      "title": "Control plane CPU (incl. titan-db)",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -763,11 +860,11 @@
         "h": 7,
         "w": 12,
         "x": 0,
-        "y": 30
+        "y": 27
       },
       "targets": [
         {
-          "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}",
+          "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
           "refId": "A",
           "legendFormat": "{{node}}"
         }
@@ -781,7 +878,7 @@
       "options": {
         "legend": {
           "displayMode": "table",
-          "placement": "bottom"
+          "placement": "right"
         },
         "tooltip": {
           "mode": "multi"
@@ -791,7 +888,7 @@
     {
       "id": 16,
       "type": "timeseries",
-      "title": "Control plane RAM",
+      "title": "Control plane RAM (incl. titan-db)",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -800,11 +897,11 @@
         "h": 7,
         "w": 12,
         "x": 12,
-        "y": 30
+        "y": 27
       },
       "targets": [
         {
-          "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}",
+          "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
           "refId": "A",
           "legendFormat": "{{node}}"
         }
@@ -818,7 +915,7 @@
       "options": {
         "legend": {
           "displayMode": "table",
-          "placement": "bottom"
+          "placement": "right"
         },
         "tooltip": {
           "mode": "multi"
@@ -828,6 +925,92 @@
     {
       "id": 17,
       "type": "timeseries",
+      "title": "Cluster ingress throughput",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 7,
+        "w": 12,
+        "x": 0,
+        "y": 34
+      },
+      "targets": [
+        {
+          "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "bytes/sec"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "list",
+          "placement": "bottom"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      },
+      "links": [
+        {
+          "title": "Open atlas-network dashboard",
+          "url": "/d/atlas-network",
+          "targetBlank": true
+        }
+      ]
+    },
+    {
+      "id": 18,
+      "type": "timeseries",
+      "title": "Cluster egress throughput",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 7,
+        "w": 12,
+        "x": 12,
+        "y": 34
+      },
+      "targets": [
+        {
+          "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "bytes/sec"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "list",
+          "placement": "bottom"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      },
+      "links": [
+        {
+          "title": "Open atlas-network dashboard",
+          "url": "/d/atlas-network",
+          "targetBlank": true
+        }
+      ]
+    },
+    {
+      "id": 19,
+      "type": "timeseries",
       "title": "Root filesystem usage",
       "datasource": {
         "type": "prometheus",
@@ -837,11 +1020,11 @@
         "h": 8,
         "w": 12,
         "x": 0,
-        "y": 37
+        "y": 41
       },
       "targets": [
         {
-          "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)",
+          "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
           "refId": "A",
           "legendFormat": "{{node}}"
         }
@@ -864,10 +1047,17 @@
           "mode": "multi"
         }
       },
-      "timeFrom": "7d"
+      "timeFrom": "30d",
+      "links": [
+        {
+          "title": "Open atlas-storage dashboard",
+          "url": "/d/atlas-storage",
+          "targetBlank": true
+        }
+      ]
     },
     {
-      "id": 18,
+      "id": 20,
       "type": "bargauge",
       "title": "Nodes closest to full root disks",
       "datasource": {
@@ -878,13 +1068,12 @@
         "h": 8,
         "w": 12,
         "x": 12,
-        "y": 37
+        "y": 41
       },
       "targets": [
         {
-          "expr": "topk(8, avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info))",
-          "refId": "A",
-          "legendFormat": "{{node}}"
+          "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+          "refId": "A"
         }
       ],
       "fieldConfig": {
@@ -893,7 +1082,7 @@
           "min": 0,
           "max": 100,
           "thresholds": {
-            "mode": "percentage",
+            "mode": "absolute",
             "steps": [
               {
                 "color": "green",
@@ -912,7 +1101,8 @@
                 "value": 85
               }
             ]
-          }
+          },
+          "displayName": "{{node}}"
         },
         "overrides": []
       },
@@ -926,10 +1116,17 @@
           "fields": "",
           "values": false
         }
-      }
+      },
+      "links": [
+        {
+          "title": "Open atlas-storage dashboard",
+          "url": "/d/atlas-storage",
+          "targetBlank": true
+        }
+      ]
     },
     {
-      "id": 19,
+      "id": 21,
       "type": "stat",
       "title": "Astreae usage",
       "datasource": {
@@ -940,7 +1137,7 @@
         "h": 6,
         "w": 6,
         "x": 0,
-        "y": 45
+        "y": 49
       },
       "targets": [
         {
@@ -971,7 +1168,10 @@
               }
             ]
           },
-          "unit": "percent"
+          "unit": "percent",
+          "custom": {
+            "displayMode": "auto"
+          }
         },
         "overrides": []
       },
@@ -987,10 +1187,17 @@
           "values": false
         },
         "textMode": "value"
-      }
+      },
+      "links": [
+        {
+          "title": "Open atlas-storage dashboard",
+          "url": "/d/atlas-storage",
+          "targetBlank": true
+        }
+      ]
     },
     {
-      "id": 20,
+      "id": 22,
       "type": "stat",
       "title": "Asteria usage",
       "datasource": {
@@ -1001,7 +1208,7 @@
         "h": 6,
         "w": 6,
         "x": 6,
-        "y": 45
+        "y": 49
       },
       "targets": [
         {
@@ -1032,7 +1239,10 @@
               }
             ]
           },
-          "unit": "percent"
+          "unit": "percent",
+          "custom": {
+            "displayMode": "auto"
+          }
         },
         "overrides": []
       },
@@ -1048,10 +1258,17 @@
           "values": false
         },
         "textMode": "value"
-      }
+      },
+      "links": [
+        {
+          "title": "Open atlas-storage dashboard",
+          "url": "/d/atlas-storage",
+          "targetBlank": true
+        }
+      ]
     },
     {
-      "id": 21,
+      "id": 23,
       "type": "stat",
       "title": "Astreae free",
       "datasource": {
@@ -1062,7 +1279,7 @@
         "h": 6,
         "w": 6,
         "x": 12,
-        "y": 45
+        "y": 49
       },
       "targets": [
         {
@@ -1089,7 +1306,10 @@
               }
             ]
           },
-          "unit": "bytesSI"
+          "unit": "bytesSI",
+          "custom": {
+            "displayMode": "auto"
+          }
         },
         "overrides": []
       },
@@ -1105,10 +1325,17 @@
           "values": false
         },
         "textMode": "value"
-      }
+      },
+      "links": [
+        {
+          "title": "Open atlas-storage dashboard",
+          "url": "/d/atlas-storage",
+          "targetBlank": true
+        }
+      ]
     },
     {
-      "id": 22,
+      "id": 24,
       "type": "stat",
       "title": "Asteria free",
       "datasource": {
@@ -1119,7 +1346,7 @@
         "h": 6,
         "w": 6,
         "x": 18,
-        "y": 45
+        "y": 49
       },
       "targets": [
         {
@@ -1146,7 +1373,10 @@
               }
             ]
           },
-          "unit": "bytesSI"
+          "unit": "bytesSI",
+          "custom": {
+            "displayMode": "auto"
+          }
         },
         "overrides": []
       },
@@ -1162,77 +1392,12 @@
           "values": false
         },
         "textMode": "value"
-      }
-    },
-    {
-      "id": 23,
-      "type": "table",
-      "title": "Astreae per-node usage",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
       },
-      "gridPos": {
-        "h": 8,
-        "w": 12,
-        "x": 0,
-        "y": 51
-      },
-      "targets": [
+      "links": [
         {
-          "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)",
-          "refId": "A"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "unit": "percent"
-        },
-        "overrides": []
-      },
-      "options": {
-        "showHeader": true
-      },
-      "transformations": [
-        {
-          "id": "labelsToFields",
-          "options": {}
-        }
-      ]
-    },
-    {
-      "id": 24,
-      "type": "table",
-      "title": "Asteria per-node usage",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 8,
-        "w": 12,
-        "x": 12,
-        "y": 51
-      },
-      "targets": [
-        {
-          "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)",
-          "refId": "A"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "unit": "percent"
-        },
-        "overrides": []
-      },
-      "options": {
-        "showHeader": true
-      },
-      "transformations": [
-        {
-          "id": "labelsToFields",
-          "options": {}
+          "title": "Open atlas-storage dashboard",
+          "url": "/d/atlas-storage",
+          "targetBlank": true
         }
       ]
     },
@@ -1244,16 +1409,15 @@
         "h": 5,
         "w": 24,
         "x": 0,
-        "y": 59
+        "y": 55
       },
       "datasource": null,
       "options": {
         "mode": "markdown",
-        "content": "### Atlas at a glance\n- Summary metrics above pull from dedicated Pods/Nodes/Storage dashboards\n- Click the dashboard links in the header to drill into details\n- Anonymous users see this overview; login unlocks the detailed folders"
+        "content": "### Atlas Overview\n- Anonymous users land here; follow the panel links for pod/node/storage/network drill-downs.\n- Control plane workload count flags any non-system pods that slipped onto the HA nodes.\n- Problem and stuck pods use kube-state-metrics so counts and detail tables match exactly."
       }
     }
   ],
-  "refresh": "30s",
   "schemaVersion": 39,
   "style": "dark",
   "tags": [
@@ -1266,5 +1430,31 @@
   "time": {
     "from": "now-12h",
     "to": "now"
-  }
+  },
+  "links": [
+    {
+      "title": "Atlas Pods",
+      "type": "dashboard",
+      "dashboardUid": "atlas-pods",
+      "keepTime": false
+    },
+    {
+      "title": "Atlas Nodes",
+      "type": "dashboard",
+      "dashboardUid": "atlas-nodes",
+      "keepTime": false
+    },
+    {
+      "title": "Atlas Storage",
+      "type": "dashboard",
+      "dashboardUid": "atlas-storage",
+      "keepTime": false
+    },
+    {
+      "title": "Atlas Network",
+      "type": "dashboard",
+      "dashboardUid": "atlas-network",
+      "keepTime": false
+    }
+  ]
 }
diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json
index 91f80eb..3e7dd0e 100644
--- a/services/monitoring/dashboards/atlas-pods.json
+++ b/services/monitoring/dashboards/atlas-pods.json
@@ -1,11 +1,251 @@
 {
   "uid": "atlas-pods",
   "title": "Atlas Pods",
-  "folderUid": "atlas-pods",
+  "folderUid": "atlas-internal",
   "editable": true,
   "panels": [
     {
       "id": 1,
+      "type": "stat",
+      "title": "Problem pods",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 6,
+        "x": 0,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto"
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 2,
+      "type": "stat",
+      "title": "CrashLoop / ImagePull",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 6,
+        "x": 6,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto"
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 3,
+      "type": "stat",
+      "title": "Stuck terminating (>10m)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 6,
+        "x": 12,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "count(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > 600) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0)) or on() vector(0)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto"
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 4,
+      "type": "stat",
+      "title": "Control plane workloads",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 6,
+        "x": 18,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto"
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 5,
       "type": "table",
       "title": "Pods not running",
       "datasource": {
@@ -16,11 +256,11 @@
         "h": 10,
         "w": 24,
         "x": 0,
-        "y": 0
+        "y": 4
       },
       "targets": [
         {
-          "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})",
+          "expr": "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})",
           "refId": "A"
         }
       ],
@@ -41,7 +281,7 @@
       ]
     },
     {
-      "id": 2,
+      "id": 6,
       "type": "table",
       "title": "CrashLoop / ImagePull",
       "datasource": {
@@ -52,11 +292,11 @@
         "h": 10,
         "w": 24,
         "x": 0,
-        "y": 10
+        "y": 14
       },
       "targets": [
         {
-          "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) sum by (namespace,pod,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})",
+          "expr": "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) max by (namespace,pod,container,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})",
           "refId": "A"
         }
       ],
@@ -77,9 +317,9 @@
       ]
     },
     {
-      "id": 3,
+      "id": 7,
       "type": "table",
-      "title": "Terminating pods",
+      "title": "Terminating >10m",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -88,11 +328,11 @@
         "h": 10,
         "w": 24,
         "x": 0,
-        "y": 20
+        "y": 24
       },
       "targets": [
         {
-          "expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info",
+          "expr": "(((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > 600) * on(namespace,pod) group_left(node) kube_pod_info) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0",
           "refId": "A"
         }
       ],
diff --git a/services/monitoring/dashboards/atlas-storage.json b/services/monitoring/dashboards/atlas-storage.json
index aa1948d..bb7d152 100644
--- a/services/monitoring/dashboards/atlas-storage.json
+++ b/services/monitoring/dashboards/atlas-storage.json
@@ -1,7 +1,7 @@
 {
   "uid": "atlas-storage",
   "title": "Atlas Storage",
-  "folderUid": "atlas-storage",
+  "folderUid": "atlas-internal",
   "editable": true,
   "panels": [
     {
@@ -31,19 +31,26 @@
           },
           "mappings": [],
           "thresholds": {
-            "mode": "absolute",
+            "mode": "percentage",
             "steps": [
               {
-                "color": "rgba(115, 115, 115, 1)",
+                "color": "green",
                 "value": null
               },
               {
-                "color": "green",
-                "value": 1
+                "color": "yellow",
+                "value": 70
+              },
+              {
+                "color": "red",
+                "value": 85
               }
             ]
           },
-          "unit": "percent"
+          "unit": "percent",
+          "custom": {
+            "displayMode": "auto"
+          }
         },
         "overrides": []
       },
@@ -88,19 +95,26 @@
           },
           "mappings": [],
           "thresholds": {
-            "mode": "absolute",
+            "mode": "percentage",
             "steps": [
               {
-                "color": "rgba(115, 115, 115, 1)",
+                "color": "green",
                 "value": null
               },
               {
-                "color": "green",
-                "value": 1
+                "color": "yellow",
+                "value": 70
+              },
+              {
+                "color": "red",
+                "value": 85
               }
             ]
           },
-          "unit": "percent"
+          "unit": "percent",
+          "custom": {
+            "displayMode": "auto"
+          }
         },
         "overrides": []
       },
@@ -157,7 +171,10 @@
               }
             ]
           },
-          "unit": "bytesSI"
+          "unit": "bytesSI",
+          "custom": {
+            "displayMode": "auto"
+          }
         },
         "overrides": []
       },
@@ -214,7 +231,10 @@
               }
             ]
           },
-          "unit": "bytesSI"
+          "unit": "bytesSI",
+          "custom": {
+            "displayMode": "auto"
+          }
         },
         "overrides": []
       },
@@ -235,20 +255,20 @@
     {
       "id": 5,
       "type": "timeseries",
-      "title": "Root filesystem",
+      "title": "Astreae per-node usage",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
       },
       "gridPos": {
         "h": 9,
-        "w": 24,
+        "w": 12,
         "x": 0,
         "y": 5
       },
       "targets": [
         {
-          "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)",
+          "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
           "refId": "A",
           "legendFormat": "{{node}}"
         }
@@ -272,21 +292,59 @@
     },
     {
       "id": 6,
-      "type": "table",
-      "title": "Astreae nodes",
+      "type": "timeseries",
+      "title": "Asteria per-node usage",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 10,
+        "h": 9,
+        "w": 12,
+        "x": 12,
+        "y": 5
+      },
+      "targets": [
+        {
+          "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
+          "refId": "A",
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "right"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      },
+      "timeFrom": "30d"
+    },
+    {
+      "id": 7,
+      "type": "timeseries",
+      "title": "Astreae usage history",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 9,
         "w": 12,
         "x": 0,
         "y": 14
       },
       "targets": [
         {
-          "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)",
+          "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)",
           "refId": "A"
         }
       ],
@@ -297,32 +355,33 @@
         "overrides": []
       },
       "options": {
-        "showHeader": true
-      },
-      "transformations": [
-        {
-          "id": "labelsToFields",
-          "options": {}
+        "legend": {
+          "displayMode": "table",
+          "placement": "bottom"
+        },
+        "tooltip": {
+          "mode": "multi"
         }
-      ]
+      },
+      "timeFrom": "90d"
     },
     {
-      "id": 7,
-      "type": "table",
-      "title": "Asteria nodes",
+      "id": 8,
+      "type": "timeseries",
+      "title": "Asteria usage history",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 10,
+        "h": 9,
         "w": 12,
         "x": 12,
         "y": 14
       },
       "targets": [
         {
-          "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)",
+          "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)",
           "refId": "A"
         }
       ],
@@ -333,14 +392,15 @@
         "overrides": []
       },
       "options": {
-        "showHeader": true
-      },
-      "transformations": [
-        {
-          "id": "labelsToFields",
-          "options": {}
+        "legend": {
+          "displayMode": "table",
+          "placement": "bottom"
+        },
+        "tooltip": {
+          "mode": "multi"
         }
-      ]
+      },
+      "timeFrom": "90d"
     }
   ],
   "time": {
diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml
new file mode 100644
index 0000000..e1ba054
--- /dev/null
+++ b/services/monitoring/grafana-dashboard-network.yaml
@@ -0,0 +1,393 @@
+# services/monitoring/grafana-dashboard-network.yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-network
+  labels:
+    grafana_dashboard: "1"
+data:
+  atlas-network.json: |
+    {
+      "uid": "atlas-network",
+      "title": "Atlas Network",
+      "folderUid": "atlas-internal",
+      "editable": true,
+      "panels": [
+        {
+          "id": 1,
+          "type": "stat",
+          "title": "Ingress bytes/s",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 4,
+            "w": 8,
+            "x": 0,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "bytes/sec",
+              "custom": {
+                "displayMode": "auto"
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 2,
+          "type": "stat",
+          "title": "Egress bytes/s",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 4,
+            "w": 8,
+            "x": 8,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "bytes/sec",
+              "custom": {
+                "displayMode": "auto"
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 3,
+          "type": "stat",
+          "title": "Top router req/s",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 4,
+            "w": 8,
+            "x": 16,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "max(topk(1, rate(traefik_router_requests_total[5m])))",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "req/s",
+              "custom": {
+                "displayMode": "auto"
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 4,
+          "type": "timeseries",
+          "title": "Per-node throughput",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 24,
+            "x": 0,
+            "y": 4
+          },
+          "targets": [
+            {
+              "expr": "avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))",
+              "refId": "A",
+              "legendFormat": "{{node}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "bytes/sec"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "right"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 5,
+          "type": "table",
+          "title": "Top namespaces",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 0,
+            "y": 12
+          },
+          "targets": [
+            {
+              "expr": "topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]) + rate(container_network_receive_bytes_total{namespace!=\"\"}[5m])) by (namespace))",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "bytes/sec"
+            },
+            "overrides": []
+          },
+          "options": {
+            "showHeader": true
+          },
+          "transformations": [
+            {
+              "id": "labelsToFields",
+              "options": {}
+            }
+          ]
+        },
+        {
+          "id": 6,
+          "type": "table",
+          "title": "Top pods",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 12,
+            "y": 12
+          },
+          "targets": [
+            {
+              "expr": "topk(10, sum(rate(container_network_transmit_bytes_total{pod!=\"\"}[5m]) + rate(container_network_receive_bytes_total{pod!=\"\"}[5m])) by (namespace,pod))",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "bytes/sec"
+            },
+            "overrides": []
+          },
+          "options": {
+            "showHeader": true
+          },
+          "transformations": [
+            {
+              "id": "labelsToFields",
+              "options": {}
+            }
+          ]
+        },
+        {
+          "id": 7,
+          "type": "timeseries",
+          "title": "Traefik routers (req/s)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 0,
+            "y": 21
+          },
+          "targets": [
+            {
+              "expr": "topk(10, rate(traefik_router_requests_total[5m]))",
+              "refId": "A",
+              "legendFormat": "{{router}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "req/s"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "right"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 8,
+          "type": "timeseries",
+          "title": "Traefik entrypoints (req/s)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 12,
+            "y": 21
+          },
+          "targets": [
+            {
+              "expr": "sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))",
+              "refId": "A",
+              "legendFormat": "{{entrypoint}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "req/s"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "right"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        }
+      ],
+      "time": {
+        "from": "now-12h",
+        "to": "now"
+      },
+      "annotations": {
+        "list": []
+      },
+      "schemaVersion": 39,
+      "style": "dark",
+      "tags": [
+        "atlas",
+        "network"
+      ]
+    }
diff --git a/services/monitoring/grafana-dashboard-nodes.yaml b/services/monitoring/grafana-dashboard-nodes.yaml
index 516f207..afbeb3c 100644
--- a/services/monitoring/grafana-dashboard-nodes.yaml
+++ b/services/monitoring/grafana-dashboard-nodes.yaml
@@ -10,26 +10,26 @@ data:
     {
       "uid": "atlas-nodes",
       "title": "Atlas Nodes",
-      "folderUid": "atlas-nodes",
+      "folderUid": "atlas-internal",
       "editable": true,
       "panels": [
         {
           "id": 1,
           "type": "stat",
-          "title": "Node count",
+          "title": "Worker nodes ready",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 5,
-            "w": 6,
+            "h": 4,
+            "w": 8,
             "x": 0,
             "y": 0
           },
           "targets": [
             {
-              "expr": "count(kube_node_info)",
+              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})",
               "refId": "A"
             }
           ],
@@ -52,7 +52,11 @@ data:
                   }
                 ]
               },
-              "unit": "none"
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto",
+                "valueSuffix": "/18"
+              }
             },
             "overrides": []
           },
@@ -73,20 +77,20 @@ data:
         {
           "id": 2,
           "type": "stat",
-          "title": "Ready nodes",
+          "title": "Control plane ready",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 5,
-            "w": 6,
-            "x": 6,
+            "h": 4,
+            "w": 8,
+            "x": 8,
             "y": 0
           },
           "targets": [
             {
-              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
+              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})",
               "refId": "A"
             }
           ],
@@ -109,7 +113,11 @@ data:
                   }
                 ]
               },
-              "unit": "none"
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto",
+                "valueSuffix": "/3"
+              }
             },
             "overrides": []
           },
@@ -130,22 +138,21 @@ data:
         {
           "id": 3,
           "type": "stat",
-          "title": "Control plane CPU avg",
+          "title": "Control plane workloads",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 5,
-            "w": 6,
-            "x": 12,
+            "h": 4,
+            "w": 8,
+            "x": 16,
             "y": 0
           },
           "targets": [
             {
-              "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}",
-              "refId": "A",
-              "legendFormat": "{{node}}"
+              "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})",
+              "refId": "A"
             }
           ],
           "fieldConfig": {
@@ -167,7 +174,10 @@ data:
                   }
                 ]
               },
-              "unit": "percent"
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto"
+              }
             },
             "overrides": []
           },
@@ -182,69 +192,11 @@ data:
               "fields": "",
               "values": false
             },
-            "textMode": "value_and_name"
+            "textMode": "value"
           }
         },
         {
           "id": 4,
-          "type": "stat",
-          "title": "Control plane RAM avg",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 5,
-            "w": 6,
-            "x": 18,
-            "y": 0
-          },
-          "targets": [
-            {
-              "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}",
-              "refId": "A",
-              "legendFormat": "{{node}}"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "color": {
-                "mode": "palette-classic"
-              },
-              "mappings": [],
-              "thresholds": {
-                "mode": "absolute",
-                "steps": [
-                  {
-                    "color": "rgba(115, 115, 115, 1)",
-                    "value": null
-                  },
-                  {
-                    "color": "green",
-                    "value": 1
-                  }
-                ]
-              },
-              "unit": "percent"
-            },
-            "overrides": []
-          },
-          "options": {
-            "colorMode": "value",
-            "graphMode": "area",
-            "justifyMode": "center",
-            "reduceOptions": {
-              "calcs": [
-                "lastNotNull"
-              ],
-              "fields": "",
-              "values": false
-            },
-            "textMode": "value_and_name"
-          }
-        },
-        {
-          "id": 5,
           "type": "timeseries",
           "title": "Node CPU",
           "datasource": {
@@ -255,11 +207,51 @@ data:
             "h": 9,
             "w": 24,
             "x": 0,
-            "y": 5
+            "y": 4
           },
           "targets": [
             {
-              "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))",
+              "expr": "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
+              "refId": "A",
+              "legendFormat": "{{node}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "right",
+              "calcs": [
+                "last"
+              ]
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 5,
+          "type": "timeseries",
+          "title": "Node RAM",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 24,
+            "x": 0,
+            "y": 13
+          },
+          "targets": [
+            {
+              "expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
               "refId": "A",
               "legendFormat": "{{node}}"
             }
@@ -286,20 +278,20 @@ data:
         {
           "id": 6,
           "type": "timeseries",
-          "title": "Node RAM",
+          "title": "Control plane (incl. titan-db) CPU",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
             "h": 9,
-            "w": 24,
+            "w": 12,
             "x": 0,
-            "y": 14
+            "y": 22
           },
           "targets": [
             {
-              "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))",
+              "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
               "refId": "A",
               "legendFormat": "{{node}}"
             }
@@ -313,10 +305,7 @@ data:
           "options": {
             "legend": {
               "displayMode": "table",
-              "placement": "right",
-              "calcs": [
-                "last"
-              ]
+              "placement": "right"
             },
             "tooltip": {
               "mode": "multi"
@@ -326,7 +315,44 @@ data:
         {
           "id": 7,
           "type": "timeseries",
-          "title": "Root filesystem",
+          "title": "Control plane (incl. titan-db) RAM",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 12,
+            "y": 22
+          },
+          "targets": [
+            {
+              "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
+              "refId": "A",
+              "legendFormat": "{{node}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "right"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 8,
+          "type": "timeseries",
+          "title": "Root filesystem usage",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -335,11 +361,11 @@ data:
             "h": 9,
             "w": 24,
             "x": 0,
-            "y": 23
+            "y": 31
           },
           "targets": [
             {
-              "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)",
+              "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
               "refId": "A",
               "legendFormat": "{{node}}"
             }
@@ -359,7 +385,7 @@ data:
               "mode": "multi"
             }
           },
-          "timeFrom": "7d"
+          "timeFrom": "30d"
         }
       ],
       "time": {
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index a20e05a..199dfb2 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -10,45 +10,11 @@ data:
     {
       "uid": "atlas-overview",
       "title": "Atlas Overview",
-      "annotations": {
-        "list": [
-          {
-            "builtIn": 1,
-            "datasource": {
-              "type": "datasource",
-              "uid": "grafana"
-            },
-            "enable": true,
-            "hide": true,
-            "iconColor": "rgba(0, 211, 255, 1)",
-            "name": "Annotations & Alerts",
-            "type": "dashboard"
-          }
-        ]
-      },
-      "editable": false,
       "folderUid": "atlas-overview",
-      "graphTooltip": 0,
-      "links": [
-        {
-          "title": "Pods dashboard",
-          "type": "dashboard",
-          "dashboardUid": "atlas-pods",
-          "keepTime": false
-        },
-        {
-          "title": "Nodes dashboard",
-          "type": "dashboard",
-          "dashboardUid": "atlas-nodes",
-          "keepTime": false
-        },
-        {
-          "title": "Storage dashboard",
-          "type": "dashboard",
-          "dashboardUid": "atlas-storage",
-          "keepTime": false
-        }
-      ],
+      "editable": false,
+      "annotations": {
+        "list": []
+      },
       "panels": [
         {
           "id": 1,
@@ -89,7 +55,10 @@ data:
                   }
                 ]
               },
-              "unit": "none"
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto"
+              }
             },
             "overrides": []
           },
@@ -123,7 +92,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
+              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})",
               "refId": "A"
             }
           ],
@@ -137,16 +106,20 @@ data:
                 "mode": "absolute",
                 "steps": [
                   {
-                    "color": "rgba(115, 115, 115, 1)",
+                    "color": "red",
                     "value": null
                   },
                   {
                     "color": "green",
-                    "value": 1
+                    "value": 18
                   }
                 ]
               },
-              "unit": "none"
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto",
+                "valueSuffix": "/18"
+              }
             },
             "overrides": []
           },
@@ -194,16 +167,20 @@ data:
                 "mode": "absolute",
                 "steps": [
                   {
-                    "color": "rgba(115, 115, 115, 1)",
+                    "color": "red",
                     "value": null
                   },
                   {
                     "color": "green",
-                    "value": 1
+                    "value": 3
                   }
                 ]
               },
-              "unit": "none"
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto",
+                "valueSuffix": "/3"
+              }
             },
             "overrides": []
           },
@@ -224,7 +201,7 @@ data:
         {
           "id": 4,
           "type": "stat",
-          "title": "Control plane schedulable",
+          "title": "Control plane workloads",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -237,7 +214,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(kube_node_spec_unschedulable{node=~\"titan-0a|titan-0b|titan-0c\"} == 0)",
+              "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})",
               "refId": "A"
             }
           ],
@@ -251,16 +228,19 @@ data:
                 "mode": "absolute",
                 "steps": [
                   {
-                    "color": "rgba(115, 115, 115, 1)",
+                    "color": "green",
                     "value": null
                   },
                   {
-                    "color": "green",
+                    "color": "red",
                     "value": 1
                   }
                 ]
               },
-              "unit": "none"
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto"
+              }
             },
             "overrides": []
           },
@@ -276,7 +256,14 @@ data:
               "values": false
             },
             "textMode": "value"
-          }
+          },
+          "links": [
+            {
+              "title": "Open atlas-pods dashboard",
+              "url": "/d/atlas-pods",
+              "targetBlank": true
+            }
+          ]
         },
         {
           "id": 5,
@@ -294,7 +281,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(kube_pod_status_phase{phase!~\"Running|Succeeded\"})",
+              "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))",
               "refId": "A"
             }
           ],
@@ -308,16 +295,19 @@ data:
                 "mode": "absolute",
                 "steps": [
                   {
-                    "color": "rgba(115, 115, 115, 1)",
+                    "color": "green",
                     "value": null
                   },
                   {
-                    "color": "green",
+                    "color": "red",
                     "value": 1
                   }
                 ]
               },
-              "unit": "none"
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto"
+              }
             },
             "overrides": []
           },
@@ -333,7 +323,14 @@ data:
               "values": false
             },
             "textMode": "value"
-          }
+          },
+          "links": [
+            {
+              "title": "Open atlas-pods dashboard",
+              "url": "/d/atlas-pods",
+              "targetBlank": true
+            }
+          ]
         },
         {
           "id": 6,
@@ -351,10 +348,222 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(((time() - kube_pod_deletion_timestamp) > 600))",
+              "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > 600) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0))",
               "refId": "A"
             }
           ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "red",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto"
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          },
+          "links": [
+            {
+              "title": "Open atlas-pods dashboard",
+              "url": "/d/atlas-pods",
+              "targetBlank": true
+            }
+          ]
+        },
+        {
+          "id": 7,
+          "type": "stat",
+          "title": "Hottest node: CPU",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 5,
+            "w": 6,
+            "x": 0,
+            "y": 5
+          },
+          "targets": [
+            {
+              "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+              "refId": "A",
+              "legendFormat": "{{node}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "percentage",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 70
+                  },
+                  {
+                    "color": "red",
+                    "value": 85
+                  }
+                ]
+              },
+              "unit": "percent",
+              "custom": {
+                "displayMode": "auto"
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value_and_name"
+          },
+          "links": [
+            {
+              "title": "Open atlas-nodes dashboard",
+              "url": "/d/atlas-nodes",
+              "targetBlank": true
+            }
+          ]
+        },
+        {
+          "id": 8,
+          "type": "stat",
+          "title": "Hottest node: RAM",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 5,
+            "w": 6,
+            "x": 6,
+            "y": 5
+          },
+          "targets": [
+            {
+              "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+              "refId": "A",
+              "legendFormat": "{{node}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "percentage",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 70
+                  },
+                  {
+                    "color": "red",
+                    "value": 85
+                  }
+                ]
+              },
+              "unit": "percent",
+              "custom": {
+                "displayMode": "auto"
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value_and_name"
+          },
+          "links": [
+            {
+              "title": "Open atlas-nodes dashboard",
+              "url": "/d/atlas-nodes",
+              "targetBlank": true
+            }
+          ]
+        },
+        {
+          "id": 9,
+          "type": "stat",
+          "title": "Hottest node: NET",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 5,
+            "w": 6,
+            "x": 12,
+            "y": 5
+          },
+          "targets": [
+            {
+              "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
+              "refId": "A",
+              "legendFormat": "{{node}}"
+            }
+          ],
           "fieldConfig": {
             "defaults": {
               "color": {
@@ -374,69 +583,10 @@ data:
                   }
                 ]
               },
-              "unit": "none"
-            },
-            "overrides": []
-          },
-          "options": {
-            "colorMode": "value",
-            "graphMode": "area",
-            "justifyMode": "center",
-            "reduceOptions": {
-              "calcs": [
-                "lastNotNull"
-              ],
-              "fields": "",
-              "values": false
-            },
-            "textMode": "value"
-          }
-        },
-        {
-          "id": 7,
-          "type": "stat",
-          "title": "Hottest node: CPU",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 5,
-            "w": 4,
-            "x": 24,
-            "y": 0
-          },
-          "targets": [
-            {
-              "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))",
-              "refId": "A",
-              "legendFormat": "{{node}}"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "color": {
-                "mode": "palette-classic"
-              },
-              "mappings": [],
-              "thresholds": {
-                "mode": "percentage",
-                "steps": [
-                  {
-                    "color": "green",
-                    "value": null
-                  },
-                  {
-                    "color": "yellow",
-                    "value": 70
-                  },
-                  {
-                    "color": "red",
-                    "value": 85
-                  }
-                ]
-              },
-              "unit": "percent"
+              "unit": "bytes/sec",
+              "custom": {
+                "displayMode": "auto"
+              }
             },
             "overrides": []
           },
@@ -452,25 +602,32 @@ data:
               "values": false
             },
             "textMode": "value_and_name"
-          }
+          },
+          "links": [
+            {
+              "title": "Open atlas-nodes dashboard",
+              "url": "/d/atlas-nodes",
+              "targetBlank": true
+            }
+          ]
         },
         {
-          "id": 8,
+          "id": 10,
           "type": "stat",
-          "title": "Hottest node: RAM",
+          "title": "Hottest node: I/O",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
             "h": 5,
-            "w": 4,
-            "x": 28,
-            "y": 0
+            "w": 6,
+            "x": 18,
+            "y": 5
           },
           "targets": [
             {
-              "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))",
+              "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
               "refId": "A",
               "legendFormat": "{{node}}"
             }
@@ -482,23 +639,22 @@ data:
               },
               "mappings": [],
               "thresholds": {
-                "mode": "percentage",
+                "mode": "absolute",
                 "steps": [
                   {
-                    "color": "green",
+                    "color": "rgba(115, 115, 115, 1)",
                     "value": null
                   },
                   {
-                    "color": "yellow",
-                    "value": 70
-                  },
-                  {
-                    "color": "red",
-                    "value": 85
+                    "color": "green",
+                    "value": 1
                   }
                 ]
               },
-              "unit": "percent"
+              "unit": "bytes/sec",
+              "custom": {
+                "displayMode": "auto"
+              }
             },
             "overrides": []
           },
@@ -514,10 +670,17 @@ data:
               "values": false
             },
             "textMode": "value_and_name"
-          }
+          },
+          "links": [
+            {
+              "title": "Open atlas-nodes dashboard",
+              "url": "/d/atlas-nodes",
+              "targetBlank": true
+            }
+          ]
         },
         {
-          "id": 9,
+          "id": 11,
           "type": "piechart",
           "title": "Namespace CPU share",
           "datasource": {
@@ -528,18 +691,18 @@ data:
             "h": 9,
             "w": 12,
             "x": 0,
-            "y": 5
+            "y": 10
           },
           "targets": [
             {
-              "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace))",
-              "refId": "A",
-              "legendFormat": "{{namespace}}"
+              "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\"}[5m])) by (namespace))",
+              "refId": "A"
             }
           ],
           "fieldConfig": {
             "defaults": {
-              "unit": "percent"
+              "unit": "percent",
+              "displayName": "{{namespace}}"
             },
             "overrides": []
           },
@@ -559,7 +722,7 @@ data:
           }
         },
         {
-          "id": 10,
+          "id": 12,
           "type": "piechart",
           "title": "Namespace RAM share",
           "datasource": {
@@ -570,18 +733,18 @@ data:
             "h": 9,
             "w": 12,
             "x": 12,
-            "y": 5
+            "y": 10
           },
           "targets": [
             {
-              "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace))",
-              "refId": "A",
-              "legendFormat": "{{namespace}}"
+              "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\"}) by (namespace))",
+              "refId": "A"
             }
           ],
           "fieldConfig": {
             "defaults": {
-              "unit": "percent"
+              "unit": "percent",
+              "displayName": "{{namespace}}"
             },
             "overrides": []
           },
@@ -601,7 +764,7 @@ data:
           }
         },
         {
-          "id": 11,
+          "id": 13,
           "type": "timeseries",
           "title": "Cluster node CPU",
           "datasource": {
@@ -612,11 +775,11 @@ data:
             "h": 8,
             "w": 12,
             "x": 0,
-            "y": 14
+            "y": 19
           },
           "targets": [
             {
-              "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))",
+              "expr": "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
               "refId": "A",
               "legendFormat": "{{node}}"
             }
@@ -638,10 +801,17 @@ data:
             "tooltip": {
               "mode": "multi"
             }
-          }
+          },
+          "links": [
+            {
+              "title": "Open atlas-nodes dashboard",
+              "url": "/d/atlas-nodes",
+              "targetBlank": true
+            }
+          ]
         },
         {
-          "id": 12,
+          "id": 14,
           "type": "timeseries",
           "title": "Cluster node RAM",
           "datasource": {
@@ -652,11 +822,11 @@ data:
             "h": 8,
             "w": 12,
             "x": 12,
-            "y": 14
+            "y": 19
           },
           "targets": [
             {
-              "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))",
+              "expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
               "refId": "A",
               "legendFormat": "{{node}}"
             }
@@ -678,92 +848,19 @@ data:
             "tooltip": {
               "mode": "multi"
             }
-          }
-        },
-        {
-          "id": 13,
-          "type": "table",
-          "title": "Problem pods (details)",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
           },
-          "gridPos": {
-            "h": 8,
-            "w": 12,
-            "x": 0,
-            "y": 22
-          },
-          "targets": [
+          "links": [
             {
-              "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})",
-              "refId": "A"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "s"
-            },
-            "overrides": []
-          },
-          "options": {
-            "showHeader": true
-          },
-          "transformations": [
-            {
-              "id": "labelsToFields",
-              "options": {}
-            }
-          ]
-        },
-        {
-          "id": 14,
-          "type": "table",
-          "title": "Terminating >10m",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 8,
-            "w": 12,
-            "x": 12,
-            "y": 22
-          },
-          "targets": [
-            {
-              "expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info",
-              "refId": "A"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "s"
-            },
-            "overrides": []
-          },
-          "options": {
-            "showHeader": true
-          },
-          "transformations": [
-            {
-              "id": "labelsToFields",
-              "options": {}
-            },
-            {
-              "id": "filterByValue",
-              "options": {
-                "match": "Value",
-                "operator": "gt",
-                "value": 600
-              }
+              "title": "Open atlas-nodes dashboard",
+              "url": "/d/atlas-nodes",
+              "targetBlank": true
             }
           ]
         },
         {
           "id": 15,
           "type": "timeseries",
-          "title": "Control plane CPU",
+          "title": "Control plane CPU (incl. titan-db)",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -772,11 +869,11 @@ data:
             "h": 7,
             "w": 12,
             "x": 0,
-            "y": 30
+            "y": 27
           },
           "targets": [
             {
-              "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}",
+              "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
               "refId": "A",
               "legendFormat": "{{node}}"
             }
@@ -790,7 +887,7 @@ data:
           "options": {
             "legend": {
               "displayMode": "table",
-              "placement": "bottom"
+              "placement": "right"
             },
             "tooltip": {
               "mode": "multi"
@@ -800,7 +897,7 @@ data:
         {
           "id": 16,
           "type": "timeseries",
-          "title": "Control plane RAM",
+          "title": "Control plane RAM (incl. titan-db)",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -809,11 +906,11 @@ data:
             "h": 7,
             "w": 12,
             "x": 12,
-            "y": 30
+            "y": 27
           },
           "targets": [
             {
-              "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}",
+              "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
               "refId": "A",
               "legendFormat": "{{node}}"
             }
@@ -827,7 +924,7 @@ data:
           "options": {
             "legend": {
               "displayMode": "table",
-              "placement": "bottom"
+              "placement": "right"
             },
             "tooltip": {
               "mode": "multi"
@@ -837,6 +934,92 @@ data:
         {
           "id": 17,
           "type": "timeseries",
+          "title": "Cluster ingress throughput",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 7,
+            "w": 12,
+            "x": 0,
+            "y": 34
+          },
+          "targets": [
+            {
+              "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "bytes/sec"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "list",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          },
+          "links": [
+            {
+              "title": "Open atlas-network dashboard",
+              "url": "/d/atlas-network",
+              "targetBlank": true
+            }
+          ]
+        },
+        {
+          "id": 18,
+          "type": "timeseries",
+          "title": "Cluster egress throughput",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 7,
+            "w": 12,
+            "x": 12,
+            "y": 34
+          },
+          "targets": [
+            {
+              "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "bytes/sec"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "list",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          },
+          "links": [
+            {
+              "title": "Open atlas-network dashboard",
+              "url": "/d/atlas-network",
+              "targetBlank": true
+            }
+          ]
+        },
+        {
+          "id": 19,
+          "type": "timeseries",
           "title": "Root filesystem usage",
           "datasource": {
             "type": "prometheus",
@@ -846,11 +1029,11 @@ data:
             "h": 8,
             "w": 12,
             "x": 0,
-            "y": 37
+            "y": 41
           },
           "targets": [
             {
-              "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)",
+              "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
               "refId": "A",
               "legendFormat": "{{node}}"
             }
@@ -873,10 +1056,17 @@ data:
               "mode": "multi"
             }
           },
-          "timeFrom": "7d"
+          "timeFrom": "30d",
+          "links": [
+            {
+              "title": "Open atlas-storage dashboard",
+              "url": "/d/atlas-storage",
+              "targetBlank": true
+            }
+          ]
         },
         {
-          "id": 18,
+          "id": 20,
           "type": "bargauge",
           "title": "Nodes closest to full root disks",
           "datasource": {
@@ -887,13 +1077,12 @@ data:
             "h": 8,
             "w": 12,
             "x": 12,
-            "y": 37
+            "y": 41
           },
           "targets": [
             {
-              "expr": "topk(8, avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info))",
-              "refId": "A",
-              "legendFormat": "{{node}}"
+              "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+              "refId": "A"
             }
           ],
           "fieldConfig": {
@@ -902,7 +1091,7 @@ data:
               "min": 0,
               "max": 100,
               "thresholds": {
-                "mode": "percentage",
+                "mode": "absolute",
                 "steps": [
                   {
                     "color": "green",
@@ -921,7 +1110,8 @@ data:
                     "value": 85
                   }
                 ]
-              }
+              },
+              "displayName": "{{node}}"
             },
             "overrides": []
           },
@@ -935,10 +1125,17 @@ data:
               "fields": "",
               "values": false
             }
-          }
+          },
+          "links": [
+            {
+              "title": "Open atlas-storage dashboard",
+              "url": "/d/atlas-storage",
+              "targetBlank": true
+            }
+          ]
         },
         {
-          "id": 19,
+          "id": 21,
           "type": "stat",
           "title": "Astreae usage",
           "datasource": {
@@ -949,7 +1146,7 @@ data:
             "h": 6,
             "w": 6,
             "x": 0,
-            "y": 45
+            "y": 49
           },
           "targets": [
             {
@@ -980,7 +1177,10 @@ data:
                   }
                 ]
               },
-              "unit": "percent"
+              "unit": "percent",
+              "custom": {
+                "displayMode": "auto"
+              }
             },
             "overrides": []
           },
@@ -996,10 +1196,17 @@ data:
               "values": false
             },
             "textMode": "value"
-          }
+          },
+          "links": [
+            {
+              "title": "Open atlas-storage dashboard",
+              "url": "/d/atlas-storage",
+              "targetBlank": true
+            }
+          ]
         },
         {
-          "id": 20,
+          "id": 22,
           "type": "stat",
           "title": "Asteria usage",
           "datasource": {
@@ -1010,7 +1217,7 @@ data:
             "h": 6,
             "w": 6,
             "x": 6,
-            "y": 45
+            "y": 49
           },
           "targets": [
             {
@@ -1041,7 +1248,10 @@ data:
                   }
                 ]
               },
-              "unit": "percent"
+              "unit": "percent",
+              "custom": {
+                "displayMode": "auto"
+              }
             },
             "overrides": []
           },
@@ -1057,10 +1267,17 @@ data:
               "values": false
             },
             "textMode": "value"
-          }
+          },
+          "links": [
+            {
+              "title": "Open atlas-storage dashboard",
+              "url": "/d/atlas-storage",
+              "targetBlank": true
+            }
+          ]
         },
         {
-          "id": 21,
+          "id": 23,
           "type": "stat",
           "title": "Astreae free",
           "datasource": {
@@ -1071,7 +1288,7 @@ data:
             "h": 6,
             "w": 6,
             "x": 12,
-            "y": 45
+            "y": 49
           },
           "targets": [
             {
@@ -1098,7 +1315,10 @@ data:
                   }
                 ]
               },
-              "unit": "bytesSI"
+              "unit": "bytesSI",
+              "custom": {
+                "displayMode": "auto"
+              }
             },
             "overrides": []
           },
@@ -1114,10 +1334,17 @@ data:
               "values": false
             },
             "textMode": "value"
-          }
+          },
+          "links": [
+            {
+              "title": "Open atlas-storage dashboard",
+              "url": "/d/atlas-storage",
+              "targetBlank": true
+            }
+          ]
         },
         {
-          "id": 22,
+          "id": 24,
           "type": "stat",
           "title": "Asteria free",
           "datasource": {
@@ -1128,7 +1355,7 @@ data:
             "h": 6,
             "w": 6,
             "x": 18,
-            "y": 45
+            "y": 49
           },
           "targets": [
             {
@@ -1155,7 +1382,10 @@ data:
                   }
                 ]
               },
-              "unit": "bytesSI"
+              "unit": "bytesSI",
+              "custom": {
+                "displayMode": "auto"
+              }
             },
             "overrides": []
           },
@@ -1171,77 +1401,12 @@ data:
               "values": false
             },
             "textMode": "value"
-          }
-        },
-        {
-          "id": 23,
-          "type": "table",
-          "title": "Astreae per-node usage",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
           },
-          "gridPos": {
-            "h": 8,
-            "w": 12,
-            "x": 0,
-            "y": 51
-          },
-          "targets": [
+          "links": [
             {
-              "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)",
-              "refId": "A"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "percent"
-            },
-            "overrides": []
-          },
-          "options": {
-            "showHeader": true
-          },
-          "transformations": [
-            {
-              "id": "labelsToFields",
-              "options": {}
-            }
-          ]
-        },
-        {
-          "id": 24,
-          "type": "table",
-          "title": "Asteria per-node usage",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 8,
-            "w": 12,
-            "x": 12,
-            "y": 51
-          },
-          "targets": [
-            {
-              "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)",
-              "refId": "A"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "percent"
-            },
-            "overrides": []
-          },
-          "options": {
-            "showHeader": true
-          },
-          "transformations": [
-            {
-              "id": "labelsToFields",
-              "options": {}
+              "title": "Open atlas-storage dashboard",
+              "url": "/d/atlas-storage",
+              "targetBlank": true
             }
           ]
         },
@@ -1253,16 +1418,15 @@ data:
             "h": 5,
             "w": 24,
             "x": 0,
-            "y": 59
+            "y": 55
           },
           "datasource": null,
           "options": {
             "mode": "markdown",
-            "content": "### Atlas at a glance\n- Summary metrics above pull from dedicated Pods/Nodes/Storage dashboards\n- Click the dashboard links in the header to drill into details\n- Anonymous users see this overview; login unlocks the detailed folders"
+            "content": "### Atlas Overview\n- Anonymous users land here; follow the panel links for pod/node/storage/network drill-downs.\n- Control plane workload count flags any non-system pods that slipped onto the HA nodes.\n- Problem and stuck pods use kube-state-metrics so counts and detail tables match exactly."
           }
         }
       ],
-      "refresh": "30s",
       "schemaVersion": 39,
       "style": "dark",
       "tags": [
@@ -1275,5 +1439,31 @@ data:
       "time": {
         "from": "now-12h",
         "to": "now"
-      }
+      },
+      "links": [
+        {
+          "title": "Atlas Pods",
+          "type": "dashboard",
+          "dashboardUid": "atlas-pods",
+          "keepTime": false
+        },
+        {
+          "title": "Atlas Nodes",
+          "type": "dashboard",
+          "dashboardUid": "atlas-nodes",
+          "keepTime": false
+        },
+        {
+          "title": "Atlas Storage",
+          "type": "dashboard",
+          "dashboardUid": "atlas-storage",
+          "keepTime": false
+        },
+        {
+          "title": "Atlas Network",
+          "type": "dashboard",
+          "dashboardUid": "atlas-network",
+          "keepTime": false
+        }
+      ]
     }
diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml
index 3b1f5da..58cae77 100644
--- a/services/monitoring/grafana-dashboard-pods.yaml
+++ b/services/monitoring/grafana-dashboard-pods.yaml
@@ -10,11 +10,251 @@ data:
     {
       "uid": "atlas-pods",
       "title": "Atlas Pods",
-      "folderUid": "atlas-pods",
+      "folderUid": "atlas-internal",
       "editable": true,
       "panels": [
         {
           "id": 1,
+          "type": "stat",
+          "title": "Problem pods",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 4,
+            "w": 6,
+            "x": 0,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "red",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto"
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 2,
+          "type": "stat",
+          "title": "CrashLoop / ImagePull",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 4,
+            "w": 6,
+            "x": 6,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "red",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto"
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 3,
+          "type": "stat",
+          "title": "Stuck terminating (>10m)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 4,
+            "w": 6,
+            "x": 12,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > 0))) or on() vector(0)",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "red",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto"
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 4,
+          "type": "stat",
+          "title": "Control plane workloads",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 4,
+            "w": 6,
+            "x": 18,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "red",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto"
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 5,
           "type": "table",
           "title": "Pods not running",
           "datasource": {
@@ -25,11 +265,11 @@ data:
             "h": 10,
             "w": 24,
             "x": 0,
-            "y": 0
+            "y": 4
           },
           "targets": [
             {
-              "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})",
+              "expr": "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"} > 0)",
               "refId": "A"
             }
           ],
@@ -50,7 +290,7 @@ data:
           ]
         },
         {
-          "id": 2,
+          "id": 6,
           "type": "table",
           "title": "CrashLoop / ImagePull",
           "datasource": {
@@ -61,11 +301,11 @@ data:
             "h": 10,
             "w": 24,
             "x": 0,
-            "y": 10
+            "y": 14
           },
           "targets": [
             {
-              "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) sum by (namespace,pod,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})",
+              "expr": "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(reason) max by (namespace,pod,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"} > 0)",
               "refId": "A"
             }
           ],
@@ -86,9 +326,9 @@ data:
           ]
         },
         {
-          "id": 3,
+          "id": 7,
           "type": "table",
-          "title": "Terminating pods",
+          "title": "Terminating >10m",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -97,11 +337,11 @@ data:
             "h": 10,
             "w": 24,
             "x": 0,
-            "y": 20
+            "y": 24
           },
           "targets": [
             {
-              "expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info",
+              "expr": "((time() - kube_pod_deletion_timestamp{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0",
               "refId": "A"
             }
           ],
diff --git a/services/monitoring/grafana-dashboard-storage.yaml b/services/monitoring/grafana-dashboard-storage.yaml
index 5b22804..99439fb 100644
--- a/services/monitoring/grafana-dashboard-storage.yaml
+++ b/services/monitoring/grafana-dashboard-storage.yaml
@@ -10,7 +10,7 @@ data:
     {
       "uid": "atlas-storage",
       "title": "Atlas Storage",
-      "folderUid": "atlas-storage",
+      "folderUid": "atlas-internal",
       "editable": true,
       "panels": [
         {
@@ -40,19 +40,26 @@ data:
               },
               "mappings": [],
               "thresholds": {
-                "mode": "absolute",
+                "mode": "percentage",
                 "steps": [
                   {
-                    "color": "rgba(115, 115, 115, 1)",
+                    "color": "green",
                     "value": null
                   },
                   {
-                    "color": "green",
-                    "value": 1
+                    "color": "yellow",
+                    "value": 70
+                  },
+                  {
+                    "color": "red",
+                    "value": 85
                   }
                 ]
               },
-              "unit": "percent"
+              "unit": "percent",
+              "custom": {
+                "displayMode": "auto"
+              }
             },
             "overrides": []
           },
@@ -97,19 +104,26 @@ data:
               },
               "mappings": [],
               "thresholds": {
-                "mode": "absolute",
+                "mode": "percentage",
                 "steps": [
                   {
-                    "color": "rgba(115, 115, 115, 1)",
+                    "color": "green",
                     "value": null
                   },
                   {
-                    "color": "green",
-                    "value": 1
+                    "color": "yellow",
+                    "value": 70
+                  },
+                  {
+                    "color": "red",
+                    "value": 85
                   }
                 ]
               },
-              "unit": "percent"
+              "unit": "percent",
+              "custom": {
+                "displayMode": "auto"
+              }
             },
             "overrides": []
           },
@@ -166,7 +180,10 @@ data:
                   }
                 ]
               },
-              "unit": "bytesSI"
+              "unit": "bytesSI",
+              "custom": {
+                "displayMode": "auto"
+              }
             },
             "overrides": []
           },
@@ -223,7 +240,10 @@ data:
                   }
                 ]
               },
-              "unit": "bytesSI"
+              "unit": "bytesSI",
+              "custom": {
+                "displayMode": "auto"
+              }
             },
             "overrides": []
           },
@@ -244,20 +264,20 @@ data:
         {
           "id": 5,
           "type": "timeseries",
-          "title": "Root filesystem",
+          "title": "Astreae per-node usage",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
             "h": 9,
-            "w": 24,
+            "w": 12,
             "x": 0,
             "y": 5
           },
           "targets": [
             {
-              "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)",
+              "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
               "refId": "A",
               "legendFormat": "{{node}}"
             }
@@ -281,21 +301,59 @@ data:
         },
         {
           "id": 6,
-          "type": "table",
-          "title": "Astreae nodes",
+          "type": "timeseries",
+          "title": "Asteria per-node usage",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 10,
+            "h": 9,
+            "w": 12,
+            "x": 12,
+            "y": 5
+          },
+          "targets": [
+            {
+              "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
+              "refId": "A",
+              "legendFormat": "{{node}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "right"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          },
+          "timeFrom": "30d"
+        },
+        {
+          "id": 7,
+          "type": "timeseries",
+          "title": "Astreae usage history",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
             "w": 12,
             "x": 0,
             "y": 14
           },
           "targets": [
             {
-              "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)",
+              "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)",
               "refId": "A"
             }
           ],
@@ -306,32 +364,33 @@ data:
             "overrides": []
           },
           "options": {
-            "showHeader": true
-          },
-          "transformations": [
-            {
-              "id": "labelsToFields",
-              "options": {}
+            "legend": {
+              "displayMode": "table",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
             }
-          ]
+          },
+          "timeFrom": "90d"
         },
         {
-          "id": 7,
-          "type": "table",
-          "title": "Asteria nodes",
+          "id": 8,
+          "type": "timeseries",
+          "title": "Asteria usage history",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 10,
+            "h": 9,
             "w": 12,
             "x": 12,
             "y": 14
           },
           "targets": [
             {
-              "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)",
+              "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)",
               "refId": "A"
             }
           ],
@@ -342,14 +401,15 @@ data:
             "overrides": []
           },
           "options": {
-            "showHeader": true
-          },
-          "transformations": [
-            {
-              "id": "labelsToFields",
-              "options": {}
+            "legend": {
+              "displayMode": "table",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
             }
-          ]
+          },
+          "timeFrom": "90d"
         }
       ],
       "time": {
diff --git a/services/monitoring/grafana-folders.yaml b/services/monitoring/grafana-folders.yaml
index d390679..c52b4e1 100644
--- a/services/monitoring/grafana-folders.yaml
+++ b/services/monitoring/grafana-folders.yaml
@@ -19,22 +19,8 @@ data:
             permission: Edit
           - role: Admin
             permission: Admin
-      - uid: atlas-pods
-        title: Atlas Pods
-        permissions:
-          - role: Editor
-            permission: View
-          - role: Admin
-            permission: Admin
-      - uid: atlas-nodes
-        title: Atlas Nodes
-        permissions:
-          - role: Editor
-            permission: View
-          - role: Admin
-            permission: Admin
-      - uid: atlas-storage
-        title: Atlas Storage
+      - uid: atlas-internal
+        title: Atlas Internal
         permissions:
           - role: Editor
             permission: View
diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index e23f903..58035b6 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -288,7 +288,7 @@ spec:
               path: /var/lib/grafana/dashboards/overview
           - name: pods
             orgId: 1
-            folder: Atlas Pods
+            folder: Atlas Internal
             type: file
             disableDeletion: false
             editable: true
@@ -296,7 +296,7 @@ spec:
               path: /var/lib/grafana/dashboards/pods
           - name: nodes
             orgId: 1
-            folder: Atlas Nodes
+            folder: Atlas Internal
             type: file
             disableDeletion: false
             editable: true
@@ -304,17 +304,26 @@ spec:
               path: /var/lib/grafana/dashboards/nodes
           - name: storage
             orgId: 1
-            folder: Atlas Storage
+            folder: Atlas Internal
             type: file
             disableDeletion: false
             editable: true
             options:
               path: /var/lib/grafana/dashboards/storage
+          - name: network
+            orgId: 1
+            folder: Atlas Internal
+            type: file
+            disableDeletion: false
+            editable: true
+            options:
+              path: /var/lib/grafana/dashboards/network
     dashboardsConfigMaps:
       overview: grafana-dashboard-overview
       pods: grafana-dashboard-pods
       nodes: grafana-dashboard-nodes
       storage: grafana-dashboard-storage
+      network: grafana-dashboard-network
     extraConfigmapMounts:
       - name: grafana-folders
         mountPath: /etc/grafana/provisioning/folders
diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml
index 282ee4f..76263c1 100644
--- a/services/monitoring/kustomization.yaml
+++ b/services/monitoring/kustomization.yaml
@@ -9,5 +9,6 @@ resources:
   - grafana-dashboard-pods.yaml
   - grafana-dashboard-nodes.yaml
   - grafana-dashboard-storage.yaml
+  - grafana-dashboard-network.yaml
   - grafana-folders.yaml
   - helmrelease.yaml

From 349d9c56ac4db8c9fa18b0923acc0917411beb4c Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 17 Nov 2025 18:55:11 -0300
Subject: [PATCH 15/71] monitoring: polish dashboards

---
 scripts/render_dashboards.py                  | 55 +++++++++++--------
 .../monitoring/dashboards/atlas-network.json  | 21 +++----
 .../monitoring/dashboards/atlas-overview.json | 38 ++++++-------
 .../monitoring/dashboards/atlas-pods.json     |  4 +-
 .../monitoring/dashboards/atlas-storage.json  |  8 +--
 .../monitoring/grafana-dashboard-network.yaml | 21 +++----
 .../grafana-dashboard-overview.yaml           | 38 ++++++-------
 .../monitoring/grafana-dashboard-pods.yaml    |  4 +-
 .../monitoring/grafana-dashboard-storage.yaml |  8 +--
 9 files changed, 104 insertions(+), 93 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index 67e486a..083ddfe 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -80,6 +80,7 @@ WORKER_TOTAL = len(WORKER_NODES)
 CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
 WORKER_SUFFIX = f"/{WORKER_TOTAL}"
 CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring"
+LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
 
 # ---------------------------------------------------------------------------
 # PromQL helpers
@@ -149,9 +150,10 @@ CRASHLOOP_EXPR = (
     '{reason=~"CrashLoopBackOff|ImagePullBackOff"}))'
 )
 STUCK_TERMINATING_EXPR = (
-    'sum(max by (namespace,pod) (('
-    '(time() - kube_pod_deletion_timestamp{pod!=""}) > 600'
-    ') and on(namespace,pod) kube_pod_deletion_timestamp{pod!=""} > 0))'
+    'sum(max by (namespace,pod) ('
+    '((time() - kube_pod_deletion_timestamp{pod!=""}) > bool 600)'
+    ' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)'
+    '))'
 )
 
 PROBLEM_TABLE_EXPR = (
@@ -168,9 +170,11 @@ CRASHLOOP_TABLE_EXPR = (
     "(kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})"
 )
 STUCK_TABLE_EXPR = (
+    "("
     "((time() - kube_pod_deletion_timestamp{pod!=\"\"}) "
-    "* on(namespace,pod) group_left(node) kube_pod_info) "
-    "and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0"
+    "and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) "
+    "* on(namespace,pod) group_left(node) kube_pod_info"
+    ")"
 )
 
 NAMESPACE_CPU_EXPR = (
@@ -192,6 +196,7 @@ IO_SERIES_EXPR = (
     "+ rate(node_disk_written_bytes_total[5m]))"
 )
 IO_TOP_EXPR = f"topk(1, {IO_SERIES_EXPR})"
+TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
 NET_INGRESS_EXPR = (
     'sum(rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m])) '
     "or on() vector(0)"
@@ -216,6 +221,7 @@ def stat_panel(
     thresholds=None,
     text_mode="value",
     legend=None,
+    display_name=None,
     value_suffix=None,
     links=None,
 ):
@@ -236,6 +242,8 @@ def stat_panel(
     }
     if value_suffix:
         defaults["custom"]["valueSuffix"] = value_suffix
+    if display_name:
+        defaults["displayName"] = display_name
     panel = {
         "id": panel_id,
         "type": "stat",
@@ -449,8 +457,8 @@ def build_overview():
     hottest = [
         (7, "Hottest node: CPU", f"topk(1, {node_cpu_expr()})", "percent"),
         (8, "Hottest node: RAM", f"topk(1, {node_mem_expr()})", "percent"),
-        (9, "Hottest node: NET", NET_TOP_EXPR, "bytes/sec"),
-        (10, "Hottest node: I/O", IO_TOP_EXPR, "bytes/sec"),
+        (9, "Hottest node: NET", NET_TOP_EXPR, "Bps"),
+        (10, "Hottest node: I/O", IO_TOP_EXPR, "Bps"),
     ]
     for idx, (panel_id, title, expr, unit) in enumerate(hottest):
         panels.append(
@@ -462,7 +470,7 @@ def build_overview():
                 unit=unit,
                 thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
                 text_mode="value_and_name",
-                legend="{{node}}",
+                display_name="{{node}}",
                 links=link_to("atlas-nodes"),
             )
         )
@@ -544,7 +552,7 @@ def build_overview():
             "Cluster ingress throughput",
             NET_INGRESS_EXPR,
             {"h": 7, "w": 12, "x": 0, "y": 34},
-            unit="bytes/sec",
+            unit="Bps",
             legend_display="list",
             legend_placement="bottom",
             links=link_to("atlas-network"),
@@ -556,7 +564,7 @@ def build_overview():
             "Cluster egress throughput",
             NET_EGRESS_EXPR,
             {"h": 7, "w": 12, "x": 12, "y": 34},
-            unit="bytes/sec",
+            unit="Bps",
             legend_display="list",
             legend_placement="bottom",
             links=link_to("atlas-network"),
@@ -616,8 +624,8 @@ def build_overview():
     storage_panels = [
         (21, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"),
         (22, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"),
-        (23, "Astreae free", astreae_free_expr("/mnt/astreae"), "bytesSI"),
-        (24, "Asteria free", astreae_free_expr("/mnt/asteria"), "bytesSI"),
+        (23, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"),
+        (24, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"),
     ]
     for idx, (panel_id, title, expr, unit) in enumerate(storage_panels):
         panels.append(
@@ -911,7 +919,7 @@ def build_storage_dashboard():
             "Astreae free",
             astreae_free_expr("/mnt/astreae"),
             {"h": 5, "w": 6, "x": 12, "y": 0},
-            unit="bytesSI",
+            unit="decbytes",
         )
     )
     panels.append(
@@ -920,14 +928,14 @@ def build_storage_dashboard():
             "Asteria free",
             astreae_free_expr("/mnt/asteria"),
             {"h": 5, "w": 6, "x": 18, "y": 0},
-            unit="bytesSI",
+            unit="decbytes",
         )
     )
     panels.append(
         timeseries_panel(
             5,
             "Astreae per-node usage",
-            filesystem_usage_expr("/mnt/astreae"),
+            filesystem_usage_expr("/mnt/astreae", LONGHORN_NODE_REGEX),
             {"h": 9, "w": 12, "x": 0, "y": 5},
             unit="percent",
             legend="{{node}}",
@@ -940,7 +948,7 @@ def build_storage_dashboard():
         timeseries_panel(
             6,
             "Asteria per-node usage",
-            filesystem_usage_expr("/mnt/asteria"),
+            filesystem_usage_expr("/mnt/asteria", LONGHORN_NODE_REGEX),
             {"h": 9, "w": 12, "x": 12, "y": 5},
             unit="percent",
             legend="{{node}}",
@@ -986,18 +994,19 @@ def build_storage_dashboard():
 def build_network_dashboard():
     panels = []
     panels.append(
-        stat_panel(1, "Ingress bytes/s", NET_INGRESS_EXPR, {"h": 4, "w": 8, "x": 0, "y": 0}, unit="bytes/sec")
+        stat_panel(1, "Ingress traffic", NET_INGRESS_EXPR, {"h": 4, "w": 8, "x": 0, "y": 0}, unit="Bps")
     )
     panels.append(
-        stat_panel(2, "Egress bytes/s", NET_EGRESS_EXPR, {"h": 4, "w": 8, "x": 8, "y": 0}, unit="bytes/sec")
+        stat_panel(2, "Egress traffic", NET_EGRESS_EXPR, {"h": 4, "w": 8, "x": 8, "y": 0}, unit="Bps")
     )
     panels.append(
         stat_panel(
             3,
             "Top router req/s",
-            'max(topk(1, rate(traefik_router_requests_total[5m])))',
+            f"topk(1, {TRAEFIK_ROUTER_EXPR})",
             {"h": 4, "w": 8, "x": 16, "y": 0},
             unit="req/s",
+            display_name="{{router}}",
         )
     )
     panels.append(
@@ -1006,7 +1015,7 @@ def build_network_dashboard():
             "Per-node throughput",
             NET_SERIES_EXPR,
             {"h": 8, "w": 24, "x": 0, "y": 4},
-            unit="bytes/sec",
+            unit="Bps",
             legend="{{node}}",
             legend_display="table",
             legend_placement="right",
@@ -1019,7 +1028,7 @@ def build_network_dashboard():
             'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) '
             '+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))',
             {"h": 9, "w": 12, "x": 0, "y": 12},
-            unit="bytes/sec",
+            unit="Bps",
             transformations=[{"id": "labelsToFields", "options": {}}],
         )
     )
@@ -1030,7 +1039,7 @@ def build_network_dashboard():
             'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) '
             '+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))',
             {"h": 9, "w": 12, "x": 12, "y": 12},
-            unit="bytes/sec",
+            unit="Bps",
             transformations=[{"id": "labelsToFields", "options": {}}],
         )
     )
@@ -1038,7 +1047,7 @@ def build_network_dashboard():
         timeseries_panel(
             7,
             "Traefik routers (req/s)",
-            'topk(10, rate(traefik_router_requests_total[5m]))',
+            f"topk(10, {TRAEFIK_ROUTER_EXPR})",
             {"h": 9, "w": 12, "x": 0, "y": 21},
             unit="req/s",
             legend="{{router}}",
diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json
index 3846d2a..369024f 100644
--- a/services/monitoring/dashboards/atlas-network.json
+++ b/services/monitoring/dashboards/atlas-network.json
@@ -7,7 +7,7 @@
     {
       "id": 1,
       "type": "stat",
-      "title": "Ingress bytes/s",
+      "title": "Ingress traffic",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -43,7 +43,7 @@
               }
             ]
           },
-          "unit": "bytes/sec",
+          "unit": "Bps",
           "custom": {
             "displayMode": "auto"
           }
@@ -67,7 +67,7 @@
     {
       "id": 2,
       "type": "stat",
-      "title": "Egress bytes/s",
+      "title": "Egress traffic",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -103,7 +103,7 @@
               }
             ]
           },
-          "unit": "bytes/sec",
+          "unit": "Bps",
           "custom": {
             "displayMode": "auto"
           }
@@ -140,7 +140,7 @@
       },
       "targets": [
         {
-          "expr": "max(topk(1, rate(traefik_router_requests_total[5m])))",
+          "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
           "refId": "A"
         }
       ],
@@ -166,7 +166,8 @@
           "unit": "req/s",
           "custom": {
             "displayMode": "auto"
-          }
+          },
+          "displayName": "{{router}}"
         },
         "overrides": []
       },
@@ -207,7 +208,7 @@
       ],
       "fieldConfig": {
         "defaults": {
-          "unit": "bytes/sec"
+          "unit": "Bps"
         },
         "overrides": []
       },
@@ -243,7 +244,7 @@
       ],
       "fieldConfig": {
         "defaults": {
-          "unit": "bytes/sec"
+          "unit": "Bps"
         },
         "overrides": []
       },
@@ -279,7 +280,7 @@
       ],
       "fieldConfig": {
         "defaults": {
-          "unit": "bytes/sec"
+          "unit": "Bps"
         },
         "overrides": []
       },
@@ -309,7 +310,7 @@
       },
       "targets": [
         {
-          "expr": "topk(10, rate(traefik_router_requests_total[5m]))",
+          "expr": "topk(10, sum by (router) (rate(traefik_router_requests_total[5m])))",
           "refId": "A",
           "legendFormat": "{{router}}"
         }
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 3377a13..ec7a848 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -339,7 +339,7 @@
       },
       "targets": [
         {
-          "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > 600) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0))",
+          "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))",
           "refId": "A"
         }
       ],
@@ -407,8 +407,7 @@
       "targets": [
         {
           "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
-          "refId": "A",
-          "legendFormat": "{{node}}"
+          "refId": "A"
         }
       ],
       "fieldConfig": {
@@ -437,7 +436,8 @@
           "unit": "percent",
           "custom": {
             "displayMode": "auto"
-          }
+          },
+          "displayName": "{{node}}"
         },
         "overrides": []
       },
@@ -479,8 +479,7 @@
       "targets": [
         {
           "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
-          "refId": "A",
-          "legendFormat": "{{node}}"
+          "refId": "A"
         }
       ],
       "fieldConfig": {
@@ -509,7 +508,8 @@
           "unit": "percent",
           "custom": {
             "displayMode": "auto"
-          }
+          },
+          "displayName": "{{node}}"
         },
         "overrides": []
       },
@@ -551,8 +551,7 @@
       "targets": [
         {
           "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
-          "refId": "A",
-          "legendFormat": "{{node}}"
+          "refId": "A"
         }
       ],
       "fieldConfig": {
@@ -574,10 +573,11 @@
               }
             ]
           },
-          "unit": "bytes/sec",
+          "unit": "Bps",
           "custom": {
             "displayMode": "auto"
-          }
+          },
+          "displayName": "{{node}}"
         },
         "overrides": []
       },
@@ -619,8 +619,7 @@
       "targets": [
         {
           "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
-          "refId": "A",
-          "legendFormat": "{{node}}"
+          "refId": "A"
         }
       ],
       "fieldConfig": {
@@ -642,10 +641,11 @@
               }
             ]
           },
-          "unit": "bytes/sec",
+          "unit": "Bps",
           "custom": {
             "displayMode": "auto"
-          }
+          },
+          "displayName": "{{node}}"
         },
         "overrides": []
       },
@@ -944,7 +944,7 @@
       ],
       "fieldConfig": {
         "defaults": {
-          "unit": "bytes/sec"
+          "unit": "Bps"
         },
         "overrides": []
       },
@@ -987,7 +987,7 @@
       ],
       "fieldConfig": {
         "defaults": {
-          "unit": "bytes/sec"
+          "unit": "Bps"
         },
         "overrides": []
       },
@@ -1306,7 +1306,7 @@
               }
             ]
           },
-          "unit": "bytesSI",
+          "unit": "decbytes",
           "custom": {
             "displayMode": "auto"
           }
@@ -1373,7 +1373,7 @@
               }
             ]
           },
-          "unit": "bytesSI",
+          "unit": "decbytes",
           "custom": {
             "displayMode": "auto"
           }
diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json
index 3e7dd0e..8494e89 100644
--- a/services/monitoring/dashboards/atlas-pods.json
+++ b/services/monitoring/dashboards/atlas-pods.json
@@ -140,7 +140,7 @@
       },
       "targets": [
         {
-          "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > 600) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0))",
+          "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))",
           "refId": "A"
         }
       ],
@@ -332,7 +332,7 @@
       },
       "targets": [
         {
-          "expr": "((time() - kube_pod_deletion_timestamp{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0",
+          "expr": "(((time() - kube_pod_deletion_timestamp{pod!=\"\"}) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) * on(namespace,pod) group_left(node) kube_pod_info)",
           "refId": "A"
         }
       ],
diff --git a/services/monitoring/dashboards/atlas-storage.json b/services/monitoring/dashboards/atlas-storage.json
index bb7d152..6585794 100644
--- a/services/monitoring/dashboards/atlas-storage.json
+++ b/services/monitoring/dashboards/atlas-storage.json
@@ -171,7 +171,7 @@
               }
             ]
           },
-          "unit": "bytesSI",
+          "unit": "decbytes",
           "custom": {
             "displayMode": "auto"
           }
@@ -231,7 +231,7 @@
               }
             ]
           },
-          "unit": "bytesSI",
+          "unit": "decbytes",
           "custom": {
             "displayMode": "auto"
           }
@@ -268,7 +268,7 @@
       },
       "targets": [
         {
-          "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
+          "expr": "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
           "refId": "A",
           "legendFormat": "{{node}}"
         }
@@ -306,7 +306,7 @@
       },
       "targets": [
         {
-          "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
+          "expr": "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
           "refId": "A",
           "legendFormat": "{{node}}"
         }
diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml
index e1ba054..07c8b7a 100644
--- a/services/monitoring/grafana-dashboard-network.yaml
+++ b/services/monitoring/grafana-dashboard-network.yaml
@@ -16,7 +16,7 @@ data:
         {
           "id": 1,
           "type": "stat",
-          "title": "Ingress bytes/s",
+          "title": "Ingress traffic",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -52,7 +52,7 @@ data:
                   }
                 ]
               },
-              "unit": "bytes/sec",
+              "unit": "Bps",
               "custom": {
                 "displayMode": "auto"
               }
@@ -76,7 +76,7 @@ data:
         {
           "id": 2,
           "type": "stat",
-          "title": "Egress bytes/s",
+          "title": "Egress traffic",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -112,7 +112,7 @@ data:
                   }
                 ]
               },
-              "unit": "bytes/sec",
+              "unit": "Bps",
               "custom": {
                 "displayMode": "auto"
               }
@@ -149,7 +149,7 @@ data:
           },
           "targets": [
             {
-              "expr": "max(topk(1, rate(traefik_router_requests_total[5m])))",
+              "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
               "refId": "A"
             }
           ],
@@ -175,7 +175,8 @@ data:
               "unit": "req/s",
               "custom": {
                 "displayMode": "auto"
-              }
+              },
+              "displayName": "{{router}}"
             },
             "overrides": []
           },
@@ -216,7 +217,7 @@ data:
           ],
           "fieldConfig": {
             "defaults": {
-              "unit": "bytes/sec"
+              "unit": "Bps"
             },
             "overrides": []
           },
@@ -252,7 +253,7 @@ data:
           ],
           "fieldConfig": {
             "defaults": {
-              "unit": "bytes/sec"
+              "unit": "Bps"
             },
             "overrides": []
           },
@@ -288,7 +289,7 @@ data:
           ],
           "fieldConfig": {
             "defaults": {
-              "unit": "bytes/sec"
+              "unit": "Bps"
             },
             "overrides": []
           },
@@ -318,7 +319,7 @@ data:
           },
           "targets": [
             {
-              "expr": "topk(10, rate(traefik_router_requests_total[5m]))",
+              "expr": "topk(10, sum by (router) (rate(traefik_router_requests_total[5m])))",
               "refId": "A",
               "legendFormat": "{{router}}"
             }
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 199dfb2..bb3bb11 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -348,7 +348,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > 600) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0))",
+              "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))",
               "refId": "A"
             }
           ],
@@ -416,8 +416,7 @@ data:
           "targets": [
             {
               "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
-              "refId": "A",
-              "legendFormat": "{{node}}"
+              "refId": "A"
             }
           ],
           "fieldConfig": {
@@ -446,7 +445,8 @@ data:
               "unit": "percent",
               "custom": {
                 "displayMode": "auto"
-              }
+              },
+              "displayName": "{{node}}"
             },
             "overrides": []
           },
@@ -488,8 +488,7 @@ data:
           "targets": [
             {
               "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
-              "refId": "A",
-              "legendFormat": "{{node}}"
+              "refId": "A"
             }
           ],
           "fieldConfig": {
@@ -518,7 +517,8 @@ data:
               "unit": "percent",
               "custom": {
                 "displayMode": "auto"
-              }
+              },
+              "displayName": "{{node}}"
             },
             "overrides": []
           },
@@ -560,8 +560,7 @@ data:
           "targets": [
             {
               "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
-              "refId": "A",
-              "legendFormat": "{{node}}"
+              "refId": "A"
             }
           ],
           "fieldConfig": {
@@ -583,10 +582,11 @@ data:
                   }
                 ]
               },
-              "unit": "bytes/sec",
+              "unit": "Bps",
               "custom": {
                 "displayMode": "auto"
-              }
+              },
+              "displayName": "{{node}}"
             },
             "overrides": []
           },
@@ -628,8 +628,7 @@ data:
           "targets": [
             {
               "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
-              "refId": "A",
-              "legendFormat": "{{node}}"
+              "refId": "A"
             }
           ],
           "fieldConfig": {
@@ -651,10 +650,11 @@ data:
                   }
                 ]
               },
-              "unit": "bytes/sec",
+              "unit": "Bps",
               "custom": {
                 "displayMode": "auto"
-              }
+              },
+              "displayName": "{{node}}"
             },
             "overrides": []
           },
@@ -953,7 +953,7 @@ data:
           ],
           "fieldConfig": {
             "defaults": {
-              "unit": "bytes/sec"
+              "unit": "Bps"
             },
             "overrides": []
           },
@@ -996,7 +996,7 @@ data:
           ],
           "fieldConfig": {
             "defaults": {
-              "unit": "bytes/sec"
+              "unit": "Bps"
             },
             "overrides": []
           },
@@ -1315,7 +1315,7 @@ data:
                   }
                 ]
               },
-              "unit": "bytesSI",
+              "unit": "decbytes",
               "custom": {
                 "displayMode": "auto"
               }
@@ -1382,7 +1382,7 @@ data:
                   }
                 ]
               },
-              "unit": "bytesSI",
+              "unit": "decbytes",
               "custom": {
                 "displayMode": "auto"
               }
diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml
index 58cae77..e160eca 100644
--- a/services/monitoring/grafana-dashboard-pods.yaml
+++ b/services/monitoring/grafana-dashboard-pods.yaml
@@ -149,7 +149,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > 600) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0))",
+              "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))",
               "refId": "A"
             }
           ],
@@ -341,7 +341,7 @@ data:
           },
           "targets": [
             {
-              "expr": "((time() - kube_pod_deletion_timestamp{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0",
+              "expr": "(((time() - kube_pod_deletion_timestamp{pod!=\"\"}) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) * on(namespace,pod) group_left(node) kube_pod_info)",
               "refId": "A"
             }
           ],
diff --git a/services/monitoring/grafana-dashboard-storage.yaml b/services/monitoring/grafana-dashboard-storage.yaml
index 99439fb..1bbf1ea 100644
--- a/services/monitoring/grafana-dashboard-storage.yaml
+++ b/services/monitoring/grafana-dashboard-storage.yaml
@@ -180,7 +180,7 @@ data:
                   }
                 ]
               },
-              "unit": "bytesSI",
+              "unit": "decbytes",
               "custom": {
                 "displayMode": "auto"
               }
@@ -240,7 +240,7 @@ data:
                   }
                 ]
               },
-              "unit": "bytesSI",
+              "unit": "decbytes",
               "custom": {
                 "displayMode": "auto"
               }
@@ -277,7 +277,7 @@ data:
           },
           "targets": [
             {
-              "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
+              "expr": "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
               "refId": "A",
               "legendFormat": "{{node}}"
             }
@@ -315,7 +315,7 @@ data:
           },
           "targets": [
             {
-              "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
+              "expr": "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
               "refId": "A",
               "legendFormat": "{{node}}"
             }

From fe8deea9c728bce682be9ee7954c3ddab6a6dd7e Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 17 Nov 2025 19:24:03 -0300
Subject: [PATCH 16/71] monitoring: tighten overview stats

---
 scripts/render_dashboards.py                  | 29 ++++++---
 .../monitoring/dashboards/atlas-network.json  |  6 +-
 .../monitoring/dashboards/atlas-overview.json | 64 ++++++++++++++-----
 .../monitoring/grafana-dashboard-network.yaml |  6 +-
 .../grafana-dashboard-overview.yaml           | 64 ++++++++++++++-----
 5 files changed, 121 insertions(+), 48 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index 083ddfe..b88d5a4 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -221,7 +221,6 @@ def stat_panel(
     thresholds=None,
     text_mode="value",
     legend=None,
-    display_name=None,
     value_suffix=None,
     links=None,
 ):
@@ -242,8 +241,6 @@ def stat_panel(
     }
     if value_suffix:
         defaults["custom"]["valueSuffix"] = value_suffix
-    if display_name:
-        defaults["displayName"] = display_name
     panel = {
         "id": panel_id,
         "type": "stat",
@@ -385,7 +382,7 @@ def build_overview():
         (1, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})', None, None, None),
         (
             2,
-            "Ready nodes",
+            "Ready workers",
             f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
             WORKER_SUFFIX,
             WORKER_TOTAL,
@@ -426,20 +423,32 @@ def build_overview():
     ]
     for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats):
         thresholds = None
-        if panel_id in (2, 3):
+        if panel_id == 2:
             thresholds = {
                 "mode": "absolute",
                 "steps": [
                     {"color": "red", "value": None},
-                    {"color": "green", "value": ok_value},
+                    {"color": "orange", "value": WORKER_TOTAL - 2},
+                    {"color": "yellow", "value": WORKER_TOTAL - 1},
+                    {"color": "green", "value": WORKER_TOTAL},
                 ],
             }
-        elif panel_id >= 4:
+        elif panel_id == 3:
+            thresholds = {
+                "mode": "absolute",
+                "steps": [
+                    {"color": "red", "value": None},
+                    {"color": "green", "value": CONTROL_TOTAL},
+                ],
+            }
+        elif panel_id in (4, 5, 6):
             thresholds = {
                 "mode": "absolute",
                 "steps": [
                     {"color": "green", "value": None},
-                    {"color": "red", "value": 1},
+                    {"color": "yellow", "value": 1},
+                    {"color": "orange", "value": 2},
+                    {"color": "red", "value": 3},
                 ],
             }
         panels.append(
@@ -470,7 +479,7 @@ def build_overview():
                 unit=unit,
                 thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
                 text_mode="value_and_name",
-                display_name="{{node}}",
+                legend="{{node}}",
                 links=link_to("atlas-nodes"),
             )
         )
@@ -1006,7 +1015,7 @@ def build_network_dashboard():
             f"topk(1, {TRAEFIK_ROUTER_EXPR})",
             {"h": 4, "w": 8, "x": 16, "y": 0},
             unit="req/s",
-            display_name="{{router}}",
+            legend="{{router}}",
         )
     )
     panels.append(
diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json
index 369024f..e412045 100644
--- a/services/monitoring/dashboards/atlas-network.json
+++ b/services/monitoring/dashboards/atlas-network.json
@@ -141,7 +141,8 @@
       "targets": [
         {
           "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
-          "refId": "A"
+          "refId": "A",
+          "legendFormat": "{{router}}"
         }
       ],
       "fieldConfig": {
@@ -166,8 +167,7 @@
           "unit": "req/s",
           "custom": {
             "displayMode": "auto"
-          },
-          "displayName": "{{router}}"
+          }
         },
         "overrides": []
       },
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index ec7a848..ec137f1 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -70,7 +70,7 @@
     {
       "id": 2,
       "type": "stat",
-      "title": "Ready nodes",
+      "title": "Ready workers",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -100,6 +100,14 @@
                 "color": "red",
                 "value": null
               },
+              {
+                "color": "orange",
+                "value": 16
+              },
+              {
+                "color": "yellow",
+                "value": 17
+              },
               {
                 "color": "green",
                 "value": 18
@@ -223,8 +231,16 @@
                 "value": null
               },
               {
-                "color": "red",
+                "color": "yellow",
                 "value": 1
+              },
+              {
+                "color": "orange",
+                "value": 2
+              },
+              {
+                "color": "red",
+                "value": 3
               }
             ]
           },
@@ -290,8 +306,16 @@
                 "value": null
               },
               {
-                "color": "red",
+                "color": "yellow",
                 "value": 1
+              },
+              {
+                "color": "orange",
+                "value": 2
+              },
+              {
+                "color": "red",
+                "value": 3
               }
             ]
           },
@@ -357,8 +381,16 @@
                 "value": null
               },
               {
-                "color": "red",
+                "color": "yellow",
                 "value": 1
+              },
+              {
+                "color": "orange",
+                "value": 2
+              },
+              {
+                "color": "red",
+                "value": 3
               }
             ]
           },
@@ -407,7 +439,8 @@
       "targets": [
         {
           "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
-          "refId": "A"
+          "refId": "A",
+          "legendFormat": "{{node}}"
         }
       ],
       "fieldConfig": {
@@ -436,8 +469,7 @@
           "unit": "percent",
           "custom": {
             "displayMode": "auto"
-          },
-          "displayName": "{{node}}"
+          }
         },
         "overrides": []
       },
@@ -479,7 +511,8 @@
       "targets": [
         {
           "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
-          "refId": "A"
+          "refId": "A",
+          "legendFormat": "{{node}}"
         }
       ],
       "fieldConfig": {
@@ -508,8 +541,7 @@
           "unit": "percent",
           "custom": {
             "displayMode": "auto"
-          },
-          "displayName": "{{node}}"
+          }
         },
         "overrides": []
       },
@@ -551,7 +583,8 @@
       "targets": [
         {
           "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
-          "refId": "A"
+          "refId": "A",
+          "legendFormat": "{{node}}"
         }
       ],
       "fieldConfig": {
@@ -576,8 +609,7 @@
           "unit": "Bps",
           "custom": {
             "displayMode": "auto"
-          },
-          "displayName": "{{node}}"
+          }
         },
         "overrides": []
       },
@@ -619,7 +651,8 @@
       "targets": [
         {
           "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
-          "refId": "A"
+          "refId": "A",
+          "legendFormat": "{{node}}"
         }
       ],
       "fieldConfig": {
@@ -644,8 +677,7 @@
           "unit": "Bps",
           "custom": {
             "displayMode": "auto"
-          },
-          "displayName": "{{node}}"
+          }
         },
         "overrides": []
       },
diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml
index 07c8b7a..6963e89 100644
--- a/services/monitoring/grafana-dashboard-network.yaml
+++ b/services/monitoring/grafana-dashboard-network.yaml
@@ -150,7 +150,8 @@ data:
           "targets": [
             {
               "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
-              "refId": "A"
+              "refId": "A",
+              "legendFormat": "{{router}}"
             }
           ],
           "fieldConfig": {
@@ -175,8 +176,7 @@ data:
               "unit": "req/s",
               "custom": {
                 "displayMode": "auto"
-              },
-              "displayName": "{{router}}"
+              }
             },
             "overrides": []
           },
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index bb3bb11..12555ee 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -79,7 +79,7 @@ data:
         {
           "id": 2,
           "type": "stat",
-          "title": "Ready nodes",
+          "title": "Ready workers",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -109,6 +109,14 @@ data:
                     "color": "red",
                     "value": null
                   },
+                  {
+                    "color": "orange",
+                    "value": 16
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 17
+                  },
                   {
                     "color": "green",
                     "value": 18
@@ -232,8 +240,16 @@ data:
                     "value": null
                   },
                   {
-                    "color": "red",
+                    "color": "yellow",
                     "value": 1
+                  },
+                  {
+                    "color": "orange",
+                    "value": 2
+                  },
+                  {
+                    "color": "red",
+                    "value": 3
                   }
                 ]
               },
@@ -299,8 +315,16 @@ data:
                     "value": null
                   },
                   {
-                    "color": "red",
+                    "color": "yellow",
                     "value": 1
+                  },
+                  {
+                    "color": "orange",
+                    "value": 2
+                  },
+                  {
+                    "color": "red",
+                    "value": 3
                   }
                 ]
               },
@@ -366,8 +390,16 @@ data:
                     "value": null
                   },
                   {
-                    "color": "red",
+                    "color": "yellow",
                     "value": 1
+                  },
+                  {
+                    "color": "orange",
+                    "value": 2
+                  },
+                  {
+                    "color": "red",
+                    "value": 3
                   }
                 ]
               },
@@ -416,7 +448,8 @@ data:
           "targets": [
             {
               "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
-              "refId": "A"
+              "refId": "A",
+              "legendFormat": "{{node}}"
             }
           ],
           "fieldConfig": {
@@ -445,8 +478,7 @@ data:
               "unit": "percent",
               "custom": {
                 "displayMode": "auto"
-              },
-              "displayName": "{{node}}"
+              }
             },
             "overrides": []
           },
@@ -488,7 +520,8 @@ data:
           "targets": [
             {
               "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
-              "refId": "A"
+              "refId": "A",
+              "legendFormat": "{{node}}"
             }
           ],
           "fieldConfig": {
@@ -517,8 +550,7 @@ data:
               "unit": "percent",
               "custom": {
                 "displayMode": "auto"
-              },
-              "displayName": "{{node}}"
+              }
             },
             "overrides": []
           },
@@ -560,7 +592,8 @@ data:
           "targets": [
             {
               "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
-              "refId": "A"
+              "refId": "A",
+              "legendFormat": "{{node}}"
             }
           ],
           "fieldConfig": {
@@ -585,8 +618,7 @@ data:
               "unit": "Bps",
               "custom": {
                 "displayMode": "auto"
-              },
-              "displayName": "{{node}}"
+              }
             },
             "overrides": []
           },
@@ -628,7 +660,8 @@ data:
           "targets": [
             {
               "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
-              "refId": "A"
+              "refId": "A",
+              "legendFormat": "{{node}}"
             }
           ],
           "fieldConfig": {
@@ -653,8 +686,7 @@ data:
               "unit": "Bps",
               "custom": {
                 "displayMode": "auto"
-              },
-              "displayName": "{{node}}"
+              }
             },
             "overrides": []
           },

From a1e731e9299a5ca1908a4953d5d1729e9e6d74e9 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 17 Nov 2025 19:38:40 -0300
Subject: [PATCH 17/71] monitoring: fix hottest stats and titan-db scrape

---
 scripts/render_dashboards.py                  | 17 ++++++------
 .../monitoring/dashboards/atlas-network.json  |  3 ++-
 .../monitoring/dashboards/atlas-overview.json | 26 +++++++++++--------
 .../monitoring/grafana-dashboard-network.yaml |  3 ++-
 .../grafana-dashboard-overview.yaml           | 26 +++++++++++--------
 services/monitoring/helmrelease.yaml          | 10 +++++++
 6 files changed, 52 insertions(+), 33 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index b88d5a4..d726015 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -221,6 +221,7 @@ def stat_panel(
     thresholds=None,
     text_mode="value",
     legend=None,
+    instant=False,
     value_suffix=None,
     links=None,
 ):
@@ -259,6 +260,8 @@ def stat_panel(
     }
     if legend:
         panel["targets"][0]["legendFormat"] = legend
+    if instant:
+        panel["targets"][0]["instant"] = True
     if links:
         panel["links"] = links
     return panel
@@ -339,14 +342,8 @@ def pie_panel(panel_id, title, expr, grid):
         "title": title,
         "datasource": PROM_DS,
         "gridPos": grid,
-        "targets": [{"expr": expr, "refId": "A"}],
-        "fieldConfig": {
-            "defaults": {
-                "unit": "percent",
-                "displayName": "{{namespace}}",
-            },
-            "overrides": [],
-        },
+        "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}],
+        "fieldConfig": {"defaults": {"unit": "percent"}, "overrides": []},
         "options": {
             "legend": {"displayMode": "list", "placement": "right"},
             "pieType": "pie",
@@ -382,7 +379,7 @@ def build_overview():
         (1, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})', None, None, None),
         (
             2,
-            "Ready workers",
+            "Workers ready",
             f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
             WORKER_SUFFIX,
             WORKER_TOTAL,
@@ -480,6 +477,7 @@ def build_overview():
                 thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
                 text_mode="value_and_name",
                 legend="{{node}}",
+                instant=True,
                 links=link_to("atlas-nodes"),
             )
         )
@@ -1016,6 +1014,7 @@ def build_network_dashboard():
             {"h": 4, "w": 8, "x": 16, "y": 0},
             unit="req/s",
             legend="{{router}}",
+            instant=True,
         )
     )
     panels.append(
diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json
index e412045..abd9da7 100644
--- a/services/monitoring/dashboards/atlas-network.json
+++ b/services/monitoring/dashboards/atlas-network.json
@@ -142,7 +142,8 @@
         {
           "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
           "refId": "A",
-          "legendFormat": "{{router}}"
+          "legendFormat": "{{router}}",
+          "instant": true
         }
       ],
       "fieldConfig": {
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index ec137f1..1442cf5 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -70,7 +70,7 @@
     {
       "id": 2,
       "type": "stat",
-      "title": "Ready workers",
+      "title": "Workers ready",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -440,7 +440,8 @@
         {
           "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
           "refId": "A",
-          "legendFormat": "{{node}}"
+          "legendFormat": "{{node}}",
+          "instant": true
         }
       ],
       "fieldConfig": {
@@ -512,7 +513,8 @@
         {
           "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
           "refId": "A",
-          "legendFormat": "{{node}}"
+          "legendFormat": "{{node}}",
+          "instant": true
         }
       ],
       "fieldConfig": {
@@ -584,7 +586,8 @@
         {
           "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
           "refId": "A",
-          "legendFormat": "{{node}}"
+          "legendFormat": "{{node}}",
+          "instant": true
         }
       ],
       "fieldConfig": {
@@ -652,7 +655,8 @@
         {
           "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
           "refId": "A",
-          "legendFormat": "{{node}}"
+          "legendFormat": "{{node}}",
+          "instant": true
         }
       ],
       "fieldConfig": {
@@ -719,13 +723,13 @@
       "targets": [
         {
           "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\"}[5m])) by (namespace))",
-          "refId": "A"
+          "refId": "A",
+          "legendFormat": "{{namespace}}"
         }
       ],
       "fieldConfig": {
         "defaults": {
-          "unit": "percent",
-          "displayName": "{{namespace}}"
+          "unit": "percent"
         },
         "overrides": []
       },
@@ -761,13 +765,13 @@
       "targets": [
         {
           "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\"}) by (namespace))",
-          "refId": "A"
+          "refId": "A",
+          "legendFormat": "{{namespace}}"
         }
       ],
       "fieldConfig": {
         "defaults": {
-          "unit": "percent",
-          "displayName": "{{namespace}}"
+          "unit": "percent"
         },
         "overrides": []
       },
diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml
index 6963e89..8f614ae 100644
--- a/services/monitoring/grafana-dashboard-network.yaml
+++ b/services/monitoring/grafana-dashboard-network.yaml
@@ -151,7 +151,8 @@ data:
             {
               "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
               "refId": "A",
-              "legendFormat": "{{router}}"
+              "legendFormat": "{{router}}",
+              "instant": true
             }
           ],
           "fieldConfig": {
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 12555ee..ac95eae 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -79,7 +79,7 @@ data:
         {
           "id": 2,
           "type": "stat",
-          "title": "Ready workers",
+          "title": "Workers ready",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -449,7 +449,8 @@ data:
             {
               "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
               "refId": "A",
-              "legendFormat": "{{node}}"
+              "legendFormat": "{{node}}",
+              "instant": true
             }
           ],
           "fieldConfig": {
@@ -521,7 +522,8 @@ data:
             {
               "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
               "refId": "A",
-              "legendFormat": "{{node}}"
+              "legendFormat": "{{node}}",
+              "instant": true
             }
           ],
           "fieldConfig": {
@@ -593,7 +595,8 @@ data:
             {
               "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
               "refId": "A",
-              "legendFormat": "{{node}}"
+              "legendFormat": "{{node}}",
+              "instant": true
             }
           ],
           "fieldConfig": {
@@ -661,7 +664,8 @@ data:
             {
               "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
               "refId": "A",
-              "legendFormat": "{{node}}"
+              "legendFormat": "{{node}}",
+              "instant": true
             }
           ],
           "fieldConfig": {
@@ -728,13 +732,13 @@ data:
           "targets": [
             {
               "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\"}[5m])) by (namespace))",
-              "refId": "A"
+              "refId": "A",
+              "legendFormat": "{{namespace}}"
             }
           ],
           "fieldConfig": {
             "defaults": {
-              "unit": "percent",
-              "displayName": "{{namespace}}"
+              "unit": "percent"
             },
             "overrides": []
           },
@@ -770,13 +774,13 @@ data:
           "targets": [
             {
               "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\"}) by (namespace))",
-              "refId": "A"
+              "refId": "A",
+              "legendFormat": "{{namespace}}"
             }
           ],
           "fieldConfig": {
             "defaults": {
-              "unit": "percent",
-              "displayName": "{{namespace}}"
+              "unit": "percent"
             },
             "overrides": []
           },
diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index 58035b6..5a8f1ba 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -209,6 +209,16 @@ spec:
                 - action: keep
                   source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app_kubernetes_io_part_of]
                   regex: flux-system;flux
+            - job_name: "titan-db"
+              static_configs:
+                - targets: ["titan-db:9100"]
+              relabel_configs:
+                - source_labels: [__address__]
+                  target_label: instance
+              metric_relabel_configs:
+                - source_labels: [instance]
+                  target_label: node
+                  replacement: titan-db
 
 ---
 

From 41e8a6a5829fa54dd54f4f7e0b36020f0a8cc371 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 17 Nov 2025 19:49:50 -0300
Subject: [PATCH 18/71] monitoring: reorder overview stats

---
 scripts/render_dashboards.py                  |  20 +--
 .../monitoring/dashboards/atlas-overview.json | 146 +++++++++---------
 .../grafana-dashboard-overview.yaml           | 146 +++++++++---------
 3 files changed, 156 insertions(+), 156 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index d726015..97d64cd 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -376,9 +376,8 @@ def build_overview():
     panels = []
 
     row1_stats = [
-        (1, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})', None, None, None),
         (
-            2,
+            1,
             "Workers ready",
             f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
             WORKER_SUFFIX,
@@ -386,7 +385,7 @@ def build_overview():
             None,
         ),
         (
-            3,
+            2,
             "Control plane ready",
             f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
             CONTROL_SUFFIX,
@@ -394,7 +393,7 @@ def build_overview():
             None,
         ),
         (
-            4,
+            3,
             "Control plane workloads",
             f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
             None,
@@ -402,7 +401,7 @@ def build_overview():
             link_to("atlas-pods"),
         ),
         (
-            5,
+            4,
             "Problem pods",
             PROBLEM_PODS_EXPR,
             None,
@@ -410,17 +409,18 @@ def build_overview():
             link_to("atlas-pods"),
         ),
         (
-            6,
+            5,
             "Stuck terminating",
             STUCK_TERMINATING_EXPR,
             None,
             1,
             link_to("atlas-pods"),
         ),
+        (6, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})', None, None, None),
     ]
     for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats):
         thresholds = None
-        if panel_id == 2:
+        if panel_id == 1:
             thresholds = {
                 "mode": "absolute",
                 "steps": [
@@ -430,7 +430,7 @@ def build_overview():
                     {"color": "green", "value": WORKER_TOTAL},
                 ],
             }
-        elif panel_id == 3:
+        elif panel_id == 2:
             thresholds = {
                 "mode": "absolute",
                 "steps": [
@@ -438,7 +438,7 @@ def build_overview():
                     {"color": "green", "value": CONTROL_TOTAL},
                 ],
             }
-        elif panel_id in (4, 5, 6):
+        elif panel_id in (3, 4, 5):
             thresholds = {
                 "mode": "absolute",
                 "steps": [
@@ -475,7 +475,7 @@ def build_overview():
                 {"h": 5, "w": 6, "x": 6 * idx, "y": 5},
                 unit=unit,
                 thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
-                text_mode="value_and_name",
+                text_mode="name_and_value",
                 legend="{{node}}",
                 instant=True,
                 links=link_to("atlas-nodes"),
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 1442cf5..d51d203 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -10,66 +10,6 @@
     {
       "id": 1,
       "type": "stat",
-      "title": "Running pods",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 5,
-        "w": 4,
-        "x": 0,
-        "y": 0
-      },
-      "targets": [
-        {
-          "expr": "sum(kube_pod_status_phase{phase=\"Running\"})",
-          "refId": "A"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "rgba(115, 115, 115, 1)",
-                "value": null
-              },
-              {
-                "color": "green",
-                "value": 1
-              }
-            ]
-          },
-          "unit": "none",
-          "custom": {
-            "displayMode": "auto"
-          }
-        },
-        "overrides": []
-      },
-      "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "justifyMode": "center",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "textMode": "value"
-      }
-    },
-    {
-      "id": 2,
-      "type": "stat",
       "title": "Workers ready",
       "datasource": {
         "type": "prometheus",
@@ -78,7 +18,7 @@
       "gridPos": {
         "h": 5,
         "w": 4,
-        "x": 4,
+        "x": 0,
         "y": 0
       },
       "targets": [
@@ -137,7 +77,7 @@
       }
     },
     {
-      "id": 3,
+      "id": 2,
       "type": "stat",
       "title": "Control plane ready",
       "datasource": {
@@ -147,7 +87,7 @@
       "gridPos": {
         "h": 5,
         "w": 4,
-        "x": 8,
+        "x": 4,
         "y": 0
       },
       "targets": [
@@ -198,7 +138,7 @@
       }
     },
     {
-      "id": 4,
+      "id": 3,
       "type": "stat",
       "title": "Control plane workloads",
       "datasource": {
@@ -208,7 +148,7 @@
       "gridPos": {
         "h": 5,
         "w": 4,
-        "x": 12,
+        "x": 8,
         "y": 0
       },
       "targets": [
@@ -273,7 +213,7 @@
       ]
     },
     {
-      "id": 5,
+      "id": 4,
       "type": "stat",
       "title": "Problem pods",
       "datasource": {
@@ -283,7 +223,7 @@
       "gridPos": {
         "h": 5,
         "w": 4,
-        "x": 16,
+        "x": 12,
         "y": 0
       },
       "targets": [
@@ -348,7 +288,7 @@
       ]
     },
     {
-      "id": 6,
+      "id": 5,
       "type": "stat",
       "title": "Stuck terminating",
       "datasource": {
@@ -358,7 +298,7 @@
       "gridPos": {
         "h": 5,
         "w": 4,
-        "x": 20,
+        "x": 16,
         "y": 0
       },
       "targets": [
@@ -422,6 +362,66 @@
         }
       ]
     },
+    {
+      "id": 6,
+      "type": "stat",
+      "title": "Running pods",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 4,
+        "x": 20,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "sum(kube_pod_status_phase{phase=\"Running\"})",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto"
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
     {
       "id": 7,
       "type": "stat",
@@ -485,7 +485,7 @@
           "fields": "",
           "values": false
         },
-        "textMode": "value_and_name"
+        "textMode": "name_and_value"
       },
       "links": [
         {
@@ -558,7 +558,7 @@
           "fields": "",
           "values": false
         },
-        "textMode": "value_and_name"
+        "textMode": "name_and_value"
       },
       "links": [
         {
@@ -627,7 +627,7 @@
           "fields": "",
           "values": false
         },
-        "textMode": "value_and_name"
+        "textMode": "name_and_value"
       },
       "links": [
         {
@@ -696,7 +696,7 @@
           "fields": "",
           "values": false
         },
-        "textMode": "value_and_name"
+        "textMode": "name_and_value"
       },
       "links": [
         {
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index ac95eae..8d03cf6 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -19,66 +19,6 @@ data:
         {
           "id": 1,
           "type": "stat",
-          "title": "Running pods",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 5,
-            "w": 4,
-            "x": 0,
-            "y": 0
-          },
-          "targets": [
-            {
-              "expr": "sum(kube_pod_status_phase{phase=\"Running\"})",
-              "refId": "A"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "color": {
-                "mode": "palette-classic"
-              },
-              "mappings": [],
-              "thresholds": {
-                "mode": "absolute",
-                "steps": [
-                  {
-                    "color": "rgba(115, 115, 115, 1)",
-                    "value": null
-                  },
-                  {
-                    "color": "green",
-                    "value": 1
-                  }
-                ]
-              },
-              "unit": "none",
-              "custom": {
-                "displayMode": "auto"
-              }
-            },
-            "overrides": []
-          },
-          "options": {
-            "colorMode": "value",
-            "graphMode": "area",
-            "justifyMode": "center",
-            "reduceOptions": {
-              "calcs": [
-                "lastNotNull"
-              ],
-              "fields": "",
-              "values": false
-            },
-            "textMode": "value"
-          }
-        },
-        {
-          "id": 2,
-          "type": "stat",
           "title": "Workers ready",
           "datasource": {
             "type": "prometheus",
@@ -87,7 +27,7 @@ data:
           "gridPos": {
             "h": 5,
             "w": 4,
-            "x": 4,
+            "x": 0,
             "y": 0
           },
           "targets": [
@@ -146,7 +86,7 @@ data:
           }
         },
         {
-          "id": 3,
+          "id": 2,
           "type": "stat",
           "title": "Control plane ready",
           "datasource": {
@@ -156,7 +96,7 @@ data:
           "gridPos": {
             "h": 5,
             "w": 4,
-            "x": 8,
+            "x": 4,
             "y": 0
           },
           "targets": [
@@ -207,7 +147,7 @@ data:
           }
         },
         {
-          "id": 4,
+          "id": 3,
           "type": "stat",
           "title": "Control plane workloads",
           "datasource": {
@@ -217,7 +157,7 @@ data:
           "gridPos": {
             "h": 5,
             "w": 4,
-            "x": 12,
+            "x": 8,
             "y": 0
           },
           "targets": [
@@ -282,7 +222,7 @@ data:
           ]
         },
         {
-          "id": 5,
+          "id": 4,
           "type": "stat",
           "title": "Problem pods",
           "datasource": {
@@ -292,7 +232,7 @@ data:
           "gridPos": {
             "h": 5,
             "w": 4,
-            "x": 16,
+            "x": 12,
             "y": 0
           },
           "targets": [
@@ -357,7 +297,7 @@ data:
           ]
         },
         {
-          "id": 6,
+          "id": 5,
           "type": "stat",
           "title": "Stuck terminating",
           "datasource": {
@@ -367,7 +307,7 @@ data:
           "gridPos": {
             "h": 5,
             "w": 4,
-            "x": 20,
+            "x": 16,
             "y": 0
           },
           "targets": [
@@ -431,6 +371,66 @@ data:
             }
           ]
         },
+        {
+          "id": 6,
+          "type": "stat",
+          "title": "Running pods",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 5,
+            "w": 4,
+            "x": 20,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "sum(kube_pod_status_phase{phase=\"Running\"})",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto"
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
         {
           "id": 7,
           "type": "stat",
@@ -494,7 +494,7 @@ data:
               "fields": "",
               "values": false
             },
-            "textMode": "value_and_name"
+            "textMode": "name_and_value"
           },
           "links": [
             {
@@ -567,7 +567,7 @@ data:
               "fields": "",
               "values": false
             },
-            "textMode": "value_and_name"
+            "textMode": "name_and_value"
           },
           "links": [
             {
@@ -636,7 +636,7 @@ data:
               "fields": "",
               "values": false
             },
-            "textMode": "value_and_name"
+            "textMode": "name_and_value"
           },
           "links": [
             {
@@ -705,7 +705,7 @@ data:
               "fields": "",
               "values": false
             },
-            "textMode": "value_and_name"
+            "textMode": "name_and_value"
           },
           "links": [
             {

From bcaa0a33279e4b1223eac2e560eb925cdfdb8197 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 17 Nov 2025 19:53:39 -0300
Subject: [PATCH 19/71] monitoring: show hottest node names

---
 scripts/render_dashboards.py                        |  4 ++++
 services/monitoring/dashboards/atlas-overview.json  | 12 ++++++++----
 services/monitoring/grafana-dashboard-overview.yaml | 12 ++++++++----
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index 97d64cd..a9c319a 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -224,6 +224,7 @@ def stat_panel(
     instant=False,
     value_suffix=None,
     links=None,
+    display_name=None,
 ):
     """Return a Grafana stat panel definition."""
     defaults = {
@@ -242,6 +243,8 @@ def stat_panel(
     }
     if value_suffix:
         defaults["custom"]["valueSuffix"] = value_suffix
+    if display_name:
+        defaults["displayName"] = display_name
     panel = {
         "id": panel_id,
         "type": "stat",
@@ -478,6 +481,7 @@ def build_overview():
                 text_mode="name_and_value",
                 legend="{{node}}",
                 instant=True,
+                display_name="{{__field.labels.node}}\\n",
                 links=link_to("atlas-nodes"),
             )
         )
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index d51d203..f0cceaf 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -470,7 +470,8 @@
           "unit": "percent",
           "custom": {
             "displayMode": "auto"
-          }
+          },
+          "displayName": "{{__field.labels.node}}\\n"
         },
         "overrides": []
       },
@@ -543,7 +544,8 @@
           "unit": "percent",
           "custom": {
             "displayMode": "auto"
-          }
+          },
+          "displayName": "{{__field.labels.node}}\\n"
         },
         "overrides": []
       },
@@ -612,7 +614,8 @@
           "unit": "Bps",
           "custom": {
             "displayMode": "auto"
-          }
+          },
+          "displayName": "{{__field.labels.node}}\\n"
         },
         "overrides": []
       },
@@ -681,7 +684,8 @@
           "unit": "Bps",
           "custom": {
             "displayMode": "auto"
-          }
+          },
+          "displayName": "{{__field.labels.node}}\\n"
         },
         "overrides": []
       },
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 8d03cf6..1839d8f 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -479,7 +479,8 @@ data:
               "unit": "percent",
               "custom": {
                 "displayMode": "auto"
-              }
+              },
+              "displayName": "{{__field.labels.node}}\\n"
             },
             "overrides": []
           },
@@ -552,7 +553,8 @@ data:
               "unit": "percent",
               "custom": {
                 "displayMode": "auto"
-              }
+              },
+              "displayName": "{{__field.labels.node}}\\n"
             },
             "overrides": []
           },
@@ -621,7 +623,8 @@ data:
               "unit": "Bps",
               "custom": {
                 "displayMode": "auto"
-              }
+              },
+              "displayName": "{{__field.labels.node}}\\n"
             },
             "overrides": []
           },
@@ -690,7 +693,8 @@ data:
               "unit": "Bps",
               "custom": {
                 "displayMode": "auto"
-              }
+              },
+              "displayName": "{{__field.labels.node}}\\n"
             },
             "overrides": []
           },

From 4aece7e5cb5e0972fb7419eaaae6ee02ef64909e Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 17 Nov 2025 19:56:57 -0300
Subject: [PATCH 20/71] monitoring: fix hottest node labels

---
 scripts/render_dashboards.py                  | 20 +++++++++-------
 .../monitoring/dashboards/atlas-overview.json | 24 +++++++------------
 .../grafana-dashboard-overview.yaml           | 24 +++++++------------
 3 files changed, 27 insertions(+), 41 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index a9c319a..acc1c38 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -144,6 +144,12 @@ def astreae_free_expr(mount):
     return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})"
 
 
+def hottest_stat_expr(inner_expr):
+    return (
+        f'label_replace(topk(1, {inner_expr}), "__name__", "$1", "node", "(.*)")'
+    )
+
+
 PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
 CRASHLOOP_EXPR = (
     'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
@@ -224,7 +230,6 @@ def stat_panel(
     instant=False,
     value_suffix=None,
     links=None,
-    display_name=None,
 ):
     """Return a Grafana stat panel definition."""
     defaults = {
@@ -243,8 +248,6 @@ def stat_panel(
     }
     if value_suffix:
         defaults["custom"]["valueSuffix"] = value_suffix
-    if display_name:
-        defaults["displayName"] = display_name
     panel = {
         "id": panel_id,
         "type": "stat",
@@ -464,10 +467,10 @@ def build_overview():
         )
 
     hottest = [
-        (7, "Hottest node: CPU", f"topk(1, {node_cpu_expr()})", "percent"),
-        (8, "Hottest node: RAM", f"topk(1, {node_mem_expr()})", "percent"),
-        (9, "Hottest node: NET", NET_TOP_EXPR, "Bps"),
-        (10, "Hottest node: I/O", IO_TOP_EXPR, "Bps"),
+        (7, "Hottest node: CPU", hottest_stat_expr(node_cpu_expr()), "percent"),
+        (8, "Hottest node: RAM", hottest_stat_expr(node_mem_expr()), "percent"),
+        (9, "Hottest node: NET", hottest_stat_expr(NET_SERIES_EXPR), "Bps"),
+        (10, "Hottest node: I/O", hottest_stat_expr(IO_SERIES_EXPR), "Bps"),
     ]
     for idx, (panel_id, title, expr, unit) in enumerate(hottest):
         panels.append(
@@ -479,9 +482,8 @@ def build_overview():
                 unit=unit,
                 thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
                 text_mode="name_and_value",
-                legend="{{node}}",
+                legend=None,
                 instant=True,
-                display_name="{{__field.labels.node}}\\n",
                 links=link_to("atlas-nodes"),
             )
         )
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index f0cceaf..ea4e40e 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -438,9 +438,8 @@
       },
       "targets": [
         {
-          "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+          "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
           "refId": "A",
-          "legendFormat": "{{node}}",
           "instant": true
         }
       ],
@@ -470,8 +469,7 @@
           "unit": "percent",
           "custom": {
             "displayMode": "auto"
-          },
-          "displayName": "{{__field.labels.node}}\\n"
+          }
         },
         "overrides": []
       },
@@ -512,9 +510,8 @@
       },
       "targets": [
         {
-          "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+          "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
           "refId": "A",
-          "legendFormat": "{{node}}",
           "instant": true
         }
       ],
@@ -544,8 +541,7 @@
           "unit": "percent",
           "custom": {
             "displayMode": "auto"
-          },
-          "displayName": "{{__field.labels.node}}\\n"
+          }
         },
         "overrides": []
       },
@@ -586,9 +582,8 @@
       },
       "targets": [
         {
-          "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
+          "expr": "label_replace(topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
           "refId": "A",
-          "legendFormat": "{{node}}",
           "instant": true
         }
       ],
@@ -614,8 +609,7 @@
           "unit": "Bps",
           "custom": {
             "displayMode": "auto"
-          },
-          "displayName": "{{__field.labels.node}}\\n"
+          }
         },
         "overrides": []
       },
@@ -656,9 +650,8 @@
       },
       "targets": [
         {
-          "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
+          "expr": "label_replace(topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
           "refId": "A",
-          "legendFormat": "{{node}}",
           "instant": true
         }
       ],
@@ -684,8 +677,7 @@
           "unit": "Bps",
           "custom": {
             "displayMode": "auto"
-          },
-          "displayName": "{{__field.labels.node}}\\n"
+          }
         },
         "overrides": []
       },
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 1839d8f..1df2956 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -447,9 +447,8 @@ data:
           },
           "targets": [
             {
-              "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+              "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
               "refId": "A",
-              "legendFormat": "{{node}}",
               "instant": true
             }
           ],
@@ -479,8 +478,7 @@ data:
               "unit": "percent",
               "custom": {
                 "displayMode": "auto"
-              },
-              "displayName": "{{__field.labels.node}}\\n"
+              }
             },
             "overrides": []
           },
@@ -521,9 +519,8 @@ data:
           },
           "targets": [
             {
-              "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+              "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
               "refId": "A",
-              "legendFormat": "{{node}}",
               "instant": true
             }
           ],
@@ -553,8 +550,7 @@ data:
               "unit": "percent",
               "custom": {
                 "displayMode": "auto"
-              },
-              "displayName": "{{__field.labels.node}}\\n"
+              }
             },
             "overrides": []
           },
@@ -595,9 +591,8 @@ data:
           },
           "targets": [
             {
-              "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
+              "expr": "label_replace(topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
               "refId": "A",
-              "legendFormat": "{{node}}",
               "instant": true
             }
           ],
@@ -623,8 +618,7 @@ data:
               "unit": "Bps",
               "custom": {
                 "displayMode": "auto"
-              },
-              "displayName": "{{__field.labels.node}}\\n"
+              }
             },
             "overrides": []
           },
@@ -665,9 +659,8 @@ data:
           },
           "targets": [
             {
-              "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
+              "expr": "label_replace(topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
               "refId": "A",
-              "legendFormat": "{{node}}",
               "instant": true
             }
           ],
@@ -693,8 +686,7 @@ data:
               "unit": "Bps",
               "custom": {
                 "displayMode": "auto"
-              },
-              "displayName": "{{__field.labels.node}}\\n"
+              }
             },
             "overrides": []
           },

From b28e7501b72d52ea5101f11c80e29fcc6946be14 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 17 Nov 2025 20:00:40 -0300
Subject: [PATCH 21/71] monitoring: show hottest node labels

---
 scripts/render_dashboards.py                   | 18 ++++++------------
 .../monitoring/dashboards/atlas-overview.json  | 16 ++++++++++------
 .../monitoring/grafana-dashboard-overview.yaml | 16 ++++++++++------
 3 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index acc1c38..e215ca8 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -144,12 +144,6 @@ def astreae_free_expr(mount):
     return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})"
 
 
-def hottest_stat_expr(inner_expr):
-    return (
-        f'label_replace(topk(1, {inner_expr}), "__name__", "$1", "node", "(.*)")'
-    )
-
-
 PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
 CRASHLOOP_EXPR = (
     'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
@@ -467,22 +461,22 @@ def build_overview():
         )
 
     hottest = [
-        (7, "Hottest node: CPU", hottest_stat_expr(node_cpu_expr()), "percent"),
-        (8, "Hottest node: RAM", hottest_stat_expr(node_mem_expr()), "percent"),
-        (9, "Hottest node: NET", hottest_stat_expr(NET_SERIES_EXPR), "Bps"),
-        (10, "Hottest node: I/O", hottest_stat_expr(IO_SERIES_EXPR), "Bps"),
+        (7, "Hottest node: CPU", node_cpu_expr(), "percent"),
+        (8, "Hottest node: RAM", node_mem_expr(), "percent"),
+        (9, "Hottest node: NET (rx+tx)", NET_SERIES_EXPR, "Bps"),
+        (10, "Hottest node: I/O (r+w)", IO_SERIES_EXPR, "Bps"),
     ]
     for idx, (panel_id, title, expr, unit) in enumerate(hottest):
         panels.append(
             stat_panel(
                 panel_id,
                 title,
-                expr,
+                f"topk(1, {expr})",
                 {"h": 5, "w": 6, "x": 6 * idx, "y": 5},
                 unit=unit,
                 thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
                 text_mode="name_and_value",
-                legend=None,
+                legend="{{node}}",
                 instant=True,
                 links=link_to("atlas-nodes"),
             )
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index ea4e40e..468ca8a 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -438,8 +438,9 @@
       },
       "targets": [
         {
-          "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
+          "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
           "refId": "A",
+          "legendFormat": "{{node}}",
           "instant": true
         }
       ],
@@ -510,8 +511,9 @@
       },
       "targets": [
         {
-          "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
+          "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
           "refId": "A",
+          "legendFormat": "{{node}}",
           "instant": true
         }
       ],
@@ -569,7 +571,7 @@
     {
       "id": 9,
       "type": "stat",
-      "title": "Hottest node: NET",
+      "title": "Hottest node: NET (rx+tx)",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -582,8 +584,9 @@
       },
       "targets": [
         {
-          "expr": "label_replace(topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
+          "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
           "refId": "A",
+          "legendFormat": "{{node}}",
           "instant": true
         }
       ],
@@ -637,7 +640,7 @@
     {
       "id": 10,
       "type": "stat",
-      "title": "Hottest node: I/O",
+      "title": "Hottest node: I/O (r+w)",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -650,8 +653,9 @@
       },
       "targets": [
         {
-          "expr": "label_replace(topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
+          "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
           "refId": "A",
+          "legendFormat": "{{node}}",
           "instant": true
         }
       ],
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 1df2956..dbcc916 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -447,8 +447,9 @@ data:
           },
           "targets": [
             {
-              "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
+              "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
               "refId": "A",
+              "legendFormat": "{{node}}",
               "instant": true
             }
           ],
@@ -519,8 +520,9 @@ data:
           },
           "targets": [
             {
-              "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
+              "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
               "refId": "A",
+              "legendFormat": "{{node}}",
               "instant": true
             }
           ],
@@ -578,7 +580,7 @@ data:
         {
           "id": 9,
           "type": "stat",
-          "title": "Hottest node: NET",
+          "title": "Hottest node: NET (rx+tx)",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -591,8 +593,9 @@ data:
           },
           "targets": [
             {
-              "expr": "label_replace(topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
+              "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
               "refId": "A",
+              "legendFormat": "{{node}}",
               "instant": true
             }
           ],
@@ -646,7 +649,7 @@ data:
         {
           "id": 10,
           "type": "stat",
-          "title": "Hottest node: I/O",
+          "title": "Hottest node: I/O (r+w)",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -659,8 +662,9 @@ data:
           },
           "targets": [
             {
-              "expr": "label_replace(topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
+              "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
               "refId": "A",
+              "legendFormat": "{{node}}",
               "instant": true
             }
           ],

From a67a6a1f3a3e903dd86cff14819ad63b71ee5401 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 17 Nov 2025 20:04:50 -0300
Subject: [PATCH 22/71] monitoring: tidy hottest node labels

---
 scripts/render_dashboards.py                        | 2 +-
 services/monitoring/dashboards/atlas-overview.json  | 8 ++++----
 services/monitoring/grafana-dashboard-overview.yaml | 8 ++++----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index e215ca8..4f25ab5 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -476,7 +476,7 @@ def build_overview():
                 unit=unit,
                 thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
                 text_mode="name_and_value",
-                legend="{{node}}",
+                legend="{{node}}\\n",
                 instant=True,
                 links=link_to("atlas-nodes"),
             )
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 468ca8a..ad3a947 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -440,7 +440,7 @@
         {
           "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
           "refId": "A",
-          "legendFormat": "{{node}}",
+          "legendFormat": "{{node}}\\n",
           "instant": true
         }
       ],
@@ -513,7 +513,7 @@
         {
           "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
           "refId": "A",
-          "legendFormat": "{{node}}",
+          "legendFormat": "{{node}}\\n",
           "instant": true
         }
       ],
@@ -586,7 +586,7 @@
         {
           "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
           "refId": "A",
-          "legendFormat": "{{node}}",
+          "legendFormat": "{{node}}\\n",
           "instant": true
         }
       ],
@@ -655,7 +655,7 @@
         {
           "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
           "refId": "A",
-          "legendFormat": "{{node}}",
+          "legendFormat": "{{node}}\\n",
           "instant": true
         }
       ],
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index dbcc916..5f3062a 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -449,7 +449,7 @@ data:
             {
               "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
               "refId": "A",
-              "legendFormat": "{{node}}",
+              "legendFormat": "{{node}}\\n",
               "instant": true
             }
           ],
@@ -522,7 +522,7 @@ data:
             {
               "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
               "refId": "A",
-              "legendFormat": "{{node}}",
+              "legendFormat": "{{node}}\\n",
               "instant": true
             }
           ],
@@ -595,7 +595,7 @@ data:
             {
               "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
               "refId": "A",
-              "legendFormat": "{{node}}",
+              "legendFormat": "{{node}}\\n",
               "instant": true
             }
           ],
@@ -664,7 +664,7 @@ data:
             {
               "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
               "refId": "A",
-              "legendFormat": "{{node}}",
+              "legendFormat": "{{node}}\\n",
               "instant": true
             }
           ],

From b8998a3c6ab81493b52ebf18abc21a78ad6c01e9 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 17 Nov 2025 20:14:11 -0300
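Subject: [PATCH 23/71] monitoring: attach nodes to net/io stats

Replace the NET_SERIES_EXPR and IO_SERIES_EXPR constants with
node_net_expr() and node_io_expr(), which sum per instance and pass the
result through scoped_node_expr(). Also revert the hottest-node legend
format to "{{node}}" and remove NET_TOP_EXPR and IO_TOP_EXPR.

---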
 scripts/render_dashboards.py                  | 36 +++++++++++--------
 .../monitoring/dashboards/atlas-network.json  |  2 +-
 .../monitoring/dashboards/atlas-overview.json | 12 +++----
 .../monitoring/grafana-dashboard-network.yaml |  2 +-
 .../grafana-dashboard-overview.yaml           | 12 +++----
 5 files changed, 35 insertions(+), 29 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index 4f25ab5..37f2607 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -144,6 +144,23 @@ def astreae_free_expr(mount):
     return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})"
 
 
+def node_net_expr(scope=""):
+    base = (
+        'sum by (instance) ('
+        'rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m]) '
+        '+ rate(container_network_transmit_bytes_total{namespace!="" ,pod!=""}[5m]))'
+    )
+    return scoped_node_expr(base, scope)
+
+
+def node_io_expr(scope=""):
+    base = (
+        "sum by (instance) (rate(node_disk_read_bytes_total[5m]) "
+        "+ rate(node_disk_written_bytes_total[5m]))"
+    )
+    return scoped_node_expr(base, scope)
+
+
 PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
 CRASHLOOP_EXPR = (
     'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
@@ -185,17 +202,6 @@ NAMESPACE_RAM_EXPR = (
     'topk(10, sum(container_memory_working_set_bytes{namespace!=""'
     ',pod!=""}) by (namespace))'
 )
-NET_SERIES_EXPR = (
-    'avg by (node) ('
-    'rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m]) '
-    '+ rate(container_network_transmit_bytes_total{namespace!="" ,pod!=""}[5m]))'
-)
-NET_TOP_EXPR = f"topk(1, {NET_SERIES_EXPR})"
-IO_SERIES_EXPR = (
-    "avg by (node) (rate(node_disk_read_bytes_total[5m]) "
-    "+ rate(node_disk_written_bytes_total[5m]))"
-)
-IO_TOP_EXPR = f"topk(1, {IO_SERIES_EXPR})"
 TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
 NET_INGRESS_EXPR = (
     'sum(rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m])) '
@@ -463,8 +469,8 @@ def build_overview():
     hottest = [
         (7, "Hottest node: CPU", node_cpu_expr(), "percent"),
         (8, "Hottest node: RAM", node_mem_expr(), "percent"),
-        (9, "Hottest node: NET (rx+tx)", NET_SERIES_EXPR, "Bps"),
-        (10, "Hottest node: I/O (r+w)", IO_SERIES_EXPR, "Bps"),
+        (9, "Hottest node: NET (rx+tx)", node_net_expr(), "Bps"),
+        (10, "Hottest node: I/O (r+w)", node_io_expr(), "Bps"),
     ]
     for idx, (panel_id, title, expr, unit) in enumerate(hottest):
         panels.append(
@@ -476,7 +482,7 @@ def build_overview():
                 unit=unit,
                 thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
                 text_mode="name_and_value",
-                legend="{{node}}\\n",
+                legend="{{node}}",
                 instant=True,
                 links=link_to("atlas-nodes"),
             )
@@ -1021,7 +1027,7 @@ def build_network_dashboard():
         timeseries_panel(
             4,
             "Per-node throughput",
-            NET_SERIES_EXPR,
+            node_net_expr(),
             {"h": 8, "w": 24, "x": 0, "y": 4},
             unit="Bps",
             legend="{{node}}",
diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json
index abd9da7..42026eb 100644
--- a/services/monitoring/dashboards/atlas-network.json
+++ b/services/monitoring/dashboards/atlas-network.json
@@ -202,7 +202,7 @@
       },
       "targets": [
         {
-          "expr": "avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))",
+          "expr": "avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
           "refId": "A",
           "legendFormat": "{{node}}"
         }
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index ad3a947..be5dead 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -440,7 +440,7 @@
         {
           "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
           "refId": "A",
-          "legendFormat": "{{node}}\\n",
+          "legendFormat": "{{node}}",
           "instant": true
         }
       ],
@@ -513,7 +513,7 @@
         {
           "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
           "refId": "A",
-          "legendFormat": "{{node}}\\n",
+          "legendFormat": "{{node}}",
           "instant": true
         }
       ],
@@ -584,9 +584,9 @@
       },
       "targets": [
         {
-          "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
+          "expr": "topk(1, avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
           "refId": "A",
-          "legendFormat": "{{node}}\\n",
+          "legendFormat": "{{node}}",
           "instant": true
         }
       ],
@@ -653,9 +653,9 @@
       },
       "targets": [
         {
-          "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
+          "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
           "refId": "A",
-          "legendFormat": "{{node}}\\n",
+          "legendFormat": "{{node}}",
           "instant": true
         }
       ],
diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml
index 8f614ae..8b5d50d 100644
--- a/services/monitoring/grafana-dashboard-network.yaml
+++ b/services/monitoring/grafana-dashboard-network.yaml
@@ -211,7 +211,7 @@ data:
           },
           "targets": [
             {
-              "expr": "avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))",
+              "expr": "avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
               "refId": "A",
               "legendFormat": "{{node}}"
             }
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 5f3062a..26e0454 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -449,7 +449,7 @@ data:
             {
               "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
               "refId": "A",
-              "legendFormat": "{{node}}\\n",
+              "legendFormat": "{{node}}",
               "instant": true
             }
           ],
@@ -522,7 +522,7 @@ data:
             {
               "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
               "refId": "A",
-              "legendFormat": "{{node}}\\n",
+              "legendFormat": "{{node}}",
               "instant": true
             }
           ],
@@ -593,9 +593,9 @@ data:
           },
           "targets": [
             {
-              "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
+              "expr": "topk(1, avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
               "refId": "A",
-              "legendFormat": "{{node}}\\n",
+              "legendFormat": "{{node}}",
               "instant": true
             }
           ],
@@ -662,9 +662,9 @@ data:
           },
           "targets": [
             {
-              "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
+              "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
               "refId": "A",
-              "legendFormat": "{{node}}\\n",
+              "legendFormat": "{{node}}",
               "instant": true
             }
           ],

From 53427cc8fa893d22814aa5d94c338fcd3db8107b Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 17 Nov 2025 20:19:20 -0300
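Subject: [PATCH 24/71] monitoring: fix net/io legend labels

Add topk_with_node(), which wraps an expression in
label_replace(topk(1, ...)), and use it for the NET and I/O hottest-node
panels. The stat panel loop no longer adds topk(1, ...) itself, so the
CPU and RAM panels are rendered without topk after this change.

---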
 scripts/render_dashboards.py                        | 10 +++++++---
 services/monitoring/dashboards/atlas-overview.json  |  8 ++++----
 services/monitoring/grafana-dashboard-overview.yaml |  8 ++++----
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index 37f2607..cf9487f 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -144,6 +144,10 @@ def astreae_free_expr(mount):
     return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})"
 
 
+def topk_with_node(expr):
+    return f'label_replace(topk(1, {expr}), "__name__", "$1", "node", "(.*)")'
+
+
 def node_net_expr(scope=""):
     base = (
         'sum by (instance) ('
@@ -469,15 +473,15 @@ def build_overview():
     hottest = [
         (7, "Hottest node: CPU", node_cpu_expr(), "percent"),
         (8, "Hottest node: RAM", node_mem_expr(), "percent"),
-        (9, "Hottest node: NET (rx+tx)", node_net_expr(), "Bps"),
-        (10, "Hottest node: I/O (r+w)", node_io_expr(), "Bps"),
+        (9, "Hottest node: NET (rx+tx)", topk_with_node(node_net_expr()), "Bps"),
+        (10, "Hottest node: I/O (r+w)", topk_with_node(node_io_expr()), "Bps"),
     ]
     for idx, (panel_id, title, expr, unit) in enumerate(hottest):
         panels.append(
             stat_panel(
                 panel_id,
                 title,
-                f"topk(1, {expr})",
+                f"{expr}",
                 {"h": 5, "w": 6, "x": 6 * idx, "y": 5},
                 unit=unit,
                 thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index be5dead..e116b34 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -438,7 +438,7 @@
       },
       "targets": [
         {
-          "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+          "expr": "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
           "refId": "A",
           "legendFormat": "{{node}}",
           "instant": true
@@ -511,7 +511,7 @@
       },
       "targets": [
         {
-          "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+          "expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
           "refId": "A",
           "legendFormat": "{{node}}",
           "instant": true
@@ -584,7 +584,7 @@
       },
       "targets": [
         {
-          "expr": "topk(1, avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+          "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
           "refId": "A",
           "legendFormat": "{{node}}",
           "instant": true
@@ -653,7 +653,7 @@
       },
       "targets": [
         {
-          "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+          "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
           "refId": "A",
           "legendFormat": "{{node}}",
           "instant": true
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 26e0454..36f610b 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -447,7 +447,7 @@ data:
           },
           "targets": [
             {
-              "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+              "expr": "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
               "refId": "A",
               "legendFormat": "{{node}}",
               "instant": true
@@ -520,7 +520,7 @@ data:
           },
           "targets": [
             {
-              "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+              "expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
               "refId": "A",
               "legendFormat": "{{node}}",
               "instant": true
@@ -593,7 +593,7 @@ data:
           },
           "targets": [
             {
-              "expr": "topk(1, avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+              "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
               "refId": "A",
               "legendFormat": "{{node}}",
               "instant": true
@@ -662,7 +662,7 @@ data:
           },
           "targets": [
             {
-              "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+              "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
               "refId": "A",
               "legendFormat": "{{node}}",
               "instant": true

From 76d3dc6ae238e880dfe7f39d3a9b04b2ef3fbea0 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 17 Nov 2025 21:20:19 -0300
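Subject: [PATCH 25/71] monitoring: restore top1 hottest stats

Wrap the CPU and RAM hottest-node expressions in topk_with_node() again.
Also switch node_net_expr() from the container_network_*_bytes_total
metrics to node_network_*_bytes_total (excluding the lo device), which
changes the per-node throughput panel on the network dashboard as well.

---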
 scripts/render_dashboards.py                        | 8 ++++----
 services/monitoring/dashboards/atlas-network.json   | 2 +-
 services/monitoring/dashboards/atlas-overview.json  | 6 +++---
 services/monitoring/grafana-dashboard-network.yaml  | 2 +-
 services/monitoring/grafana-dashboard-overview.yaml | 6 +++---
 5 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index cf9487f..5d5c049 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -151,8 +151,8 @@ def topk_with_node(expr):
 def node_net_expr(scope=""):
     base = (
         'sum by (instance) ('
-        'rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m]) '
-        '+ rate(container_network_transmit_bytes_total{namespace!="" ,pod!=""}[5m]))'
+        'rate(node_network_receive_bytes_total{device!~"lo"}[5m]) '
+        '+ rate(node_network_transmit_bytes_total{device!~"lo"}[5m]))'
     )
     return scoped_node_expr(base, scope)
 
@@ -471,8 +471,8 @@ def build_overview():
         )
 
     hottest = [
-        (7, "Hottest node: CPU", node_cpu_expr(), "percent"),
-        (8, "Hottest node: RAM", node_mem_expr(), "percent"),
+        (7, "Hottest node: CPU", topk_with_node(node_cpu_expr()), "percent"),
+        (8, "Hottest node: RAM", topk_with_node(node_mem_expr()), "percent"),
         (9, "Hottest node: NET (rx+tx)", topk_with_node(node_net_expr()), "Bps"),
         (10, "Hottest node: I/O (r+w)", topk_with_node(node_io_expr()), "Bps"),
     ]
diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json
index 42026eb..0363b81 100644
--- a/services/monitoring/dashboards/atlas-network.json
+++ b/services/monitoring/dashboards/atlas-network.json
@@ -202,7 +202,7 @@
       },
       "targets": [
         {
-          "expr": "avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
+          "expr": "avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
           "refId": "A",
           "legendFormat": "{{node}}"
         }
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index e116b34..7f65265 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -438,7 +438,7 @@
       },
       "targets": [
         {
-          "expr": "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
+          "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
           "refId": "A",
           "legendFormat": "{{node}}",
           "instant": true
@@ -511,7 +511,7 @@
       },
       "targets": [
         {
-          "expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
+          "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
           "refId": "A",
           "legendFormat": "{{node}}",
           "instant": true
@@ -584,7 +584,7 @@
       },
       "targets": [
         {
-          "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
+          "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
           "refId": "A",
           "legendFormat": "{{node}}",
           "instant": true
diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml
index 8b5d50d..2d7d989 100644
--- a/services/monitoring/grafana-dashboard-network.yaml
+++ b/services/monitoring/grafana-dashboard-network.yaml
@@ -211,7 +211,7 @@ data:
           },
           "targets": [
             {
-              "expr": "avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
+              "expr": "avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
               "refId": "A",
               "legendFormat": "{{node}}"
             }
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 36f610b..c1f8715 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -447,7 +447,7 @@ data:
           },
           "targets": [
             {
-              "expr": "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
+              "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
               "refId": "A",
               "legendFormat": "{{node}}",
               "instant": true
@@ -520,7 +520,7 @@ data:
           },
           "targets": [
             {
-              "expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
+              "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
               "refId": "A",
               "legendFormat": "{{node}}",
               "instant": true
@@ -593,7 +593,7 @@ data:
           },
           "targets": [
             {
-              "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
+              "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
               "refId": "A",
               "legendFormat": "{{node}}",
               "instant": true

From b59677615c7145657f8d67c99699b30bcf86314a Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 17 Nov 2025 21:48:12 -0300
Subject: [PATCH 26/71] monitoring: worker/control-plane splits

---
 scripts/render_dashboards.py                    | 17 +++++++++--------
 .../monitoring/dashboards/atlas-overview.json   | 16 ++++++++--------
 .../monitoring/grafana-dashboard-overview.yaml  | 16 ++++++++--------
 3 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index 5d5c049..cf34d6a 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -509,11 +509,12 @@ def build_overview():
         )
     )
 
+    worker_filter = f"{WORKER_REGEX}"
     panels.append(
         timeseries_panel(
             13,
-            "Cluster node CPU",
-            node_cpu_expr(),
+            "Worker node CPU",
+            node_cpu_expr(worker_filter),
             {"h": 8, "w": 12, "x": 0, "y": 19},
             unit="percent",
             legend="{{node}}",
@@ -526,8 +527,8 @@ def build_overview():
     panels.append(
         timeseries_panel(
             14,
-            "Cluster node RAM",
-            node_mem_expr(),
+            "Worker node RAM",
+            node_mem_expr(worker_filter),
             {"h": 8, "w": 12, "x": 12, "y": 19},
             unit="percent",
             legend="{{node}}",
@@ -541,8 +542,8 @@ def build_overview():
     panels.append(
         timeseries_panel(
             15,
-            "Control plane CPU (incl. titan-db)",
-            node_cpu_expr(CONTROL_ALL_REGEX),
+            "Control plane CPU",
+            node_cpu_expr(CONTROL_REGEX),
             {"h": 7, "w": 12, "x": 0, "y": 27},
             unit="percent",
             legend="{{node}}",
@@ -553,8 +554,8 @@ def build_overview():
     panels.append(
         timeseries_panel(
             16,
-            "Control plane RAM (incl. titan-db)",
-            node_mem_expr(CONTROL_ALL_REGEX),
+            "Control plane RAM",
+            node_mem_expr(CONTROL_REGEX),
             {"h": 7, "w": 12, "x": 12, "y": 27},
             unit="percent",
             legend="{{node}}",
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 7f65265..bd081a7 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -793,7 +793,7 @@
     {
       "id": 13,
       "type": "timeseries",
-      "title": "Cluster node CPU",
+      "title": "Worker node CPU",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -806,7 +806,7 @@
       },
       "targets": [
         {
-          "expr": "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
+          "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
           "refId": "A",
           "legendFormat": "{{node}}"
         }
@@ -840,7 +840,7 @@
     {
       "id": 14,
       "type": "timeseries",
-      "title": "Cluster node RAM",
+      "title": "Worker node RAM",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -853,7 +853,7 @@
       },
       "targets": [
         {
-          "expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
+          "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
           "refId": "A",
           "legendFormat": "{{node}}"
         }
@@ -887,7 +887,7 @@
     {
       "id": 15,
       "type": "timeseries",
-      "title": "Control plane CPU (incl. titan-db)",
+      "title": "Control plane CPU",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -900,7 +900,7 @@
       },
       "targets": [
         {
-          "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
+          "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
           "refId": "A",
           "legendFormat": "{{node}}"
         }
@@ -924,7 +924,7 @@
     {
       "id": 16,
       "type": "timeseries",
-      "title": "Control plane RAM (incl. titan-db)",
+      "title": "Control plane RAM",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -937,7 +937,7 @@
       },
       "targets": [
         {
-          "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
+          "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
           "refId": "A",
           "legendFormat": "{{node}}"
         }
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index c1f8715..fb3d111 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -802,7 +802,7 @@ data:
         {
           "id": 13,
           "type": "timeseries",
-          "title": "Cluster node CPU",
+          "title": "Worker node CPU",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -815,7 +815,7 @@ data:
           },
           "targets": [
             {
-              "expr": "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
+              "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
               "refId": "A",
               "legendFormat": "{{node}}"
             }
@@ -849,7 +849,7 @@ data:
         {
           "id": 14,
           "type": "timeseries",
-          "title": "Cluster node RAM",
+          "title": "Worker node RAM",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -862,7 +862,7 @@ data:
           },
           "targets": [
             {
-              "expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
+              "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
               "refId": "A",
               "legendFormat": "{{node}}"
             }
@@ -896,7 +896,7 @@ data:
         {
           "id": 15,
           "type": "timeseries",
-          "title": "Control plane CPU (incl. titan-db)",
+          "title": "Control plane CPU",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -909,7 +909,7 @@ data:
           },
           "targets": [
             {
-              "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
+              "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
               "refId": "A",
               "legendFormat": "{{node}}"
             }
@@ -933,7 +933,7 @@ data:
         {
           "id": 16,
           "type": "timeseries",
-          "title": "Control plane RAM (incl. titan-db)",
+          "title": "Control plane RAM",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -946,7 +946,7 @@ data:
           },
           "targets": [
             {
-              "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
+              "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
               "refId": "A",
               "legendFormat": "{{node}}"
             }

From be6052c47c18c1bcef61af0046fa77d432a369cc Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 17 Nov 2025 21:57:40 -0300
Subject: [PATCH 27/71] monitoring: unify namespace share panels

---
 scripts/render_dashboards.py                  | 25 ++++++++++++++-----
 .../monitoring/dashboards/atlas-overview.json |  4 +--
 .../grafana-dashboard-overview.yaml           |  4 +--
 3 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index cf34d6a..3c0d6fa 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -165,6 +165,14 @@ def node_io_expr(scope=""):
     return scoped_node_expr(base, scope)
 
 
+def namespace_cpu_share_expr():
+    return f"({NAMESPACE_CPU_EXPR}) * on(namespace) group_left() ({NAMESPACE_COMBINED_FILTER})"
+
+
+def namespace_ram_share_expr():
+    return f"({NAMESPACE_RAM_EXPR}) * on(namespace) group_left() ({NAMESPACE_COMBINED_FILTER})"
+
+
 PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
 CRASHLOOP_EXPR = (
     'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
@@ -199,12 +207,17 @@ STUCK_TABLE_EXPR = (
 )
 
 NAMESPACE_CPU_EXPR = (
-    'topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=""'
-    ',pod!=""}[5m])) by (namespace))'
+    'sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)'
 )
 NAMESPACE_RAM_EXPR = (
-    'topk(10, sum(container_memory_working_set_bytes{namespace!=""'
-    ',pod!=""}) by (namespace))'
+    'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)'
+)
+NAMESPACE_COMBINED_FILTER = (
+    'topk(10, ('
+    + NAMESPACE_CPU_EXPR
+    + ") + ("
+    + NAMESPACE_RAM_EXPR
+    + ' / 1e9))'
 )
 TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
 NET_INGRESS_EXPR = (
@@ -496,7 +509,7 @@ def build_overview():
         pie_panel(
             11,
             "Namespace CPU share",
-            NAMESPACE_CPU_EXPR,
+            namespace_cpu_share_expr(),
             {"h": 9, "w": 12, "x": 0, "y": 10},
         )
     )
@@ -504,7 +517,7 @@ def build_overview():
         pie_panel(
             12,
             "Namespace RAM share",
-            NAMESPACE_RAM_EXPR,
+            namespace_ram_share_expr(),
             {"h": 9, "w": 12, "x": 12, "y": 10},
         )
     )
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index bd081a7..7529ae8 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -722,7 +722,7 @@
       },
       "targets": [
         {
-          "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\"}[5m])) by (namespace))",
+          "expr": "(sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) * on(namespace) group_left() (topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)))",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -764,7 +764,7 @@
       },
       "targets": [
         {
-          "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\"}) by (namespace))",
+          "expr": "(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace)) * on(namespace) group_left() (topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)))",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index fb3d111..ea3523c 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -731,7 +731,7 @@ data:
           },
           "targets": [
             {
-              "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\"}[5m])) by (namespace))",
+              "expr": "(sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) * on(namespace) group_left() (topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)))",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -773,7 +773,7 @@ data:
           },
           "targets": [
             {
-              "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\"}) by (namespace))",
+              "expr": "(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace)) * on(namespace) group_left() (topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)))",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }

From 37e51b361bed4512455086f0562ea44e82c71e9e Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 17 Nov 2025 22:06:06 -0300
Subject: [PATCH 28/71] monitoring: normalize namespace share

---
 scripts/render_dashboards.py                   | 18 ++++++++++++------
 .../monitoring/dashboards/atlas-overview.json  |  4 ++--
 .../monitoring/grafana-dashboard-overview.yaml |  4 ++--
 3 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index 3c0d6fa..a3ffb94 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -166,11 +166,17 @@ def node_io_expr(scope=""):
 
 
 def namespace_cpu_share_expr():
-    return f"({NAMESPACE_CPU_EXPR}) * on(namespace) group_left() ({NAMESPACE_COMBINED_FILTER})"
+    return (
+        f"(100 * ( {NAMESPACE_CPU_RAW} ) / sum( {NAMESPACE_CPU_RAW} )) * on(namespace) group_left() "
+        f"( {NAMESPACE_COMBINED_FILTER} )"
+    )
 
 
 def namespace_ram_share_expr():
-    return f"({NAMESPACE_RAM_EXPR}) * on(namespace) group_left() ({NAMESPACE_COMBINED_FILTER})"
+    return (
+        f"(100 * ( {NAMESPACE_RAM_RAW} ) / sum( {NAMESPACE_RAM_RAW} )) * on(namespace) group_left() "
+        f"( {NAMESPACE_COMBINED_FILTER} )"
+    )
 
 
 PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
@@ -206,17 +212,17 @@ STUCK_TABLE_EXPR = (
     ")"
 )
 
-NAMESPACE_CPU_EXPR = (
+NAMESPACE_CPU_RAW = (
     'sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)'
 )
-NAMESPACE_RAM_EXPR = (
+NAMESPACE_RAM_RAW = (
     'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)'
 )
 NAMESPACE_COMBINED_FILTER = (
     'topk(10, ('
-    + NAMESPACE_CPU_EXPR
+    + NAMESPACE_CPU_RAW
     + ") + ("
-    + NAMESPACE_RAM_EXPR
+    + NAMESPACE_RAM_RAW
     + ' / 1e9))'
 )
 TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 7529ae8..572f439 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -722,7 +722,7 @@
       },
       "targets": [
         {
-          "expr": "(sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) * on(namespace) group_left() (topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)))",
+          "expr": "(100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )) * on(namespace) group_left() ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) )",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -764,7 +764,7 @@
       },
       "targets": [
         {
-          "expr": "(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace)) * on(namespace) group_left() (topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)))",
+          "expr": "(100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )) * on(namespace) group_left() ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) )",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index ea3523c..77f73ef 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -731,7 +731,7 @@ data:
           },
           "targets": [
             {
-              "expr": "(sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) * on(namespace) group_left() (topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)))",
+              "expr": "(100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )) * on(namespace) group_left() ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) )",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -773,7 +773,7 @@ data:
           },
           "targets": [
             {
-              "expr": "(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace)) * on(namespace) group_left() (topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)))",
+              "expr": "(100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )) * on(namespace) group_left() ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) )",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }

From cc62f497e93f46c9f1f4d518c097fc529626fd46 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 17 Nov 2025 22:19:01 -0300
Subject: [PATCH 29/71] monitoring: fix namespace share percentages

---
 scripts/render_dashboards.py                        | 12 ++++--------
 services/monitoring/dashboards/atlas-overview.json  |  4 ++--
 services/monitoring/grafana-dashboard-overview.yaml |  4 ++--
 3 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index a3ffb94..74c8f7a 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -166,17 +166,13 @@ def node_io_expr(scope=""):
 
 
 def namespace_cpu_share_expr():
-    return (
-        f"(100 * ( {NAMESPACE_CPU_RAW} ) / sum( {NAMESPACE_CPU_RAW} )) * on(namespace) group_left() "
-        f"( {NAMESPACE_COMBINED_FILTER} )"
-    )
+    selected = f"( {NAMESPACE_CPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
+    return f"100 * ( {selected} ) / sum( {selected} )"
 
 
 def namespace_ram_share_expr():
-    return (
-        f"(100 * ( {NAMESPACE_RAM_RAW} ) / sum( {NAMESPACE_RAM_RAW} )) * on(namespace) group_left() "
-        f"( {NAMESPACE_COMBINED_FILTER} )"
-    )
+    selected = f"( {NAMESPACE_RAM_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
+    return f"100 * ( {selected} ) / sum( {selected} )"
 
 
 PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 572f439..753a33d 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -722,7 +722,7 @@
       },
       "targets": [
         {
-          "expr": "(100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )) * on(namespace) group_left() ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) )",
+          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) )",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -764,7 +764,7 @@
       },
       "targets": [
         {
-          "expr": "(100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )) * on(namespace) group_left() ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) )",
+          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) )",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 77f73ef..97bc359 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -731,7 +731,7 @@ data:
           },
           "targets": [
             {
-              "expr": "(100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )) * on(namespace) group_left() ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) )",
+              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) )",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -773,7 +773,7 @@ data:
           },
           "targets": [
             {
-              "expr": "(100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )) * on(namespace) group_left() ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) )",
+              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) )",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }

From 255e014e0a188544deb0d7f9d29288b567f24810 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 17 Nov 2025 22:36:50 -0300
Subject: [PATCH 30/71] monitoring: color namespace pies

---
 scripts/render_dashboards.py                  |  35 +-
 .../monitoring/dashboards/atlas-overview.json | 322 +++++++++++++++++-
 .../grafana-dashboard-overview.yaml           | 322 +++++++++++++++++-
 3 files changed, 670 insertions(+), 9 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index 74c8f7a..bec895a 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -361,6 +361,18 @@ def table_panel(
 
 def pie_panel(panel_id, title, expr, grid):
     """Return a pie chart panel with readable namespace labels."""
+    palette = [
+        "#EF476F",
+        "#FFD166",
+        "#06D6A0",
+        "#118AB2",
+        "#073B4C",
+        "#F78C6B",
+        "#8EECF5",
+        "#E0FF4F",
+        "#B5179E",
+        "#52B788",
+    ]
     return {
         "id": panel_id,
         "type": "piechart",
@@ -368,7 +380,28 @@ def pie_panel(panel_id, title, expr, grid):
         "datasource": PROM_DS,
         "gridPos": grid,
         "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}],
-        "fieldConfig": {"defaults": {"unit": "percent"}, "overrides": []},
+        "fieldConfig": {
+            "defaults": {
+                "unit": "percent",
+                "custom": {"gradientMode": "scheme"},
+                "color": {"mode": "palette-classic"},
+            },
+            "overrides": [
+                {
+                    "matcher": {"id": "byIndex", "options": idx},
+                    "properties": [
+                        {
+                            "id": "color",
+                            "value": {
+                                "mode": "fixed",
+                                "fixedColor": palette[idx % len(palette)],
+                            },
+                        }
+                    ],
+                }
+                for idx in range(10)
+            ],
+        },
         "options": {
             "legend": {"displayMode": "list", "placement": "right"},
             "pieType": "pie",
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 753a33d..8be7f9d 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -729,9 +729,166 @@
       ],
       "fieldConfig": {
         "defaults": {
-          "unit": "percent"
+          "unit": "percent",
+          "custom": {
+            "gradientMode": "scheme"
+          },
+          "color": {
+            "mode": "palette-classic"
+          }
         },
-        "overrides": []
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byIndex",
+              "options": 0
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "#EF476F"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byIndex",
+              "options": 1
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "#FFD166"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byIndex",
+              "options": 2
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "#06D6A0"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byIndex",
+              "options": 3
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "#118AB2"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byIndex",
+              "options": 4
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "#073B4C"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byIndex",
+              "options": 5
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "#F78C6B"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byIndex",
+              "options": 6
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "#8EECF5"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byIndex",
+              "options": 7
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "#E0FF4F"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byIndex",
+              "options": 8
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "#B5179E"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byIndex",
+              "options": 9
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "#52B788"
+                }
+              }
+            ]
+          }
+        ]
       },
       "options": {
         "legend": {
@@ -771,9 +928,166 @@
       ],
       "fieldConfig": {
         "defaults": {
-          "unit": "percent"
+          "unit": "percent",
+          "custom": {
+            "gradientMode": "scheme"
+          },
+          "color": {
+            "mode": "palette-classic"
+          }
         },
-        "overrides": []
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byIndex",
+              "options": 0
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "#EF476F"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byIndex",
+              "options": 1
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "#FFD166"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byIndex",
+              "options": 2
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "#06D6A0"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byIndex",
+              "options": 3
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "#118AB2"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byIndex",
+              "options": 4
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "#073B4C"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byIndex",
+              "options": 5
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "#F78C6B"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byIndex",
+              "options": 6
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "#8EECF5"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byIndex",
+              "options": 7
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "#E0FF4F"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byIndex",
+              "options": 8
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "#B5179E"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byIndex",
+              "options": 9
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "#52B788"
+                }
+              }
+            ]
+          }
+        ]
       },
       "options": {
         "legend": {
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 97bc359..06a40c7 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -738,9 +738,166 @@ data:
           ],
           "fieldConfig": {
             "defaults": {
-              "unit": "percent"
+              "unit": "percent",
+              "custom": {
+                "gradientMode": "scheme"
+              },
+              "color": {
+                "mode": "palette-classic"
+              }
             },
-            "overrides": []
+            "overrides": [
+              {
+                "matcher": {
+                  "id": "byIndex",
+                  "options": 0
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "#EF476F"
+                    }
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byIndex",
+                  "options": 1
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "#FFD166"
+                    }
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byIndex",
+                  "options": 2
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "#06D6A0"
+                    }
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byIndex",
+                  "options": 3
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "#118AB2"
+                    }
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byIndex",
+                  "options": 4
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "#073B4C"
+                    }
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byIndex",
+                  "options": 5
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "#F78C6B"
+                    }
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byIndex",
+                  "options": 6
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "#8EECF5"
+                    }
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byIndex",
+                  "options": 7
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "#E0FF4F"
+                    }
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byIndex",
+                  "options": 8
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "#B5179E"
+                    }
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byIndex",
+                  "options": 9
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "#52B788"
+                    }
+                  }
+                ]
+              }
+            ]
           },
           "options": {
             "legend": {
@@ -780,9 +937,166 @@ data:
           ],
           "fieldConfig": {
             "defaults": {
-              "unit": "percent"
+              "unit": "percent",
+              "custom": {
+                "gradientMode": "scheme"
+              },
+              "color": {
+                "mode": "palette-classic"
+              }
             },
-            "overrides": []
+            "overrides": [
+              {
+                "matcher": {
+                  "id": "byIndex",
+                  "options": 0
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "#EF476F"
+                    }
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byIndex",
+                  "options": 1
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "#FFD166"
+                    }
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byIndex",
+                  "options": 2
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "#06D6A0"
+                    }
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byIndex",
+                  "options": 3
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "#118AB2"
+                    }
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byIndex",
+                  "options": 4
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "#073B4C"
+                    }
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byIndex",
+                  "options": 5
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "#F78C6B"
+                    }
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byIndex",
+                  "options": 6
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "#8EECF5"
+                    }
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byIndex",
+                  "options": 7
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "#E0FF4F"
+                    }
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byIndex",
+                  "options": 8
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "#B5179E"
+                    }
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byIndex",
+                  "options": 9
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "#52B788"
+                    }
+                  }
+                ]
+              }
+            ]
           },
           "options": {
             "legend": {

From 442a89d327b8f198546b68a42154d1555f71c7c2 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 17 Nov 2025 22:39:50 -0300
Subject: [PATCH 31/71] monitoring: fix pie colors & thresholds

---
 scripts/render_dashboards.py                  |  41 +--
 .../monitoring/dashboards/atlas-overview.json | 332 +-----------------
 .../grafana-dashboard-overview.yaml           | 332 +-----------------
 3 files changed, 22 insertions(+), 683 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index bec895a..2b0af09 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -361,18 +361,6 @@ def table_panel(
 
 def pie_panel(panel_id, title, expr, grid):
     """Return a pie chart panel with readable namespace labels."""
-    palette = [
-        "#EF476F",
-        "#FFD166",
-        "#06D6A0",
-        "#118AB2",
-        "#073B4C",
-        "#F78C6B",
-        "#8EECF5",
-        "#E0FF4F",
-        "#B5179E",
-        "#52B788",
-    ]
     return {
         "id": panel_id,
         "type": "piechart",
@@ -380,28 +368,7 @@ def pie_panel(panel_id, title, expr, grid):
         "datasource": PROM_DS,
         "gridPos": grid,
         "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}],
-        "fieldConfig": {
-            "defaults": {
-                "unit": "percent",
-                "custom": {"gradientMode": "scheme"},
-                "color": {"mode": "palette-classic"},
-            },
-            "overrides": [
-                {
-                    "matcher": {"id": "byIndex", "options": idx},
-                    "properties": [
-                        {
-                            "id": "color",
-                            "value": {
-                                "mode": "fixed",
-                                "fixedColor": palette[idx % len(palette)],
-                            },
-                        }
-                    ],
-                }
-                for idx in range(10)
-            ],
-        },
+        "fieldConfig": {"defaults": {"unit": "percent"}, "overrides": []},
         "options": {
             "legend": {"displayMode": "list", "placement": "right"},
             "pieType": "pie",
@@ -482,7 +449,7 @@ def build_overview():
             thresholds = {
                 "mode": "absolute",
                 "steps": [
-                    {"color": "red", "value": None},
+                    {"color": "red", "value": 0},
                     {"color": "orange", "value": WORKER_TOTAL - 2},
                     {"color": "yellow", "value": WORKER_TOTAL - 1},
                     {"color": "green", "value": WORKER_TOTAL},
@@ -492,7 +459,7 @@ def build_overview():
             thresholds = {
                 "mode": "absolute",
                 "steps": [
-                    {"color": "red", "value": None},
+                    {"color": "red", "value": 0},
                     {"color": "green", "value": CONTROL_TOTAL},
                 ],
             }
@@ -500,7 +467,7 @@ def build_overview():
             thresholds = {
                 "mode": "absolute",
                 "steps": [
-                    {"color": "green", "value": None},
+                    {"color": "green", "value": 0},
                     {"color": "yellow", "value": 1},
                     {"color": "orange", "value": 2},
                     {"color": "red", "value": 3},
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 8be7f9d..4481904 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -38,7 +38,7 @@
             "steps": [
               {
                 "color": "red",
-                "value": null
+                "value": 0
               },
               {
                 "color": "orange",
@@ -107,7 +107,7 @@
             "steps": [
               {
                 "color": "red",
-                "value": null
+                "value": 0
               },
               {
                 "color": "green",
@@ -168,7 +168,7 @@
             "steps": [
               {
                 "color": "green",
-                "value": null
+                "value": 0
               },
               {
                 "color": "yellow",
@@ -243,7 +243,7 @@
             "steps": [
               {
                 "color": "green",
-                "value": null
+                "value": 0
               },
               {
                 "color": "yellow",
@@ -318,7 +318,7 @@
             "steps": [
               {
                 "color": "green",
-                "value": null
+                "value": 0
               },
               {
                 "color": "yellow",
@@ -729,166 +729,9 @@
       ],
       "fieldConfig": {
         "defaults": {
-          "unit": "percent",
-          "custom": {
-            "gradientMode": "scheme"
-          },
-          "color": {
-            "mode": "palette-classic"
-          }
+          "unit": "percent"
         },
-        "overrides": [
-          {
-            "matcher": {
-              "id": "byIndex",
-              "options": 0
-            },
-            "properties": [
-              {
-                "id": "color",
-                "value": {
-                  "mode": "fixed",
-                  "fixedColor": "#EF476F"
-                }
-              }
-            ]
-          },
-          {
-            "matcher": {
-              "id": "byIndex",
-              "options": 1
-            },
-            "properties": [
-              {
-                "id": "color",
-                "value": {
-                  "mode": "fixed",
-                  "fixedColor": "#FFD166"
-                }
-              }
-            ]
-          },
-          {
-            "matcher": {
-              "id": "byIndex",
-              "options": 2
-            },
-            "properties": [
-              {
-                "id": "color",
-                "value": {
-                  "mode": "fixed",
-                  "fixedColor": "#06D6A0"
-                }
-              }
-            ]
-          },
-          {
-            "matcher": {
-              "id": "byIndex",
-              "options": 3
-            },
-            "properties": [
-              {
-                "id": "color",
-                "value": {
-                  "mode": "fixed",
-                  "fixedColor": "#118AB2"
-                }
-              }
-            ]
-          },
-          {
-            "matcher": {
-              "id": "byIndex",
-              "options": 4
-            },
-            "properties": [
-              {
-                "id": "color",
-                "value": {
-                  "mode": "fixed",
-                  "fixedColor": "#073B4C"
-                }
-              }
-            ]
-          },
-          {
-            "matcher": {
-              "id": "byIndex",
-              "options": 5
-            },
-            "properties": [
-              {
-                "id": "color",
-                "value": {
-                  "mode": "fixed",
-                  "fixedColor": "#F78C6B"
-                }
-              }
-            ]
-          },
-          {
-            "matcher": {
-              "id": "byIndex",
-              "options": 6
-            },
-            "properties": [
-              {
-                "id": "color",
-                "value": {
-                  "mode": "fixed",
-                  "fixedColor": "#8EECF5"
-                }
-              }
-            ]
-          },
-          {
-            "matcher": {
-              "id": "byIndex",
-              "options": 7
-            },
-            "properties": [
-              {
-                "id": "color",
-                "value": {
-                  "mode": "fixed",
-                  "fixedColor": "#E0FF4F"
-                }
-              }
-            ]
-          },
-          {
-            "matcher": {
-              "id": "byIndex",
-              "options": 8
-            },
-            "properties": [
-              {
-                "id": "color",
-                "value": {
-                  "mode": "fixed",
-                  "fixedColor": "#B5179E"
-                }
-              }
-            ]
-          },
-          {
-            "matcher": {
-              "id": "byIndex",
-              "options": 9
-            },
-            "properties": [
-              {
-                "id": "color",
-                "value": {
-                  "mode": "fixed",
-                  "fixedColor": "#52B788"
-                }
-              }
-            ]
-          }
-        ]
+        "overrides": []
       },
       "options": {
         "legend": {
@@ -928,166 +771,9 @@
       ],
       "fieldConfig": {
         "defaults": {
-          "unit": "percent",
-          "custom": {
-            "gradientMode": "scheme"
-          },
-          "color": {
-            "mode": "palette-classic"
-          }
+          "unit": "percent"
         },
-        "overrides": [
-          {
-            "matcher": {
-              "id": "byIndex",
-              "options": 0
-            },
-            "properties": [
-              {
-                "id": "color",
-                "value": {
-                  "mode": "fixed",
-                  "fixedColor": "#EF476F"
-                }
-              }
-            ]
-          },
-          {
-            "matcher": {
-              "id": "byIndex",
-              "options": 1
-            },
-            "properties": [
-              {
-                "id": "color",
-                "value": {
-                  "mode": "fixed",
-                  "fixedColor": "#FFD166"
-                }
-              }
-            ]
-          },
-          {
-            "matcher": {
-              "id": "byIndex",
-              "options": 2
-            },
-            "properties": [
-              {
-                "id": "color",
-                "value": {
-                  "mode": "fixed",
-                  "fixedColor": "#06D6A0"
-                }
-              }
-            ]
-          },
-          {
-            "matcher": {
-              "id": "byIndex",
-              "options": 3
-            },
-            "properties": [
-              {
-                "id": "color",
-                "value": {
-                  "mode": "fixed",
-                  "fixedColor": "#118AB2"
-                }
-              }
-            ]
-          },
-          {
-            "matcher": {
-              "id": "byIndex",
-              "options": 4
-            },
-            "properties": [
-              {
-                "id": "color",
-                "value": {
-                  "mode": "fixed",
-                  "fixedColor": "#073B4C"
-                }
-              }
-            ]
-          },
-          {
-            "matcher": {
-              "id": "byIndex",
-              "options": 5
-            },
-            "properties": [
-              {
-                "id": "color",
-                "value": {
-                  "mode": "fixed",
-                  "fixedColor": "#F78C6B"
-                }
-              }
-            ]
-          },
-          {
-            "matcher": {
-              "id": "byIndex",
-              "options": 6
-            },
-            "properties": [
-              {
-                "id": "color",
-                "value": {
-                  "mode": "fixed",
-                  "fixedColor": "#8EECF5"
-                }
-              }
-            ]
-          },
-          {
-            "matcher": {
-              "id": "byIndex",
-              "options": 7
-            },
-            "properties": [
-              {
-                "id": "color",
-                "value": {
-                  "mode": "fixed",
-                  "fixedColor": "#E0FF4F"
-                }
-              }
-            ]
-          },
-          {
-            "matcher": {
-              "id": "byIndex",
-              "options": 8
-            },
-            "properties": [
-              {
-                "id": "color",
-                "value": {
-                  "mode": "fixed",
-                  "fixedColor": "#B5179E"
-                }
-              }
-            ]
-          },
-          {
-            "matcher": {
-              "id": "byIndex",
-              "options": 9
-            },
-            "properties": [
-              {
-                "id": "color",
-                "value": {
-                  "mode": "fixed",
-                  "fixedColor": "#52B788"
-                }
-              }
-            ]
-          }
-        ]
+        "overrides": []
       },
       "options": {
         "legend": {
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 06a40c7..926c71c 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -47,7 +47,7 @@ data:
                 "steps": [
                   {
                     "color": "red",
-                    "value": null
+                    "value": 0
                   },
                   {
                     "color": "orange",
@@ -116,7 +116,7 @@ data:
                 "steps": [
                   {
                     "color": "red",
-                    "value": null
+                    "value": 0
                   },
                   {
                     "color": "green",
@@ -177,7 +177,7 @@ data:
                 "steps": [
                   {
                     "color": "green",
-                    "value": null
+                    "value": 0
                   },
                   {
                     "color": "yellow",
@@ -252,7 +252,7 @@ data:
                 "steps": [
                   {
                     "color": "green",
-                    "value": null
+                    "value": 0
                   },
                   {
                     "color": "yellow",
@@ -327,7 +327,7 @@ data:
                 "steps": [
                   {
                     "color": "green",
-                    "value": null
+                    "value": 0
                   },
                   {
                     "color": "yellow",
@@ -738,166 +738,9 @@ data:
           ],
           "fieldConfig": {
             "defaults": {
-              "unit": "percent",
-              "custom": {
-                "gradientMode": "scheme"
-              },
-              "color": {
-                "mode": "palette-classic"
-              }
+              "unit": "percent"
             },
-            "overrides": [
-              {
-                "matcher": {
-                  "id": "byIndex",
-                  "options": 0
-                },
-                "properties": [
-                  {
-                    "id": "color",
-                    "value": {
-                      "mode": "fixed",
-                      "fixedColor": "#EF476F"
-                    }
-                  }
-                ]
-              },
-              {
-                "matcher": {
-                  "id": "byIndex",
-                  "options": 1
-                },
-                "properties": [
-                  {
-                    "id": "color",
-                    "value": {
-                      "mode": "fixed",
-                      "fixedColor": "#FFD166"
-                    }
-                  }
-                ]
-              },
-              {
-                "matcher": {
-                  "id": "byIndex",
-                  "options": 2
-                },
-                "properties": [
-                  {
-                    "id": "color",
-                    "value": {
-                      "mode": "fixed",
-                      "fixedColor": "#06D6A0"
-                    }
-                  }
-                ]
-              },
-              {
-                "matcher": {
-                  "id": "byIndex",
-                  "options": 3
-                },
-                "properties": [
-                  {
-                    "id": "color",
-                    "value": {
-                      "mode": "fixed",
-                      "fixedColor": "#118AB2"
-                    }
-                  }
-                ]
-              },
-              {
-                "matcher": {
-                  "id": "byIndex",
-                  "options": 4
-                },
-                "properties": [
-                  {
-                    "id": "color",
-                    "value": {
-                      "mode": "fixed",
-                      "fixedColor": "#073B4C"
-                    }
-                  }
-                ]
-              },
-              {
-                "matcher": {
-                  "id": "byIndex",
-                  "options": 5
-                },
-                "properties": [
-                  {
-                    "id": "color",
-                    "value": {
-                      "mode": "fixed",
-                      "fixedColor": "#F78C6B"
-                    }
-                  }
-                ]
-              },
-              {
-                "matcher": {
-                  "id": "byIndex",
-                  "options": 6
-                },
-                "properties": [
-                  {
-                    "id": "color",
-                    "value": {
-                      "mode": "fixed",
-                      "fixedColor": "#8EECF5"
-                    }
-                  }
-                ]
-              },
-              {
-                "matcher": {
-                  "id": "byIndex",
-                  "options": 7
-                },
-                "properties": [
-                  {
-                    "id": "color",
-                    "value": {
-                      "mode": "fixed",
-                      "fixedColor": "#E0FF4F"
-                    }
-                  }
-                ]
-              },
-              {
-                "matcher": {
-                  "id": "byIndex",
-                  "options": 8
-                },
-                "properties": [
-                  {
-                    "id": "color",
-                    "value": {
-                      "mode": "fixed",
-                      "fixedColor": "#B5179E"
-                    }
-                  }
-                ]
-              },
-              {
-                "matcher": {
-                  "id": "byIndex",
-                  "options": 9
-                },
-                "properties": [
-                  {
-                    "id": "color",
-                    "value": {
-                      "mode": "fixed",
-                      "fixedColor": "#52B788"
-                    }
-                  }
-                ]
-              }
-            ]
+            "overrides": []
           },
           "options": {
             "legend": {
@@ -937,166 +780,9 @@ data:
           ],
           "fieldConfig": {
             "defaults": {
-              "unit": "percent",
-              "custom": {
-                "gradientMode": "scheme"
-              },
-              "color": {
-                "mode": "palette-classic"
-              }
+              "unit": "percent"
             },
-            "overrides": [
-              {
-                "matcher": {
-                  "id": "byIndex",
-                  "options": 0
-                },
-                "properties": [
-                  {
-                    "id": "color",
-                    "value": {
-                      "mode": "fixed",
-                      "fixedColor": "#EF476F"
-                    }
-                  }
-                ]
-              },
-              {
-                "matcher": {
-                  "id": "byIndex",
-                  "options": 1
-                },
-                "properties": [
-                  {
-                    "id": "color",
-                    "value": {
-                      "mode": "fixed",
-                      "fixedColor": "#FFD166"
-                    }
-                  }
-                ]
-              },
-              {
-                "matcher": {
-                  "id": "byIndex",
-                  "options": 2
-                },
-                "properties": [
-                  {
-                    "id": "color",
-                    "value": {
-                      "mode": "fixed",
-                      "fixedColor": "#06D6A0"
-                    }
-                  }
-                ]
-              },
-              {
-                "matcher": {
-                  "id": "byIndex",
-                  "options": 3
-                },
-                "properties": [
-                  {
-                    "id": "color",
-                    "value": {
-                      "mode": "fixed",
-                      "fixedColor": "#118AB2"
-                    }
-                  }
-                ]
-              },
-              {
-                "matcher": {
-                  "id": "byIndex",
-                  "options": 4
-                },
-                "properties": [
-                  {
-                    "id": "color",
-                    "value": {
-                      "mode": "fixed",
-                      "fixedColor": "#073B4C"
-                    }
-                  }
-                ]
-              },
-              {
-                "matcher": {
-                  "id": "byIndex",
-                  "options": 5
-                },
-                "properties": [
-                  {
-                    "id": "color",
-                    "value": {
-                      "mode": "fixed",
-                      "fixedColor": "#F78C6B"
-                    }
-                  }
-                ]
-              },
-              {
-                "matcher": {
-                  "id": "byIndex",
-                  "options": 6
-                },
-                "properties": [
-                  {
-                    "id": "color",
-                    "value": {
-                      "mode": "fixed",
-                      "fixedColor": "#8EECF5"
-                    }
-                  }
-                ]
-              },
-              {
-                "matcher": {
-                  "id": "byIndex",
-                  "options": 7
-                },
-                "properties": [
-                  {
-                    "id": "color",
-                    "value": {
-                      "mode": "fixed",
-                      "fixedColor": "#E0FF4F"
-                    }
-                  }
-                ]
-              },
-              {
-                "matcher": {
-                  "id": "byIndex",
-                  "options": 8
-                },
-                "properties": [
-                  {
-                    "id": "color",
-                    "value": {
-                      "mode": "fixed",
-                      "fixedColor": "#B5179E"
-                    }
-                  }
-                ]
-              },
-              {
-                "matcher": {
-                  "id": "byIndex",
-                  "options": 9
-                },
-                "properties": [
-                  {
-                    "id": "color",
-                    "value": {
-                      "mode": "fixed",
-                      "fixedColor": "#52B788"
-                    }
-                  }
-                ]
-              }
-            ]
+            "overrides": []
           },
           "options": {
             "legend": {

From c53c5183012e452ffbc15a719f2f6cf09aa48898 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 17 Nov 2025 22:58:57 -0300
Subject: [PATCH 32/71] monitoring: express namespace share as cluster percent

---
 scripts/render_dashboards.py                        | 4 ++--
 services/monitoring/dashboards/atlas-overview.json  | 4 ++--
 services/monitoring/grafana-dashboard-overview.yaml | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index 2b0af09..4e8e5a5 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -167,12 +167,12 @@ def node_io_expr(scope=""):
 
 def namespace_cpu_share_expr():
     selected = f"( {NAMESPACE_CPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
-    return f"100 * ( {selected} ) / sum( {selected} )"
+    return f"100 * ( {selected} ) / sum( {NAMESPACE_CPU_RAW} )"
 
 
 def namespace_ram_share_expr():
     selected = f"( {NAMESPACE_RAM_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
-    return f"100 * ( {selected} ) / sum( {selected} )"
+    return f"100 * ( {selected} ) / sum( {NAMESPACE_RAM_RAW} )"
 
 
 PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 4481904..55c1909 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -722,7 +722,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) )",
+          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -764,7 +764,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) )",
+          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 926c71c..deeeacc 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -731,7 +731,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) )",
+              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -773,7 +773,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) )",
+              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }

From 0708522b280fb3d0978f75458e979360f353f740 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 17 Nov 2025 23:12:16 -0300
Subject: [PATCH 33/71] monitoring: add namespace gpu share

---
 scripts/render_dashboards.py                  | 46 ++++++++----
 .../monitoring/dashboards/atlas-overview.json | 72 +++++++++++++++----
 .../grafana-dashboard-overview.yaml           | 72 +++++++++++++++----
 3 files changed, 145 insertions(+), 45 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index 4e8e5a5..1248984 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -175,6 +175,11 @@ def namespace_ram_share_expr():
     return f"100 * ( {selected} ) / sum( {NAMESPACE_RAM_RAW} )"
 
 
+def namespace_gpu_share_expr():
+    selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
+    return f"100 * ( {selected} ) / sum( {NAMESPACE_GPU_RAW} )"
+
+
 PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
 CRASHLOOP_EXPR = (
     'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
@@ -214,6 +219,9 @@ NAMESPACE_CPU_RAW = (
 NAMESPACE_RAM_RAW = (
     'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)'
 )
+NAMESPACE_GPU_RAW = (
+    'sum(rate(container_gpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)'
+)
 NAMESPACE_COMBINED_FILTER = (
     'topk(10, ('
     + NAMESPACE_CPU_RAW
@@ -512,7 +520,7 @@ def build_overview():
             11,
             "Namespace CPU share",
             namespace_cpu_share_expr(),
-            {"h": 9, "w": 12, "x": 0, "y": 10},
+            {"h": 9, "w": 8, "x": 0, "y": 10},
         )
     )
     panels.append(
@@ -520,14 +528,22 @@ def build_overview():
             12,
             "Namespace RAM share",
             namespace_ram_share_expr(),
-            {"h": 9, "w": 12, "x": 12, "y": 10},
+            {"h": 9, "w": 8, "x": 8, "y": 10},
+        )
+    )
+    panels.append(
+        pie_panel(
+            13,
+            "Namespace GPU share",
+            namespace_gpu_share_expr(),
+            {"h": 9, "w": 8, "x": 16, "y": 10},
         )
     )
 
     worker_filter = f"{WORKER_REGEX}"
     panels.append(
         timeseries_panel(
-            13,
+            14,
             "Worker node CPU",
             node_cpu_expr(worker_filter),
             {"h": 8, "w": 12, "x": 0, "y": 19},
@@ -541,7 +557,7 @@ def build_overview():
     )
     panels.append(
         timeseries_panel(
-            14,
+            15,
             "Worker node RAM",
             node_mem_expr(worker_filter),
             {"h": 8, "w": 12, "x": 12, "y": 19},
@@ -556,7 +572,7 @@ def build_overview():
 
     panels.append(
         timeseries_panel(
-            15,
+            16,
             "Control plane CPU",
             node_cpu_expr(CONTROL_REGEX),
             {"h": 7, "w": 12, "x": 0, "y": 27},
@@ -568,7 +584,7 @@ def build_overview():
     )
     panels.append(
         timeseries_panel(
-            16,
+            17,
             "Control plane RAM",
             node_mem_expr(CONTROL_REGEX),
             {"h": 7, "w": 12, "x": 12, "y": 27},
@@ -581,7 +597,7 @@ def build_overview():
 
     panels.append(
         timeseries_panel(
-            17,
+            18,
             "Cluster ingress throughput",
             NET_INGRESS_EXPR,
             {"h": 7, "w": 12, "x": 0, "y": 34},
@@ -593,7 +609,7 @@ def build_overview():
     )
     panels.append(
         timeseries_panel(
-            18,
+            19,
             "Cluster egress throughput",
             NET_EGRESS_EXPR,
             {"h": 7, "w": 12, "x": 12, "y": 34},
@@ -606,7 +622,7 @@ def build_overview():
 
     panels.append(
         timeseries_panel(
-            19,
+            20,
             "Root filesystem usage",
             root_usage_expr(),
             {"h": 8, "w": 12, "x": 0, "y": 41},
@@ -621,7 +637,7 @@ def build_overview():
     )
     panels.append(
         {
-            "id": 20,
+            "id": 21,
             "type": "bargauge",
             "title": "Nodes closest to full root disks",
             "datasource": PROM_DS,
@@ -655,10 +671,10 @@ def build_overview():
     )
 
     storage_panels = [
-        (21, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"),
-        (22, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"),
-        (23, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"),
-        (24, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"),
+        (22, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"),
+        (23, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"),
+        (24, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"),
+        (25, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"),
     ]
     for idx, (panel_id, title, expr, unit) in enumerate(storage_panels):
         panels.append(
@@ -675,7 +691,7 @@ def build_overview():
 
     panels.append(
         text_panel(
-            25,
+            26,
             "About this dashboard",
             textwrap.dedent(
                 """\
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 55c1909..47aa5da 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -716,7 +716,7 @@
       },
       "gridPos": {
         "h": 9,
-        "w": 12,
+        "w": 8,
         "x": 0,
         "y": 10
       },
@@ -758,8 +758,8 @@
       },
       "gridPos": {
         "h": 9,
-        "w": 12,
-        "x": 12,
+        "w": 8,
+        "x": 8,
         "y": 10
       },
       "targets": [
@@ -792,6 +792,48 @@
     },
     {
       "id": 13,
+      "type": "piechart",
+      "title": "Namespace GPU share",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 8,
+        "x": 16,
+        "y": 10
+      },
+      "targets": [
+        {
+          "expr": "100 * ( ( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )",
+          "refId": "A",
+          "legendFormat": "{{namespace}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "list",
+          "placement": "right"
+        },
+        "pieType": "pie",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
+      }
+    },
+    {
+      "id": 14,
       "type": "timeseries",
       "title": "Worker node CPU",
       "datasource": {
@@ -838,7 +880,7 @@
       ]
     },
     {
-      "id": 14,
+      "id": 15,
       "type": "timeseries",
       "title": "Worker node RAM",
       "datasource": {
@@ -885,7 +927,7 @@
       ]
     },
     {
-      "id": 15,
+      "id": 16,
       "type": "timeseries",
       "title": "Control plane CPU",
       "datasource": {
@@ -922,7 +964,7 @@
       }
     },
     {
-      "id": 16,
+      "id": 17,
       "type": "timeseries",
       "title": "Control plane RAM",
       "datasource": {
@@ -959,7 +1001,7 @@
       }
     },
     {
-      "id": 17,
+      "id": 18,
       "type": "timeseries",
       "title": "Cluster ingress throughput",
       "datasource": {
@@ -1002,7 +1044,7 @@
       ]
     },
     {
-      "id": 18,
+      "id": 19,
       "type": "timeseries",
       "title": "Cluster egress throughput",
       "datasource": {
@@ -1045,7 +1087,7 @@
       ]
     },
     {
-      "id": 19,
+      "id": 20,
       "type": "timeseries",
       "title": "Root filesystem usage",
       "datasource": {
@@ -1093,7 +1135,7 @@
       ]
     },
     {
-      "id": 20,
+      "id": 21,
       "type": "bargauge",
       "title": "Nodes closest to full root disks",
       "datasource": {
@@ -1162,7 +1204,7 @@
       ]
     },
     {
-      "id": 21,
+      "id": 22,
       "type": "stat",
       "title": "Astreae usage",
       "datasource": {
@@ -1233,7 +1275,7 @@
       ]
     },
     {
-      "id": 22,
+      "id": 23,
       "type": "stat",
       "title": "Asteria usage",
       "datasource": {
@@ -1304,7 +1346,7 @@
       ]
     },
     {
-      "id": 23,
+      "id": 24,
       "type": "stat",
       "title": "Astreae free",
       "datasource": {
@@ -1371,7 +1413,7 @@
       ]
     },
     {
-      "id": 24,
+      "id": 25,
       "type": "stat",
       "title": "Asteria free",
       "datasource": {
@@ -1438,7 +1480,7 @@
       ]
     },
     {
-      "id": 25,
+      "id": 26,
       "type": "text",
       "title": "About this dashboard",
       "gridPos": {
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index deeeacc..96136d7 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -725,7 +725,7 @@ data:
           },
           "gridPos": {
             "h": 9,
-            "w": 12,
+            "w": 8,
             "x": 0,
             "y": 10
           },
@@ -767,8 +767,8 @@ data:
           },
           "gridPos": {
             "h": 9,
-            "w": 12,
-            "x": 12,
+            "w": 8,
+            "x": 8,
             "y": 10
           },
           "targets": [
@@ -801,6 +801,48 @@ data:
         },
         {
           "id": 13,
+          "type": "piechart",
+          "title": "Namespace GPU share",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 8,
+            "x": 16,
+            "y": 10
+          },
+          "targets": [
+            {
+              "expr": "100 * ( ( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )",
+              "refId": "A",
+              "legendFormat": "{{namespace}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "list",
+              "placement": "right"
+            },
+            "pieType": "pie",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          }
+        },
+        {
+          "id": 14,
           "type": "timeseries",
           "title": "Worker node CPU",
           "datasource": {
@@ -847,7 +889,7 @@ data:
           ]
         },
         {
-          "id": 14,
+          "id": 15,
           "type": "timeseries",
           "title": "Worker node RAM",
           "datasource": {
@@ -894,7 +936,7 @@ data:
           ]
         },
         {
-          "id": 15,
+          "id": 16,
           "type": "timeseries",
           "title": "Control plane CPU",
           "datasource": {
@@ -931,7 +973,7 @@ data:
           }
         },
         {
-          "id": 16,
+          "id": 17,
           "type": "timeseries",
           "title": "Control plane RAM",
           "datasource": {
@@ -968,7 +1010,7 @@ data:
           }
         },
         {
-          "id": 17,
+          "id": 18,
           "type": "timeseries",
           "title": "Cluster ingress throughput",
           "datasource": {
@@ -1011,7 +1053,7 @@ data:
           ]
         },
         {
-          "id": 18,
+          "id": 19,
           "type": "timeseries",
           "title": "Cluster egress throughput",
           "datasource": {
@@ -1054,7 +1096,7 @@ data:
           ]
         },
         {
-          "id": 19,
+          "id": 20,
           "type": "timeseries",
           "title": "Root filesystem usage",
           "datasource": {
@@ -1102,7 +1144,7 @@ data:
           ]
         },
         {
-          "id": 20,
+          "id": 21,
           "type": "bargauge",
           "title": "Nodes closest to full root disks",
           "datasource": {
@@ -1171,7 +1213,7 @@ data:
           ]
         },
         {
-          "id": 21,
+          "id": 22,
           "type": "stat",
           "title": "Astreae usage",
           "datasource": {
@@ -1242,7 +1284,7 @@ data:
           ]
         },
         {
-          "id": 22,
+          "id": 23,
           "type": "stat",
           "title": "Asteria usage",
           "datasource": {
@@ -1313,7 +1355,7 @@ data:
           ]
         },
         {
-          "id": 23,
+          "id": 24,
           "type": "stat",
           "title": "Astreae free",
           "datasource": {
@@ -1380,7 +1422,7 @@ data:
           ]
         },
         {
-          "id": 24,
+          "id": 25,
           "type": "stat",
           "title": "Asteria free",
           "datasource": {
@@ -1447,7 +1489,7 @@ data:
           ]
         },
         {
-          "id": 25,
+          "id": 26,
           "type": "text",
           "title": "About this dashboard",
           "gridPos": {

From f4dd1de43fccf81562606223d89dcb076fb187a2 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 17 Nov 2025 23:18:53 -0300
Subject: [PATCH 34/71] monitoring: reorder namespace pies and add gpu data

---
 scripts/render_dashboards.py                  | 28 ++++++++++++-------
 .../monitoring/dashboards/atlas-overview.json | 12 ++++----
 .../grafana-dashboard-overview.yaml           | 12 ++++----
 3 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index 1248984..1c4aef2 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -167,17 +167,20 @@ def node_io_expr(scope=""):
 
 def namespace_cpu_share_expr():
     selected = f"( {NAMESPACE_CPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
-    return f"100 * ( {selected} ) / sum( {NAMESPACE_CPU_RAW} )"
+    total = f"clamp_min(sum( {NAMESPACE_CPU_RAW} ), 1)"
+    return f"100 * ( {selected} ) / {total}"
 
 
 def namespace_ram_share_expr():
     selected = f"( {NAMESPACE_RAM_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
-    return f"100 * ( {selected} ) / sum( {NAMESPACE_RAM_RAW} )"
+    total = f"clamp_min(sum( {NAMESPACE_RAM_RAW} ), 1)"
+    return f"100 * ( {selected} ) / {total}"
 
 
 def namespace_gpu_share_expr():
     selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
-    return f"100 * ( {selected} ) / sum( {NAMESPACE_GPU_RAW} )"
+    total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)"
+    return f"100 * ( {selected} ) / {total}"
 
 
 PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
@@ -222,12 +225,17 @@ NAMESPACE_RAM_RAW = (
 NAMESPACE_GPU_RAW = (
     'sum(rate(container_gpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)'
 )
+NAMESPACE_GPU_RAW = (
+    'sum(kube_pod_container_resource_requests{resource="nvidia.com/gpu",namespace!=""}) by (namespace)'
+)
 NAMESPACE_COMBINED_FILTER = (
     'topk(10, ('
     + NAMESPACE_CPU_RAW
     + ") + ("
     + NAMESPACE_RAM_RAW
-    + ' / 1e9))'
+    + ' / 1e9) + ('
+    + NAMESPACE_GPU_RAW
+    + ' * 10))'
 )
 TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
 NET_INGRESS_EXPR = (
@@ -518,24 +526,24 @@ def build_overview():
     panels.append(
         pie_panel(
             11,
-            "Namespace CPU share",
-            namespace_cpu_share_expr(),
+            "Namespace GPU share",
+            namespace_gpu_share_expr(),
             {"h": 9, "w": 8, "x": 0, "y": 10},
         )
     )
     panels.append(
         pie_panel(
             12,
-            "Namespace RAM share",
-            namespace_ram_share_expr(),
+            "Namespace CPU share",
+            namespace_cpu_share_expr(),
             {"h": 9, "w": 8, "x": 8, "y": 10},
         )
     )
     panels.append(
         pie_panel(
             13,
-            "Namespace GPU share",
-            namespace_gpu_share_expr(),
+            "Namespace RAM share",
+            namespace_ram_share_expr(),
             {"h": 9, "w": 8, "x": 16, "y": 10},
         )
     )
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 47aa5da..f833b89 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -709,7 +709,7 @@
     {
       "id": 11,
       "type": "piechart",
-      "title": "Namespace CPU share",
+      "title": "Namespace GPU share",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -722,7 +722,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )",
+          "expr": "100 * ( ( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -751,7 +751,7 @@
     {
       "id": 12,
       "type": "piechart",
-      "title": "Namespace RAM share",
+      "title": "Namespace CPU share",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -764,7 +764,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )",
+          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -793,7 +793,7 @@
     {
       "id": 13,
       "type": "piechart",
-      "title": "Namespace GPU share",
+      "title": "Namespace RAM share",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -806,7 +806,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )",
+          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 96136d7..fb4e13a 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -718,7 +718,7 @@ data:
         {
           "id": 11,
           "type": "piechart",
-          "title": "Namespace CPU share",
+          "title": "Namespace GPU share",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -731,7 +731,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )",
+              "expr": "100 * ( ( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -760,7 +760,7 @@ data:
         {
           "id": 12,
           "type": "piechart",
-          "title": "Namespace RAM share",
+          "title": "Namespace CPU share",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -773,7 +773,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )",
+              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -802,7 +802,7 @@ data:
         {
           "id": 13,
           "type": "piechart",
-          "title": "Namespace GPU share",
+          "title": "Namespace RAM share",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -815,7 +815,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )",
+              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }

From aef3176c1cf5e0196ecc0fa7641cde53c79c576b Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 17 Nov 2025 23:40:22 -0300
Subject: [PATCH 35/71] monitoring: fix hottest stats and gpu share

---
 scripts/render_dashboards.py                        | 2 +-
 services/monitoring/dashboards/atlas-overview.json  | 8 ++++----
 services/monitoring/grafana-dashboard-overview.yaml | 8 ++++----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index 1c4aef2..a09eeae 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -145,7 +145,7 @@ def astreae_free_expr(mount):
 
 
 def topk_with_node(expr):
-    return f'label_replace(topk(1, {expr}), "__name__", "$1", "node", "(.*)")'
+    return f"topk(1, {expr})"
 
 
 def node_net_expr(scope=""):
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index f833b89..1bb0b53 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -438,7 +438,7 @@
       },
       "targets": [
         {
-          "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
+          "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
           "refId": "A",
           "legendFormat": "{{node}}",
           "instant": true
@@ -511,7 +511,7 @@
       },
       "targets": [
         {
-          "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
+          "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
           "refId": "A",
           "legendFormat": "{{node}}",
           "instant": true
@@ -584,7 +584,7 @@
       },
       "targets": [
         {
-          "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
+          "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
           "refId": "A",
           "legendFormat": "{{node}}",
           "instant": true
@@ -653,7 +653,7 @@
       },
       "targets": [
         {
-          "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
+          "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
           "refId": "A",
           "legendFormat": "{{node}}",
           "instant": true
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index fb4e13a..f2ef289 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -447,7 +447,7 @@ data:
           },
           "targets": [
             {
-              "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
+              "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
               "refId": "A",
               "legendFormat": "{{node}}",
               "instant": true
@@ -520,7 +520,7 @@ data:
           },
           "targets": [
             {
-              "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
+              "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
               "refId": "A",
               "legendFormat": "{{node}}",
               "instant": true
@@ -593,7 +593,7 @@ data:
           },
           "targets": [
             {
-              "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
+              "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
               "refId": "A",
               "legendFormat": "{{node}}",
               "instant": true
@@ -662,7 +662,7 @@ data:
           },
           "targets": [
             {
-              "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
+              "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
               "refId": "A",
               "legendFormat": "{{node}}",
               "instant": true

From beb3243839343349cfe2803ea9a8be634d9fc72c Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 17 Nov 2025 23:42:55 -0300
Subject: [PATCH 36/71] Revert GPU pie chart additions

---
 scripts/render_dashboards.py                  | 66 +++++---------
 .../monitoring/dashboards/atlas-overview.json | 88 +++++--------------
 .../grafana-dashboard-overview.yaml           | 88 +++++--------------
 3 files changed, 67 insertions(+), 175 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index a09eeae..4e8e5a5 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -145,7 +145,7 @@ def astreae_free_expr(mount):
 
 
 def topk_with_node(expr):
-    return f"topk(1, {expr})"
+    return f'label_replace(topk(1, {expr}), "__name__", "$1", "node", "(.*)")'
 
 
 def node_net_expr(scope=""):
@@ -167,20 +167,12 @@ def node_io_expr(scope=""):
 
 def namespace_cpu_share_expr():
     selected = f"( {NAMESPACE_CPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
-    total = f"clamp_min(sum( {NAMESPACE_CPU_RAW} ), 1)"
-    return f"100 * ( {selected} ) / {total}"
+    return f"100 * ( {selected} ) / sum( {NAMESPACE_CPU_RAW} )"
 
 
 def namespace_ram_share_expr():
     selected = f"( {NAMESPACE_RAM_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
-    total = f"clamp_min(sum( {NAMESPACE_RAM_RAW} ), 1)"
-    return f"100 * ( {selected} ) / {total}"
-
-
-def namespace_gpu_share_expr():
-    selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
-    total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)"
-    return f"100 * ( {selected} ) / {total}"
+    return f"100 * ( {selected} ) / sum( {NAMESPACE_RAM_RAW} )"
 
 
 PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
@@ -222,20 +214,12 @@ NAMESPACE_CPU_RAW = (
 NAMESPACE_RAM_RAW = (
     'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)'
 )
-NAMESPACE_GPU_RAW = (
-    'sum(rate(container_gpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)'
-)
-NAMESPACE_GPU_RAW = (
-    'sum(kube_pod_container_resource_requests{resource="nvidia.com/gpu",namespace!=""}) by (namespace)'
-)
 NAMESPACE_COMBINED_FILTER = (
     'topk(10, ('
     + NAMESPACE_CPU_RAW
     + ") + ("
     + NAMESPACE_RAM_RAW
-    + ' / 1e9) + ('
-    + NAMESPACE_GPU_RAW
-    + ' * 10))'
+    + ' / 1e9))'
 )
 TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
 NET_INGRESS_EXPR = (
@@ -526,32 +510,24 @@ def build_overview():
     panels.append(
         pie_panel(
             11,
-            "Namespace GPU share",
-            namespace_gpu_share_expr(),
-            {"h": 9, "w": 8, "x": 0, "y": 10},
+            "Namespace CPU share",
+            namespace_cpu_share_expr(),
+            {"h": 9, "w": 12, "x": 0, "y": 10},
         )
     )
     panels.append(
         pie_panel(
             12,
-            "Namespace CPU share",
-            namespace_cpu_share_expr(),
-            {"h": 9, "w": 8, "x": 8, "y": 10},
-        )
-    )
-    panels.append(
-        pie_panel(
-            13,
             "Namespace RAM share",
             namespace_ram_share_expr(),
-            {"h": 9, "w": 8, "x": 16, "y": 10},
+            {"h": 9, "w": 12, "x": 12, "y": 10},
         )
     )
 
     worker_filter = f"{WORKER_REGEX}"
     panels.append(
         timeseries_panel(
-            14,
+            13,
             "Worker node CPU",
             node_cpu_expr(worker_filter),
             {"h": 8, "w": 12, "x": 0, "y": 19},
@@ -565,7 +541,7 @@ def build_overview():
     )
     panels.append(
         timeseries_panel(
-            15,
+            14,
             "Worker node RAM",
             node_mem_expr(worker_filter),
             {"h": 8, "w": 12, "x": 12, "y": 19},
@@ -580,7 +556,7 @@ def build_overview():
 
     panels.append(
         timeseries_panel(
-            16,
+            15,
             "Control plane CPU",
             node_cpu_expr(CONTROL_REGEX),
             {"h": 7, "w": 12, "x": 0, "y": 27},
@@ -592,7 +568,7 @@ def build_overview():
     )
     panels.append(
         timeseries_panel(
-            17,
+            16,
             "Control plane RAM",
             node_mem_expr(CONTROL_REGEX),
             {"h": 7, "w": 12, "x": 12, "y": 27},
@@ -605,7 +581,7 @@ def build_overview():
 
     panels.append(
         timeseries_panel(
-            18,
+            17,
             "Cluster ingress throughput",
             NET_INGRESS_EXPR,
             {"h": 7, "w": 12, "x": 0, "y": 34},
@@ -617,7 +593,7 @@ def build_overview():
     )
     panels.append(
         timeseries_panel(
-            19,
+            18,
             "Cluster egress throughput",
             NET_EGRESS_EXPR,
             {"h": 7, "w": 12, "x": 12, "y": 34},
@@ -630,7 +606,7 @@ def build_overview():
 
     panels.append(
         timeseries_panel(
-            20,
+            19,
             "Root filesystem usage",
             root_usage_expr(),
             {"h": 8, "w": 12, "x": 0, "y": 41},
@@ -645,7 +621,7 @@ def build_overview():
     )
     panels.append(
         {
-            "id": 21,
+            "id": 20,
             "type": "bargauge",
             "title": "Nodes closest to full root disks",
             "datasource": PROM_DS,
@@ -679,10 +655,10 @@ def build_overview():
     )
 
     storage_panels = [
-        (22, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"),
-        (23, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"),
-        (24, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"),
-        (25, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"),
+        (21, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"),
+        (22, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"),
+        (23, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"),
+        (24, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"),
     ]
     for idx, (panel_id, title, expr, unit) in enumerate(storage_panels):
         panels.append(
@@ -699,7 +675,7 @@ def build_overview():
 
     panels.append(
         text_panel(
-            26,
+            25,
             "About this dashboard",
             textwrap.dedent(
                 """\
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 1bb0b53..55c1909 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -438,7 +438,7 @@
       },
       "targets": [
         {
-          "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+          "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
           "refId": "A",
           "legendFormat": "{{node}}",
           "instant": true
@@ -511,7 +511,7 @@
       },
       "targets": [
         {
-          "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+          "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
           "refId": "A",
           "legendFormat": "{{node}}",
           "instant": true
@@ -584,7 +584,7 @@
       },
       "targets": [
         {
-          "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+          "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
           "refId": "A",
           "legendFormat": "{{node}}",
           "instant": true
@@ -653,7 +653,7 @@
       },
       "targets": [
         {
-          "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+          "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
           "refId": "A",
           "legendFormat": "{{node}}",
           "instant": true
@@ -709,20 +709,20 @@
     {
       "id": 11,
       "type": "piechart",
-      "title": "Namespace GPU share",
+      "title": "Namespace CPU share",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
       },
       "gridPos": {
         "h": 9,
-        "w": 8,
+        "w": 12,
         "x": 0,
         "y": 10
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ), 1)",
+          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -751,20 +751,20 @@
     {
       "id": 12,
       "type": "piechart",
-      "title": "Namespace CPU share",
+      "title": "Namespace RAM share",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
       },
       "gridPos": {
         "h": 9,
-        "w": 8,
-        "x": 8,
+        "w": 12,
+        "x": 12,
         "y": 10
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
+          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -792,48 +792,6 @@
     },
     {
       "id": 13,
-      "type": "piechart",
-      "title": "Namespace RAM share",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 9,
-        "w": 8,
-        "x": 16,
-        "y": 10
-      },
-      "targets": [
-        {
-          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
-          "refId": "A",
-          "legendFormat": "{{namespace}}"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "unit": "percent"
-        },
-        "overrides": []
-      },
-      "options": {
-        "legend": {
-          "displayMode": "list",
-          "placement": "right"
-        },
-        "pieType": "pie",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        }
-      }
-    },
-    {
-      "id": 14,
       "type": "timeseries",
       "title": "Worker node CPU",
       "datasource": {
@@ -880,7 +838,7 @@
       ]
     },
     {
-      "id": 15,
+      "id": 14,
       "type": "timeseries",
       "title": "Worker node RAM",
       "datasource": {
@@ -927,7 +885,7 @@
       ]
     },
     {
-      "id": 16,
+      "id": 15,
       "type": "timeseries",
       "title": "Control plane CPU",
       "datasource": {
@@ -964,7 +922,7 @@
       }
     },
     {
-      "id": 17,
+      "id": 16,
       "type": "timeseries",
       "title": "Control plane RAM",
       "datasource": {
@@ -1001,7 +959,7 @@
       }
     },
     {
-      "id": 18,
+      "id": 17,
       "type": "timeseries",
       "title": "Cluster ingress throughput",
       "datasource": {
@@ -1044,7 +1002,7 @@
       ]
     },
     {
-      "id": 19,
+      "id": 18,
       "type": "timeseries",
       "title": "Cluster egress throughput",
       "datasource": {
@@ -1087,7 +1045,7 @@
       ]
     },
     {
-      "id": 20,
+      "id": 19,
       "type": "timeseries",
       "title": "Root filesystem usage",
       "datasource": {
@@ -1135,7 +1093,7 @@
       ]
     },
     {
-      "id": 21,
+      "id": 20,
       "type": "bargauge",
       "title": "Nodes closest to full root disks",
       "datasource": {
@@ -1204,7 +1162,7 @@
       ]
     },
     {
-      "id": 22,
+      "id": 21,
       "type": "stat",
       "title": "Astreae usage",
       "datasource": {
@@ -1275,7 +1233,7 @@
       ]
     },
     {
-      "id": 23,
+      "id": 22,
       "type": "stat",
       "title": "Asteria usage",
       "datasource": {
@@ -1346,7 +1304,7 @@
       ]
     },
     {
-      "id": 24,
+      "id": 23,
       "type": "stat",
       "title": "Astreae free",
       "datasource": {
@@ -1413,7 +1371,7 @@
       ]
     },
     {
-      "id": 25,
+      "id": 24,
       "type": "stat",
       "title": "Asteria free",
       "datasource": {
@@ -1480,7 +1438,7 @@
       ]
     },
     {
-      "id": 26,
+      "id": 25,
       "type": "text",
       "title": "About this dashboard",
       "gridPos": {
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index f2ef289..deeeacc 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -447,7 +447,7 @@ data:
           },
           "targets": [
             {
-              "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+              "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
               "refId": "A",
               "legendFormat": "{{node}}",
               "instant": true
@@ -520,7 +520,7 @@ data:
           },
           "targets": [
             {
-              "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+              "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
               "refId": "A",
               "legendFormat": "{{node}}",
               "instant": true
@@ -593,7 +593,7 @@ data:
           },
           "targets": [
             {
-              "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+              "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
               "refId": "A",
               "legendFormat": "{{node}}",
               "instant": true
@@ -662,7 +662,7 @@ data:
           },
           "targets": [
             {
-              "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+              "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
               "refId": "A",
               "legendFormat": "{{node}}",
               "instant": true
@@ -718,20 +718,20 @@ data:
         {
           "id": 11,
           "type": "piechart",
-          "title": "Namespace GPU share",
+          "title": "Namespace CPU share",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
             "h": 9,
-            "w": 8,
+            "w": 12,
             "x": 0,
             "y": 10
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ), 1)",
+              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -760,20 +760,20 @@ data:
         {
           "id": 12,
           "type": "piechart",
-          "title": "Namespace CPU share",
+          "title": "Namespace RAM share",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
             "h": 9,
-            "w": 8,
-            "x": 8,
+            "w": 12,
+            "x": 12,
             "y": 10
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
+              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -801,48 +801,6 @@ data:
         },
         {
           "id": 13,
-          "type": "piechart",
-          "title": "Namespace RAM share",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 9,
-            "w": 8,
-            "x": 16,
-            "y": 10
-          },
-          "targets": [
-            {
-              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
-              "refId": "A",
-              "legendFormat": "{{namespace}}"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "percent"
-            },
-            "overrides": []
-          },
-          "options": {
-            "legend": {
-              "displayMode": "list",
-              "placement": "right"
-            },
-            "pieType": "pie",
-            "reduceOptions": {
-              "calcs": [
-                "lastNotNull"
-              ],
-              "fields": "",
-              "values": false
-            }
-          }
-        },
-        {
-          "id": 14,
           "type": "timeseries",
           "title": "Worker node CPU",
           "datasource": {
@@ -889,7 +847,7 @@ data:
           ]
         },
         {
-          "id": 15,
+          "id": 14,
           "type": "timeseries",
           "title": "Worker node RAM",
           "datasource": {
@@ -936,7 +894,7 @@ data:
           ]
         },
         {
-          "id": 16,
+          "id": 15,
           "type": "timeseries",
           "title": "Control plane CPU",
           "datasource": {
@@ -973,7 +931,7 @@ data:
           }
         },
         {
-          "id": 17,
+          "id": 16,
           "type": "timeseries",
           "title": "Control plane RAM",
           "datasource": {
@@ -1010,7 +968,7 @@ data:
           }
         },
         {
-          "id": 18,
+          "id": 17,
           "type": "timeseries",
           "title": "Cluster ingress throughput",
           "datasource": {
@@ -1053,7 +1011,7 @@ data:
           ]
         },
         {
-          "id": 19,
+          "id": 18,
           "type": "timeseries",
           "title": "Cluster egress throughput",
           "datasource": {
@@ -1096,7 +1054,7 @@ data:
           ]
         },
         {
-          "id": 20,
+          "id": 19,
           "type": "timeseries",
           "title": "Root filesystem usage",
           "datasource": {
@@ -1144,7 +1102,7 @@ data:
           ]
         },
         {
-          "id": 21,
+          "id": 20,
           "type": "bargauge",
           "title": "Nodes closest to full root disks",
           "datasource": {
@@ -1213,7 +1171,7 @@ data:
           ]
         },
         {
-          "id": 22,
+          "id": 21,
           "type": "stat",
           "title": "Astreae usage",
           "datasource": {
@@ -1284,7 +1242,7 @@ data:
           ]
         },
         {
-          "id": 23,
+          "id": 22,
           "type": "stat",
           "title": "Asteria usage",
           "datasource": {
@@ -1355,7 +1313,7 @@ data:
           ]
         },
         {
-          "id": 24,
+          "id": 23,
           "type": "stat",
           "title": "Astreae free",
           "datasource": {
@@ -1422,7 +1380,7 @@ data:
           ]
         },
         {
-          "id": 25,
+          "id": 24,
           "type": "stat",
           "title": "Asteria free",
           "datasource": {
@@ -1489,7 +1447,7 @@ data:
           ]
         },
         {
-          "id": 26,
+          "id": 25,
           "type": "text",
           "title": "About this dashboard",
           "gridPos": {

From 2ba642d49f92b20057ab914687830c3d6edf449c Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 18 Nov 2025 00:11:39 -0300
Subject: [PATCH 37/71] monitoring: add gpu pie and tidy net panels

---
 scripts/render_dashboards.py                  |  51 ++++--
 .../monitoring/dashboards/atlas-overview.json | 157 +++++++++++-------
 .../grafana-dashboard-overview.yaml           | 157 +++++++++++-------
 3 files changed, 239 insertions(+), 126 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index 4e8e5a5..c194771 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -167,12 +167,20 @@ def node_io_expr(scope=""):
 
 def namespace_cpu_share_expr():
     selected = f"( {NAMESPACE_CPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
-    return f"100 * ( {selected} ) / sum( {NAMESPACE_CPU_RAW} )"
+    total = f"clamp_min(sum( {NAMESPACE_CPU_RAW} ), 1)"
+    return f"100 * ( {selected} ) / {total}"
 
 
 def namespace_ram_share_expr():
     selected = f"( {NAMESPACE_RAM_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
-    return f"100 * ( {selected} ) / sum( {NAMESPACE_RAM_RAW} )"
+    total = f"clamp_min(sum( {NAMESPACE_RAM_RAW} ), 1)"
+    return f"100 * ( {selected} ) / {total}"
+
+
+def namespace_gpu_share_expr():
+    selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
+    total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)"
+    return f"100 * ( {selected} ) / {total}"
 
 
 PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
@@ -214,12 +222,17 @@ NAMESPACE_CPU_RAW = (
 NAMESPACE_RAM_RAW = (
     'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)'
 )
+NAMESPACE_GPU_RAW = (
+    'sum(kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}) by (namespace)'
+)
 NAMESPACE_COMBINED_FILTER = (
     'topk(10, ('
     + NAMESPACE_CPU_RAW
     + ") + ("
     + NAMESPACE_RAM_RAW
-    + ' / 1e9))'
+    + ' / 1e9) + ( '
+    + NAMESPACE_GPU_RAW
+    + ' * 10))'
 )
 TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
 NET_INGRESS_EXPR = (
@@ -512,22 +525,30 @@ def build_overview():
             11,
             "Namespace CPU share",
             namespace_cpu_share_expr(),
-            {"h": 9, "w": 12, "x": 0, "y": 10},
+            {"h": 9, "w": 8, "x": 0, "y": 10},
         )
     )
     panels.append(
         pie_panel(
             12,
+            "Namespace GPU share",
+            namespace_gpu_share_expr(),
+            {"h": 9, "w": 8, "x": 8, "y": 10},
+        )
+    )
+    panels.append(
+        pie_panel(
+            13,
             "Namespace RAM share",
             namespace_ram_share_expr(),
-            {"h": 9, "w": 12, "x": 12, "y": 10},
+            {"h": 9, "w": 8, "x": 16, "y": 10},
         )
     )
 
     worker_filter = f"{WORKER_REGEX}"
     panels.append(
         timeseries_panel(
-            13,
+            14,
             "Worker node CPU",
             node_cpu_expr(worker_filter),
             {"h": 8, "w": 12, "x": 0, "y": 19},
@@ -541,7 +562,7 @@ def build_overview():
     )
     panels.append(
         timeseries_panel(
-            14,
+            15,
             "Worker node RAM",
             node_mem_expr(worker_filter),
             {"h": 8, "w": 12, "x": 12, "y": 19},
@@ -556,7 +577,7 @@ def build_overview():
 
     panels.append(
         timeseries_panel(
-            15,
+            16,
             "Control plane CPU",
             node_cpu_expr(CONTROL_REGEX),
             {"h": 7, "w": 12, "x": 0, "y": 27},
@@ -568,7 +589,7 @@ def build_overview():
     )
     panels.append(
         timeseries_panel(
-            16,
+            17,
             "Control plane RAM",
             node_mem_expr(CONTROL_REGEX),
             {"h": 7, "w": 12, "x": 12, "y": 27},
@@ -581,11 +602,12 @@ def build_overview():
 
     panels.append(
         timeseries_panel(
-            17,
+            18,
             "Cluster ingress throughput",
             NET_INGRESS_EXPR,
             {"h": 7, "w": 12, "x": 0, "y": 34},
             unit="Bps",
+            legend="Ingress",
             legend_display="list",
             legend_placement="bottom",
             links=link_to("atlas-network"),
@@ -593,11 +615,12 @@ def build_overview():
     )
     panels.append(
         timeseries_panel(
-            18,
+            19,
             "Cluster egress throughput",
             NET_EGRESS_EXPR,
             {"h": 7, "w": 12, "x": 12, "y": 34},
             unit="Bps",
+            legend="Egress",
             legend_display="list",
             legend_placement="bottom",
             links=link_to("atlas-network"),
@@ -606,7 +629,7 @@ def build_overview():
 
     panels.append(
         timeseries_panel(
-            19,
+            20,
             "Root filesystem usage",
             root_usage_expr(),
             {"h": 8, "w": 12, "x": 0, "y": 41},
@@ -621,12 +644,12 @@ def build_overview():
     )
     panels.append(
         {
-            "id": 20,
+            "id": 21,
             "type": "bargauge",
             "title": "Nodes closest to full root disks",
             "datasource": PROM_DS,
             "gridPos": {"h": 8, "w": 12, "x": 12, "y": 41},
-            "targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A"}],
+            "targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A", "legendFormat": "{{node}}"}],
             "fieldConfig": {
                 "defaults": {
                     "unit": "percent",
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 55c1909..0b2f69f 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -716,13 +716,13 @@
       },
       "gridPos": {
         "h": 9,
-        "w": 12,
+        "w": 8,
         "x": 0,
         "y": 10
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )",
+          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -751,20 +751,20 @@
     {
       "id": 12,
       "type": "piechart",
-      "title": "Namespace RAM share",
+      "title": "Namespace GPU share",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
       },
       "gridPos": {
         "h": 9,
-        "w": 12,
-        "x": 12,
+        "w": 8,
+        "x": 8,
         "y": 10
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )",
+          "expr": "100 * ( ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -792,6 +792,48 @@
     },
     {
       "id": 13,
+      "type": "piechart",
+      "title": "Namespace RAM share",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 8,
+        "x": 16,
+        "y": 10
+      },
+      "targets": [
+        {
+          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
+          "refId": "A",
+          "legendFormat": "{{namespace}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "list",
+          "placement": "right"
+        },
+        "pieType": "pie",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
+      }
+    },
+    {
+      "id": 14,
       "type": "timeseries",
       "title": "Worker node CPU",
       "datasource": {
@@ -838,7 +880,7 @@
       ]
     },
     {
-      "id": 14,
+      "id": 15,
       "type": "timeseries",
       "title": "Worker node RAM",
       "datasource": {
@@ -885,7 +927,7 @@
       ]
     },
     {
-      "id": 15,
+      "id": 16,
       "type": "timeseries",
       "title": "Control plane CPU",
       "datasource": {
@@ -922,7 +964,7 @@
       }
     },
     {
-      "id": 16,
+      "id": 17,
       "type": "timeseries",
       "title": "Control plane RAM",
       "datasource": {
@@ -959,7 +1001,7 @@
       }
     },
     {
-      "id": 17,
+      "id": 18,
       "type": "timeseries",
       "title": "Cluster ingress throughput",
       "datasource": {
@@ -975,50 +1017,8 @@
       "targets": [
         {
           "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
-          "refId": "A"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "unit": "Bps"
-        },
-        "overrides": []
-      },
-      "options": {
-        "legend": {
-          "displayMode": "list",
-          "placement": "bottom"
-        },
-        "tooltip": {
-          "mode": "multi"
-        }
-      },
-      "links": [
-        {
-          "title": "Open atlas-network dashboard",
-          "url": "/d/atlas-network",
-          "targetBlank": true
-        }
-      ]
-    },
-    {
-      "id": 18,
-      "type": "timeseries",
-      "title": "Cluster egress throughput",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 12,
-        "x": 12,
-        "y": 34
-      },
-      "targets": [
-        {
-          "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
-          "refId": "A"
+          "refId": "A",
+          "legendFormat": "Ingress"
         }
       ],
       "fieldConfig": {
@@ -1047,6 +1047,50 @@
     {
       "id": 19,
       "type": "timeseries",
+      "title": "Cluster egress throughput",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 7,
+        "w": 12,
+        "x": 12,
+        "y": 34
+      },
+      "targets": [
+        {
+          "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
+          "refId": "A",
+          "legendFormat": "Egress"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "Bps"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "list",
+          "placement": "bottom"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      },
+      "links": [
+        {
+          "title": "Open atlas-network dashboard",
+          "url": "/d/atlas-network",
+          "targetBlank": true
+        }
+      ]
+    },
+    {
+      "id": 20,
+      "type": "timeseries",
       "title": "Root filesystem usage",
       "datasource": {
         "type": "prometheus",
@@ -1093,7 +1137,7 @@
       ]
     },
     {
-      "id": 20,
+      "id": 21,
       "type": "bargauge",
       "title": "Nodes closest to full root disks",
       "datasource": {
@@ -1109,7 +1153,8 @@
       "targets": [
         {
           "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
-          "refId": "A"
+          "refId": "A",
+          "legendFormat": "{{node}}"
         }
       ],
       "fieldConfig": {
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index deeeacc..0ac79db 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -725,13 +725,13 @@ data:
           },
           "gridPos": {
             "h": 9,
-            "w": 12,
+            "w": 8,
             "x": 0,
             "y": 10
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )",
+              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -760,20 +760,20 @@ data:
         {
           "id": 12,
           "type": "piechart",
-          "title": "Namespace RAM share",
+          "title": "Namespace GPU share",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
             "h": 9,
-            "w": 12,
-            "x": 12,
+            "w": 8,
+            "x": 8,
             "y": 10
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )",
+              "expr": "100 * ( ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -801,6 +801,48 @@ data:
         },
         {
           "id": 13,
+          "type": "piechart",
+          "title": "Namespace RAM share",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 8,
+            "x": 16,
+            "y": 10
+          },
+          "targets": [
+            {
+              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
+              "refId": "A",
+              "legendFormat": "{{namespace}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "list",
+              "placement": "right"
+            },
+            "pieType": "pie",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          }
+        },
+        {
+          "id": 14,
           "type": "timeseries",
           "title": "Worker node CPU",
           "datasource": {
@@ -847,7 +889,7 @@ data:
           ]
         },
         {
-          "id": 14,
+          "id": 15,
           "type": "timeseries",
           "title": "Worker node RAM",
           "datasource": {
@@ -894,7 +936,7 @@ data:
           ]
         },
         {
-          "id": 15,
+          "id": 16,
           "type": "timeseries",
           "title": "Control plane CPU",
           "datasource": {
@@ -931,7 +973,7 @@ data:
           }
         },
         {
-          "id": 16,
+          "id": 17,
           "type": "timeseries",
           "title": "Control plane RAM",
           "datasource": {
@@ -968,7 +1010,7 @@ data:
           }
         },
         {
-          "id": 17,
+          "id": 18,
           "type": "timeseries",
           "title": "Cluster ingress throughput",
           "datasource": {
@@ -984,50 +1026,8 @@ data:
           "targets": [
             {
               "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
-              "refId": "A"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "Bps"
-            },
-            "overrides": []
-          },
-          "options": {
-            "legend": {
-              "displayMode": "list",
-              "placement": "bottom"
-            },
-            "tooltip": {
-              "mode": "multi"
-            }
-          },
-          "links": [
-            {
-              "title": "Open atlas-network dashboard",
-              "url": "/d/atlas-network",
-              "targetBlank": true
-            }
-          ]
-        },
-        {
-          "id": 18,
-          "type": "timeseries",
-          "title": "Cluster egress throughput",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 7,
-            "w": 12,
-            "x": 12,
-            "y": 34
-          },
-          "targets": [
-            {
-              "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
-              "refId": "A"
+              "refId": "A",
+              "legendFormat": "Ingress"
             }
           ],
           "fieldConfig": {
@@ -1056,6 +1056,50 @@ data:
         {
           "id": 19,
           "type": "timeseries",
+          "title": "Cluster egress throughput",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 7,
+            "w": 12,
+            "x": 12,
+            "y": 34
+          },
+          "targets": [
+            {
+              "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
+              "refId": "A",
+              "legendFormat": "Egress"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "Bps"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "list",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          },
+          "links": [
+            {
+              "title": "Open atlas-network dashboard",
+              "url": "/d/atlas-network",
+              "targetBlank": true
+            }
+          ]
+        },
+        {
+          "id": 20,
+          "type": "timeseries",
           "title": "Root filesystem usage",
           "datasource": {
             "type": "prometheus",
@@ -1102,7 +1146,7 @@ data:
           ]
         },
         {
-          "id": 20,
+          "id": 21,
           "type": "bargauge",
           "title": "Nodes closest to full root disks",
           "datasource": {
@@ -1118,7 +1162,8 @@ data:
           "targets": [
             {
               "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
-              "refId": "A"
+              "refId": "A",
+              "legendFormat": "{{node}}"
             }
           ],
           "fieldConfig": {

From ac62387e07956a0d31f66f3d7c7c34b9fdc908d7 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 18 Nov 2025 00:19:45 -0300
Subject: [PATCH 38/71] monitoring: stabilize namespace pies and labels

---
 scripts/render_dashboards.py                  | 19 ++++++++++++++-----
 .../monitoring/dashboards/atlas-overview.json |  9 ++++-----
 .../grafana-dashboard-overview.yaml           |  9 ++++-----
 3 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index c194771..d6436ce 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -178,7 +178,10 @@ def namespace_ram_share_expr():
 
 
 def namespace_gpu_share_expr():
-    selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
+    selected = (
+        f"(( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} ))"
+        f" or on(namespace) ( {NAMESPACE_COMBINED_FILTER} * 0)"
+    )
     total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)"
     return f"100 * ( {selected} ) / {total}"
 
@@ -225,14 +228,21 @@ NAMESPACE_RAM_RAW = (
 NAMESPACE_GPU_RAW = (
     'sum(kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}) by (namespace)'
 )
+NAMESPACE_GPU_WEIGHT = (
+    "(( "
+    + NAMESPACE_GPU_RAW
+    + " ) or on(namespace) ( "
+    + NAMESPACE_CPU_RAW
+    + " * 0))"
+)
 NAMESPACE_COMBINED_FILTER = (
     'topk(10, ('
     + NAMESPACE_CPU_RAW
     + ") + ("
     + NAMESPACE_RAM_RAW
-    + ' / 1e9) + ( '
-    + NAMESPACE_GPU_RAW
-    + ' * 10))'
+    + ' / 1e9) + ('
+    + NAMESPACE_GPU_WEIGHT
+    + " * 10))"
 )
 TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
 NET_INGRESS_EXPR = (
@@ -664,7 +674,6 @@ def build_overview():
                             {"color": "red", "value": 85},
                         ],
                     },
-                    "displayName": "{{node}}",
                 },
                 "overrides": [],
             },
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 0b2f69f..7d808c9 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -722,7 +722,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
+          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -764,7 +764,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)",
+          "expr": "100 * ( (( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) )) or on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) * 0) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -806,7 +806,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
+          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -1182,8 +1182,7 @@
                 "value": 85
               }
             ]
-          },
-          "displayName": "{{node}}"
+          }
         },
         "overrides": []
       },
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 0ac79db..02b65f3 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -731,7 +731,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
+              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -773,7 +773,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)",
+              "expr": "100 * ( (( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) )) or on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) * 0) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -815,7 +815,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
+              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -1191,8 +1191,7 @@ data:
                     "value": 85
                   }
                 ]
-              },
-              "displayName": "{{node}}"
+              }
             },
             "overrides": []
           },

From 5144bbe1f290fc9a2a98d50d8f3f3008894df1bb Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 18 Nov 2025 00:31:51 -0300
Subject: [PATCH 39/71] monitoring: fix gpu pie data and network panels

---
 scripts/render_dashboards.py                   | 18 +++++-------------
 .../monitoring/dashboards/atlas-network.json   |  4 ++--
 .../monitoring/dashboards/atlas-overview.json  | 10 +++++-----
 .../monitoring/grafana-dashboard-network.yaml  |  4 ++--
 .../monitoring/grafana-dashboard-overview.yaml | 10 +++++-----
 5 files changed, 19 insertions(+), 27 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index d6436ce..fbed073 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -178,10 +178,7 @@ def namespace_ram_share_expr():
 
 
 def namespace_gpu_share_expr():
-    selected = (
-        f"(( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} ))"
-        f" or on(namespace) ( {NAMESPACE_COMBINED_FILTER} * 0)"
-    )
+    selected = f"(( {NAMESPACE_GPU_RAW} ) or on(namespace) ( {NAMESPACE_COMBINED_FILTER} * 0)) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
     total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)"
     return f"100 * ( {selected} ) / {total}"
 
@@ -226,7 +223,8 @@ NAMESPACE_RAM_RAW = (
     'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)'
 )
 NAMESPACE_GPU_RAW = (
-    'sum(kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}) by (namespace)'
+    'sum(kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"} '
+    'or kube_pod_resource_request{namespace!="",resource="nvidia.com/gpu"}) by (namespace)'
 )
 NAMESPACE_GPU_WEIGHT = (
     "(( "
@@ -245,14 +243,8 @@ NAMESPACE_COMBINED_FILTER = (
     + " * 10))"
 )
 TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
-NET_INGRESS_EXPR = (
-    'sum(rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m])) '
-    "or on() vector(0)"
-)
-NET_EGRESS_EXPR = (
-    'sum(rate(container_network_transmit_bytes_total{namespace!="" ,pod!=""}[5m])) '
-    "or on() vector(0)"
-)
+NET_INGRESS_EXPR = 'sum(rate(node_network_receive_bytes_total{device!~"lo"}[5m])) or on() vector(0)'
+NET_EGRESS_EXPR = 'sum(rate(node_network_transmit_bytes_total{device!~"lo"}[5m])) or on() vector(0)'
 
 # ---------------------------------------------------------------------------
 # Panel factories
diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json
index 0363b81..27da627 100644
--- a/services/monitoring/dashboards/atlas-network.json
+++ b/services/monitoring/dashboards/atlas-network.json
@@ -20,7 +20,7 @@
       },
       "targets": [
         {
-          "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
+          "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
           "refId": "A"
         }
       ],
@@ -80,7 +80,7 @@
       },
       "targets": [
         {
-          "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
+          "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
           "refId": "A"
         }
       ],
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 7d808c9..eb3f11d 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -722,7 +722,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
+          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -764,7 +764,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( (( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) )) or on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) * 0) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)",
+          "expr": "100 * ( (( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) * 0)) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -806,7 +806,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
+          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -1016,7 +1016,7 @@
       },
       "targets": [
         {
-          "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
+          "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
           "refId": "A",
           "legendFormat": "Ingress"
         }
@@ -1060,7 +1060,7 @@
       },
       "targets": [
         {
-          "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
+          "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
           "refId": "A",
           "legendFormat": "Egress"
         }
diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml
index 2d7d989..1b70159 100644
--- a/services/monitoring/grafana-dashboard-network.yaml
+++ b/services/monitoring/grafana-dashboard-network.yaml
@@ -29,7 +29,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
+              "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
               "refId": "A"
             }
           ],
@@ -89,7 +89,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
+              "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
               "refId": "A"
             }
           ],
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 02b65f3..e7ddd48 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -731,7 +731,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
+              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -773,7 +773,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( (( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) )) or on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) * 0) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)",
+              "expr": "100 * ( (( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) * 0)) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -815,7 +815,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
+              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -1025,7 +1025,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
+              "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
               "refId": "A",
               "legendFormat": "Ingress"
             }
@@ -1069,7 +1069,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
+              "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
               "refId": "A",
               "legendFormat": "Egress"
             }

From ec76563a8677cbf5926dcafb13014af8fac028e5 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 18 Nov 2025 01:01:10 -0300
Subject: [PATCH 40/71] monitoring: source gpu pie from limits and node nets

---
 scripts/render_dashboards.py                        | 7 ++++---
 services/monitoring/dashboards/atlas-overview.json  | 6 +++---
 services/monitoring/grafana-dashboard-overview.yaml | 6 +++---
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index fbed073..0916969 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -178,7 +178,7 @@ def namespace_ram_share_expr():
 
 
 def namespace_gpu_share_expr():
-    selected = f"(( {NAMESPACE_GPU_RAW} ) or on(namespace) ( {NAMESPACE_COMBINED_FILTER} * 0)) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
+    selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
     total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)"
     return f"100 * ( {selected} ) / {total}"
 
@@ -223,8 +223,9 @@ NAMESPACE_RAM_RAW = (
     'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)'
 )
 NAMESPACE_GPU_RAW = (
-    'sum(kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"} '
-    'or kube_pod_resource_request{namespace!="",resource="nvidia.com/gpu"}) by (namespace)'
+    'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}'
+    ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})'
+    ') by (namespace)'
 )
 NAMESPACE_GPU_WEIGHT = (
     "(( "
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index eb3f11d..f6d42c1 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -722,7 +722,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
+          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -764,7 +764,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( (( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) * 0)) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)",
+          "expr": "100 * ( ( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -806,7 +806,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
+          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index e7ddd48..bf7b710 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -731,7 +731,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
+              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -773,7 +773,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( (( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) * 0)) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)",
+              "expr": "100 * ( ( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -815,7 +815,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
+              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }

From d7e4bcd53315d5231c0760635479173e76d5c526 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 18 Nov 2025 10:47:24 -0300
Subject: [PATCH 41/71] monitoring: add gpu node fallback

---
 scripts/render_dashboards.py                  | 24 ++++++++++++-------
 .../monitoring/dashboards/atlas-overview.json |  6 ++---
 .../grafana-dashboard-overview.yaml           |  6 ++---
 3 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index 0916969..44a0de1 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -222,18 +222,24 @@ NAMESPACE_CPU_RAW = (
 NAMESPACE_RAM_RAW = (
     'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)'
 )
-NAMESPACE_GPU_RAW = (
+GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
+GPU_NODE_REGEX = "|".join(GPU_NODES)
+NAMESPACE_GPU_REQUEST = (
     'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}'
-    ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})'
-    ') by (namespace)'
+    ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
 )
-NAMESPACE_GPU_WEIGHT = (
-    "(( "
-    + NAMESPACE_GPU_RAW
-    + " ) or on(namespace) ( "
-    + NAMESPACE_CPU_RAW
-    + " * 0))"
+NAMESPACE_GPU_FALLBACK = (
+    'sum by (namespace) (kube_pod_info{namespace!=""}'
+    f' and on(node) kube_node_info{{node=~"{GPU_NODE_REGEX}"}})'
 )
+NAMESPACE_GPU_RAW = (
+    "("
+    + NAMESPACE_GPU_REQUEST
+    + ") or on(namespace) group_left() ("
+    + NAMESPACE_GPU_FALLBACK
+    + ")"
+)
+NAMESPACE_GPU_WEIGHT = NAMESPACE_GPU_RAW
 NAMESPACE_COMBINED_FILTER = (
     'topk(10, ('
     + NAMESPACE_CPU_RAW
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index f6d42c1..11634d9 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -722,7 +722,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
+          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -764,7 +764,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ), 1)",
+          "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -806,7 +806,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
+          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index bf7b710..f243cf8 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -731,7 +731,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
+              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -773,7 +773,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ), 1)",
+              "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -815,7 +815,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
+              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }

From 7009a4f9ff25b908f38c9046a8a15929b436d5ae Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 18 Nov 2025 11:12:03 -0300
Subject: [PATCH 42/71] monitoring: fix namespace gpu share and network stats

---
 scripts/render_dashboards.py                     | 14 ++++++++------
 .../monitoring/dashboards/atlas-network.json     |  4 ++--
 .../monitoring/dashboards/atlas-overview.json    | 16 ++++++++--------
 .../monitoring/grafana-dashboard-network.yaml    |  4 ++--
 .../monitoring/grafana-dashboard-overview.yaml   | 16 ++++++++--------
 5 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index 44a0de1..b53c8c7 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -178,7 +178,9 @@ def namespace_ram_share_expr():
 
 
 def namespace_gpu_share_expr():
-    selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
+    selected = (
+        f"( {NAMESPACE_GPU_RAW} ) + on(namespace) group_left() (0 * {NAMESPACE_COMBINED_FILTER})"
+    )
     total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)"
     return f"100 * ( {selected} ) / {total}"
 
@@ -250,8 +252,8 @@ NAMESPACE_COMBINED_FILTER = (
     + " * 10))"
 )
 TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
-NET_INGRESS_EXPR = 'sum(rate(node_network_receive_bytes_total{device!~"lo"}[5m])) or on() vector(0)'
-NET_EGRESS_EXPR = 'sum(rate(node_network_transmit_bytes_total{device!~"lo"}[5m])) or on() vector(0)'
+NET_INGRESS_EXPR = 'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
+NET_EGRESS_EXPR = 'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
 
 # ---------------------------------------------------------------------------
 # Panel factories
@@ -471,7 +473,7 @@ def build_overview():
             thresholds = {
                 "mode": "absolute",
                 "steps": [
-                    {"color": "red", "value": 0},
+                    {"color": "red", "value": None},
                     {"color": "orange", "value": WORKER_TOTAL - 2},
                     {"color": "yellow", "value": WORKER_TOTAL - 1},
                     {"color": "green", "value": WORKER_TOTAL},
@@ -481,7 +483,7 @@ def build_overview():
             thresholds = {
                 "mode": "absolute",
                 "steps": [
-                    {"color": "red", "value": 0},
+                    {"color": "red", "value": None},
                     {"color": "green", "value": CONTROL_TOTAL},
                 ],
             }
@@ -489,7 +491,7 @@ def build_overview():
             thresholds = {
                 "mode": "absolute",
                 "steps": [
-                    {"color": "green", "value": 0},
+                    {"color": "green", "value": None},
                     {"color": "yellow", "value": 1},
                     {"color": "orange", "value": 2},
                     {"color": "red", "value": 3},
diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json
index 27da627..a4daa0c 100644
--- a/services/monitoring/dashboards/atlas-network.json
+++ b/services/monitoring/dashboards/atlas-network.json
@@ -20,7 +20,7 @@
       },
       "targets": [
         {
-          "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
+          "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
           "refId": "A"
         }
       ],
@@ -80,7 +80,7 @@
       },
       "targets": [
         {
-          "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
+          "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
           "refId": "A"
         }
       ],
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 11634d9..16e01b3 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -38,7 +38,7 @@
             "steps": [
               {
                 "color": "red",
-                "value": 0
+                "value": null
               },
               {
                 "color": "orange",
@@ -107,7 +107,7 @@
             "steps": [
               {
                 "color": "red",
-                "value": 0
+                "value": null
               },
               {
                 "color": "green",
@@ -168,7 +168,7 @@
             "steps": [
               {
                 "color": "green",
-                "value": 0
+                "value": null
               },
               {
                 "color": "yellow",
@@ -243,7 +243,7 @@
             "steps": [
               {
                 "color": "green",
-                "value": 0
+                "value": null
               },
               {
                 "color": "yellow",
@@ -318,7 +318,7 @@
             "steps": [
               {
                 "color": "green",
-                "value": 0
+                "value": null
               },
               {
                 "color": "yellow",
@@ -764,7 +764,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
+          "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) + on(namespace) group_left() (0 * topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10))) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -1016,7 +1016,7 @@
       },
       "targets": [
         {
-          "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
+          "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
           "refId": "A",
           "legendFormat": "Ingress"
         }
@@ -1060,7 +1060,7 @@
       },
       "targets": [
         {
-          "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
+          "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
           "refId": "A",
           "legendFormat": "Egress"
         }
diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml
index 1b70159..029987a 100644
--- a/services/monitoring/grafana-dashboard-network.yaml
+++ b/services/monitoring/grafana-dashboard-network.yaml
@@ -29,7 +29,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
+              "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
               "refId": "A"
             }
           ],
@@ -89,7 +89,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
+              "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
               "refId": "A"
             }
           ],
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index f243cf8..55afe64 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -47,7 +47,7 @@ data:
                 "steps": [
                   {
                     "color": "red",
-                    "value": 0
+                    "value": null
                   },
                   {
                     "color": "orange",
@@ -116,7 +116,7 @@ data:
                 "steps": [
                   {
                     "color": "red",
-                    "value": 0
+                    "value": null
                   },
                   {
                     "color": "green",
@@ -177,7 +177,7 @@ data:
                 "steps": [
                   {
                     "color": "green",
-                    "value": 0
+                    "value": null
                   },
                   {
                     "color": "yellow",
@@ -252,7 +252,7 @@ data:
                 "steps": [
                   {
                     "color": "green",
-                    "value": 0
+                    "value": null
                   },
                   {
                     "color": "yellow",
@@ -327,7 +327,7 @@ data:
                 "steps": [
                   {
                     "color": "green",
-                    "value": 0
+                    "value": null
                   },
                   {
                     "color": "yellow",
@@ -773,7 +773,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
+              "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) + on(namespace) group_left() (0 * topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10))) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -1025,7 +1025,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
+              "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
               "refId": "A",
               "legendFormat": "Ingress"
             }
@@ -1069,7 +1069,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
+              "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
               "refId": "A",
               "legendFormat": "Egress"
             }

From fab5552039b4ee3c4d72e76203c7fe40bac6cee0 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 18 Nov 2025 11:30:33 -0300
Subject: [PATCH 43/71] monitoring: resolve pie errors and network data

---
 scripts/render_dashboards.py                  | 21 ++++++++++++-------
 .../monitoring/dashboards/atlas-network.json  |  4 ++--
 .../monitoring/dashboards/atlas-overview.json | 10 ++++-----
 .../monitoring/grafana-dashboard-network.yaml |  4 ++--
 .../grafana-dashboard-overview.yaml           | 10 ++++-----
 5 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index b53c8c7..8e9bc8a 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -178,9 +178,7 @@ def namespace_ram_share_expr():
 
 
 def namespace_gpu_share_expr():
-    selected = (
-        f"( {NAMESPACE_GPU_RAW} ) + on(namespace) group_left() (0 * {NAMESPACE_COMBINED_FILTER})"
-    )
+    selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
     total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)"
     return f"100 * ( {selected} ) / {total}"
 
@@ -231,13 +229,14 @@ NAMESPACE_GPU_REQUEST = (
     ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
 )
 NAMESPACE_GPU_FALLBACK = (
-    'sum by (namespace) (kube_pod_info{namespace!=""}'
-    f' and on(node) kube_node_info{{node=~"{GPU_NODE_REGEX}"}})'
+    'sum by (namespace) (kube_pod_info{namespace!="",node=~"'
+    + GPU_NODE_REGEX
+    + '"})'
 )
 NAMESPACE_GPU_RAW = (
     "("
     + NAMESPACE_GPU_REQUEST
-    + ") or on(namespace) group_left() ("
+    + ") or on(namespace) ("
     + NAMESPACE_GPU_FALLBACK
     + ")"
 )
@@ -252,8 +251,14 @@ NAMESPACE_COMBINED_FILTER = (
     + " * 10))"
 )
 TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
-NET_INGRESS_EXPR = 'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
-NET_EGRESS_EXPR = 'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
+NET_INGRESS_EXPR = (
+    'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
+    " or on() vector(0)"
+)
+NET_EGRESS_EXPR = (
+    'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
+    " or on() vector(0)"
+)
 
 # ---------------------------------------------------------------------------
 # Panel factories
diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json
index a4daa0c..098e1db 100644
--- a/services/monitoring/dashboards/atlas-network.json
+++ b/services/monitoring/dashboards/atlas-network.json
@@ -20,7 +20,7 @@
       },
       "targets": [
         {
-          "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
+          "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
           "refId": "A"
         }
       ],
@@ -80,7 +80,7 @@
       },
       "targets": [
         {
-          "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
+          "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
           "refId": "A"
         }
       ],
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 16e01b3..5772c2c 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -722,7 +722,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
+          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -764,7 +764,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) + on(namespace) group_left() (0 * topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10))) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
+          "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -806,7 +806,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
+          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -1016,7 +1016,7 @@
       },
       "targets": [
         {
-          "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
+          "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
           "refId": "A",
           "legendFormat": "Ingress"
         }
@@ -1060,7 +1060,7 @@
       },
       "targets": [
         {
-          "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
+          "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
           "refId": "A",
           "legendFormat": "Egress"
         }
diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml
index 029987a..a552793 100644
--- a/services/monitoring/grafana-dashboard-network.yaml
+++ b/services/monitoring/grafana-dashboard-network.yaml
@@ -29,7 +29,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
+              "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
               "refId": "A"
             }
           ],
@@ -89,7 +89,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
+              "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
               "refId": "A"
             }
           ],
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 55afe64..00755a9 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -731,7 +731,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
+              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -773,7 +773,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) + on(namespace) group_left() (0 * topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10))) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
+              "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -815,7 +815,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
+              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -1025,7 +1025,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
+              "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
               "refId": "A",
               "legendFormat": "Ingress"
             }
@@ -1069,7 +1069,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
+              "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
               "refId": "A",
               "legendFormat": "Egress"
             }

From 497164a1ad4b49632414f83e54c89cac97049543 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 18 Nov 2025 11:42:24 -0300
Subject: [PATCH 44/71] monitoring: clean namespace gpu share and layout

---
 scripts/render_dashboards.py                  | 29 ++++---------------
 .../monitoring/dashboards/atlas-overview.json |  6 ++--
 .../grafana-dashboard-overview.yaml           |  6 ++--
 3 files changed, 12 insertions(+), 29 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index 8e9bc8a..bce5bfe 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -224,21 +224,19 @@ NAMESPACE_RAM_RAW = (
 )
 GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
 GPU_NODE_REGEX = "|".join(GPU_NODES)
-NAMESPACE_GPU_REQUEST = (
+NAMESPACE_GPU_ALLOC = (
     'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}'
     ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
 )
-NAMESPACE_GPU_FALLBACK = (
-    'sum by (namespace) (kube_pod_info{namespace!="",node=~"'
-    + GPU_NODE_REGEX
-    + '"})'
+NAMESPACE_GPU_USAGE = (
+    'sum(rate(container_accelerator_duty_cycle{namespace!="",accelerator="nvidia.com/gpu"}[5m])) by (namespace)'
 )
 NAMESPACE_GPU_RAW = (
     "("
-    + NAMESPACE_GPU_REQUEST
+    + NAMESPACE_GPU_USAGE
     + ") or on(namespace) ("
-    + NAMESPACE_GPU_FALLBACK
-    + ")"
+    + NAMESPACE_GPU_ALLOC
+    + " * 0)"
 )
 NAMESPACE_GPU_WEIGHT = NAMESPACE_GPU_RAW
 NAMESPACE_COMBINED_FILTER = (
@@ -711,21 +709,6 @@ def build_overview():
             )
         )
 
-    panels.append(
-        text_panel(
-            25,
-            "About this dashboard",
-            textwrap.dedent(
-                """\
-### Atlas Overview
-- Anonymous users land here; follow the panel links for pod/node/storage/network drill-downs.
-- Control plane workload count flags any non-system pods that slipped onto the HA nodes.
-- Problem and stuck pods use kube-state-metrics so counts and detail tables match exactly."""
-            ),
-            {"h": 5, "w": 24, "x": 0, "y": 55},
-        )
-    )
-
     return {
         "uid": "atlas-overview",
         "title": "Atlas Overview",
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 5772c2c..5953697 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -722,7 +722,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
+          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -764,7 +764,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
+          "expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -806,7 +806,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
+          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 00755a9..f8b40af 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -731,7 +731,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
+              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -773,7 +773,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
+              "expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -815,7 +815,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
+              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }

From 8e6c0a3cfefd5ce7e30030f42ba47b64eb74c6d8 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 18 Nov 2025 12:11:47 -0300
Subject: [PATCH 45/71] monitoring: rework gpu share + gauges

---
 scripts/render_dashboards.py                  |  63 +++++++-
 .../monitoring/dashboards/atlas-overview.json | 144 +++++-------------
 .../grafana-dashboard-overview.yaml           | 144 +++++-------------
 3 files changed, 144 insertions(+), 207 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index bce5bfe..937dfb7 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -238,7 +238,7 @@ NAMESPACE_GPU_RAW = (
     + NAMESPACE_GPU_ALLOC
     + " * 0)"
 )
-NAMESPACE_GPU_WEIGHT = NAMESPACE_GPU_RAW
+NAMESPACE_GPU_WEIGHT = NAMESPACE_GPU_ALLOC
 NAMESPACE_COMBINED_FILTER = (
     'topk(10, ('
     + NAMESPACE_CPU_RAW
@@ -319,6 +319,49 @@ def stat_panel(
     return panel
 
 
+def gauge_panel(
+    panel_id,
+    title,
+    expr,
+    grid,
+    *,
+    min_value=0,
+    max_value=1,
+    thresholds=None,
+    links=None,
+):
+    return {
+        "id": panel_id,
+        "type": "gauge",
+        "title": title,
+        "datasource": PROM_DS,
+        "gridPos": grid,
+        "targets": [{"expr": expr, "refId": "A"}],
+        "fieldConfig": {
+            "defaults": {
+                "min": min_value,
+                "max": max_value,
+                "thresholds": thresholds
+                or {
+                    "mode": "absolute",
+                    "steps": [
+                        {"color": "green", "value": None},
+                        {"color": "red", "value": max_value},
+                    ],
+                },
+            },
+            "overrides": [],
+        },
+        "options": {
+            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
+            "orientation": "auto",
+            "showThresholdMarkers": False,
+            "showThresholdLabels": False,
+        },
+        **({"links": links} if links else {}),
+    }
+
+
 def timeseries_panel(
     panel_id,
     title,
@@ -472,7 +515,10 @@ def build_overview():
     ]
     for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats):
         thresholds = None
+        min_value = 0
+        max_value = ok_value or 5
         if panel_id == 1:
+            max_value = WORKER_TOTAL
             thresholds = {
                 "mode": "absolute",
                 "steps": [
@@ -483,6 +529,7 @@ def build_overview():
                 ],
             }
         elif panel_id == 2:
+            max_value = CONTROL_TOTAL
             thresholds = {
                 "mode": "absolute",
                 "steps": [
@@ -491,6 +538,7 @@ def build_overview():
                 ],
             }
         elif panel_id in (3, 4, 5):
+            max_value = 4
             thresholds = {
                 "mode": "absolute",
                 "steps": [
@@ -500,13 +548,22 @@ def build_overview():
                     {"color": "red", "value": 3},
                 ],
             }
+        else:
+            thresholds = {
+                "mode": "absolute",
+                "steps": [
+                    {"color": "green", "value": None},
+                    {"color": "red", "value": max_value},
+                ],
+            }
         panels.append(
-            stat_panel(
+            gauge_panel(
                 panel_id,
                 title,
                 expr,
                 {"h": 5, "w": 4, "x": 4 * idx, "y": 0},
-                value_suffix=suffix,
+                min_value=min_value,
+                max_value=max_value,
                 thresholds=thresholds,
                 links=links,
             )
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 5953697..ad460bb 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -9,7 +9,7 @@
   "panels": [
     {
       "id": 1,
-      "type": "stat",
+      "type": "gauge",
       "title": "Workers ready",
       "datasource": {
         "type": "prometheus",
@@ -29,10 +29,8 @@
       ],
       "fieldConfig": {
         "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "mappings": [],
+          "min": 0,
+          "max": 18,
           "thresholds": {
             "mode": "absolute",
             "steps": [
@@ -53,19 +51,11 @@
                 "value": 18
               }
             ]
-          },
-          "unit": "none",
-          "custom": {
-            "displayMode": "auto",
-            "valueSuffix": "/18"
           }
         },
         "overrides": []
       },
       "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "justifyMode": "center",
         "reduceOptions": {
           "calcs": [
             "lastNotNull"
@@ -73,12 +63,14 @@
           "fields": "",
           "values": false
         },
-        "textMode": "value"
+        "orientation": "auto",
+        "showThresholdMarkers": false,
+        "showThresholdLabels": false
       }
     },
     {
       "id": 2,
-      "type": "stat",
+      "type": "gauge",
       "title": "Control plane ready",
       "datasource": {
         "type": "prometheus",
@@ -98,10 +90,8 @@
       ],
       "fieldConfig": {
         "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "mappings": [],
+          "min": 0,
+          "max": 3,
           "thresholds": {
             "mode": "absolute",
             "steps": [
@@ -114,19 +104,11 @@
                 "value": 3
               }
             ]
-          },
-          "unit": "none",
-          "custom": {
-            "displayMode": "auto",
-            "valueSuffix": "/3"
           }
         },
         "overrides": []
       },
       "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "justifyMode": "center",
         "reduceOptions": {
           "calcs": [
             "lastNotNull"
@@ -134,12 +116,14 @@
           "fields": "",
           "values": false
         },
-        "textMode": "value"
+        "orientation": "auto",
+        "showThresholdMarkers": false,
+        "showThresholdLabels": false
       }
     },
     {
       "id": 3,
-      "type": "stat",
+      "type": "gauge",
       "title": "Control plane workloads",
       "datasource": {
         "type": "prometheus",
@@ -159,10 +143,8 @@
       ],
       "fieldConfig": {
         "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "mappings": [],
+          "min": 0,
+          "max": 4,
           "thresholds": {
             "mode": "absolute",
             "steps": [
@@ -183,18 +165,11 @@
                 "value": 3
               }
             ]
-          },
-          "unit": "none",
-          "custom": {
-            "displayMode": "auto"
           }
         },
         "overrides": []
       },
       "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "justifyMode": "center",
         "reduceOptions": {
           "calcs": [
             "lastNotNull"
@@ -202,7 +177,9 @@
           "fields": "",
           "values": false
         },
-        "textMode": "value"
+        "orientation": "auto",
+        "showThresholdMarkers": false,
+        "showThresholdLabels": false
       },
       "links": [
         {
@@ -214,7 +191,7 @@
     },
     {
       "id": 4,
-      "type": "stat",
+      "type": "gauge",
       "title": "Problem pods",
       "datasource": {
         "type": "prometheus",
@@ -234,10 +211,8 @@
       ],
       "fieldConfig": {
         "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "mappings": [],
+          "min": 0,
+          "max": 4,
           "thresholds": {
             "mode": "absolute",
             "steps": [
@@ -258,18 +233,11 @@
                 "value": 3
               }
             ]
-          },
-          "unit": "none",
-          "custom": {
-            "displayMode": "auto"
           }
         },
         "overrides": []
       },
       "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "justifyMode": "center",
         "reduceOptions": {
           "calcs": [
             "lastNotNull"
@@ -277,7 +245,9 @@
           "fields": "",
           "values": false
         },
-        "textMode": "value"
+        "orientation": "auto",
+        "showThresholdMarkers": false,
+        "showThresholdLabels": false
       },
       "links": [
         {
@@ -289,7 +259,7 @@
     },
     {
       "id": 5,
-      "type": "stat",
+      "type": "gauge",
       "title": "Stuck terminating",
       "datasource": {
         "type": "prometheus",
@@ -309,10 +279,8 @@
       ],
       "fieldConfig": {
         "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "mappings": [],
+          "min": 0,
+          "max": 4,
           "thresholds": {
             "mode": "absolute",
             "steps": [
@@ -333,18 +301,11 @@
                 "value": 3
               }
             ]
-          },
-          "unit": "none",
-          "custom": {
-            "displayMode": "auto"
           }
         },
         "overrides": []
       },
       "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "justifyMode": "center",
         "reduceOptions": {
           "calcs": [
             "lastNotNull"
@@ -352,7 +313,9 @@
           "fields": "",
           "values": false
         },
-        "textMode": "value"
+        "orientation": "auto",
+        "showThresholdMarkers": false,
+        "showThresholdLabels": false
       },
       "links": [
         {
@@ -364,7 +327,7 @@
     },
     {
       "id": 6,
-      "type": "stat",
+      "type": "gauge",
       "title": "Running pods",
       "datasource": {
         "type": "prometheus",
@@ -384,34 +347,25 @@
       ],
       "fieldConfig": {
         "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "mappings": [],
+          "min": 0,
+          "max": 5,
           "thresholds": {
             "mode": "absolute",
             "steps": [
               {
-                "color": "rgba(115, 115, 115, 1)",
+                "color": "green",
                 "value": null
               },
               {
-                "color": "green",
-                "value": 1
+                "color": "red",
+                "value": 5
               }
             ]
-          },
-          "unit": "none",
-          "custom": {
-            "displayMode": "auto"
           }
         },
         "overrides": []
       },
       "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "justifyMode": "center",
         "reduceOptions": {
           "calcs": [
             "lastNotNull"
@@ -419,7 +373,9 @@
           "fields": "",
           "values": false
         },
-        "textMode": "value"
+        "orientation": "auto",
+        "showThresholdMarkers": false,
+        "showThresholdLabels": false
       }
     },
     {
@@ -722,7 +678,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
+          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -764,7 +720,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)",
+          "expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -806,7 +762,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
+          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -1480,22 +1436,6 @@
           "targetBlank": true
         }
       ]
-    },
-    {
-      "id": 25,
-      "type": "text",
-      "title": "About this dashboard",
-      "gridPos": {
-        "h": 5,
-        "w": 24,
-        "x": 0,
-        "y": 55
-      },
-      "datasource": null,
-      "options": {
-        "mode": "markdown",
-        "content": "### Atlas Overview\n- Anonymous users land here; follow the panel links for pod/node/storage/network drill-downs.\n- Control plane workload count flags any non-system pods that slipped onto the HA nodes.\n- Problem and stuck pods use kube-state-metrics so counts and detail tables match exactly."
-      }
     }
   ],
   "schemaVersion": 39,
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index f8b40af..6503da9 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -18,7 +18,7 @@ data:
       "panels": [
         {
           "id": 1,
-          "type": "stat",
+          "type": "gauge",
           "title": "Workers ready",
           "datasource": {
             "type": "prometheus",
@@ -38,10 +38,8 @@ data:
           ],
           "fieldConfig": {
             "defaults": {
-              "color": {
-                "mode": "palette-classic"
-              },
-              "mappings": [],
+              "min": 0,
+              "max": 18,
               "thresholds": {
                 "mode": "absolute",
                 "steps": [
@@ -62,19 +60,11 @@ data:
                     "value": 18
                   }
                 ]
-              },
-              "unit": "none",
-              "custom": {
-                "displayMode": "auto",
-                "valueSuffix": "/18"
               }
             },
             "overrides": []
           },
           "options": {
-            "colorMode": "value",
-            "graphMode": "area",
-            "justifyMode": "center",
             "reduceOptions": {
               "calcs": [
                 "lastNotNull"
@@ -82,12 +72,14 @@ data:
               "fields": "",
               "values": false
             },
-            "textMode": "value"
+            "orientation": "auto",
+            "showThresholdMarkers": false,
+            "showThresholdLabels": false
           }
         },
         {
           "id": 2,
-          "type": "stat",
+          "type": "gauge",
           "title": "Control plane ready",
           "datasource": {
             "type": "prometheus",
@@ -107,10 +99,8 @@ data:
           ],
           "fieldConfig": {
             "defaults": {
-              "color": {
-                "mode": "palette-classic"
-              },
-              "mappings": [],
+              "min": 0,
+              "max": 3,
               "thresholds": {
                 "mode": "absolute",
                 "steps": [
@@ -123,19 +113,11 @@ data:
                     "value": 3
                   }
                 ]
-              },
-              "unit": "none",
-              "custom": {
-                "displayMode": "auto",
-                "valueSuffix": "/3"
               }
             },
             "overrides": []
           },
           "options": {
-            "colorMode": "value",
-            "graphMode": "area",
-            "justifyMode": "center",
             "reduceOptions": {
               "calcs": [
                 "lastNotNull"
@@ -143,12 +125,14 @@ data:
               "fields": "",
               "values": false
             },
-            "textMode": "value"
+            "orientation": "auto",
+            "showThresholdMarkers": false,
+            "showThresholdLabels": false
           }
         },
         {
           "id": 3,
-          "type": "stat",
+          "type": "gauge",
           "title": "Control plane workloads",
           "datasource": {
             "type": "prometheus",
@@ -168,10 +152,8 @@ data:
           ],
           "fieldConfig": {
             "defaults": {
-              "color": {
-                "mode": "palette-classic"
-              },
-              "mappings": [],
+              "min": 0,
+              "max": 4,
               "thresholds": {
                 "mode": "absolute",
                 "steps": [
@@ -192,18 +174,11 @@ data:
                     "value": 3
                   }
                 ]
-              },
-              "unit": "none",
-              "custom": {
-                "displayMode": "auto"
               }
             },
             "overrides": []
           },
           "options": {
-            "colorMode": "value",
-            "graphMode": "area",
-            "justifyMode": "center",
             "reduceOptions": {
               "calcs": [
                 "lastNotNull"
@@ -211,7 +186,9 @@ data:
               "fields": "",
               "values": false
             },
-            "textMode": "value"
+            "orientation": "auto",
+            "showThresholdMarkers": false,
+            "showThresholdLabels": false
           },
           "links": [
             {
@@ -223,7 +200,7 @@ data:
         },
         {
           "id": 4,
-          "type": "stat",
+          "type": "gauge",
           "title": "Problem pods",
           "datasource": {
             "type": "prometheus",
@@ -243,10 +220,8 @@ data:
           ],
           "fieldConfig": {
             "defaults": {
-              "color": {
-                "mode": "palette-classic"
-              },
-              "mappings": [],
+              "min": 0,
+              "max": 4,
               "thresholds": {
                 "mode": "absolute",
                 "steps": [
@@ -267,18 +242,11 @@ data:
                     "value": 3
                   }
                 ]
-              },
-              "unit": "none",
-              "custom": {
-                "displayMode": "auto"
               }
             },
             "overrides": []
           },
           "options": {
-            "colorMode": "value",
-            "graphMode": "area",
-            "justifyMode": "center",
             "reduceOptions": {
               "calcs": [
                 "lastNotNull"
@@ -286,7 +254,9 @@ data:
               "fields": "",
               "values": false
             },
-            "textMode": "value"
+            "orientation": "auto",
+            "showThresholdMarkers": false,
+            "showThresholdLabels": false
           },
           "links": [
             {
@@ -298,7 +268,7 @@ data:
         },
         {
           "id": 5,
-          "type": "stat",
+          "type": "gauge",
           "title": "Stuck terminating",
           "datasource": {
             "type": "prometheus",
@@ -318,10 +288,8 @@ data:
           ],
           "fieldConfig": {
             "defaults": {
-              "color": {
-                "mode": "palette-classic"
-              },
-              "mappings": [],
+              "min": 0,
+              "max": 4,
               "thresholds": {
                 "mode": "absolute",
                 "steps": [
@@ -342,18 +310,11 @@ data:
                     "value": 3
                   }
                 ]
-              },
-              "unit": "none",
-              "custom": {
-                "displayMode": "auto"
               }
             },
             "overrides": []
           },
           "options": {
-            "colorMode": "value",
-            "graphMode": "area",
-            "justifyMode": "center",
             "reduceOptions": {
               "calcs": [
                 "lastNotNull"
@@ -361,7 +322,9 @@ data:
               "fields": "",
               "values": false
             },
-            "textMode": "value"
+            "orientation": "auto",
+            "showThresholdMarkers": false,
+            "showThresholdLabels": false
           },
           "links": [
             {
@@ -373,7 +336,7 @@ data:
         },
         {
           "id": 6,
-          "type": "stat",
+          "type": "gauge",
           "title": "Running pods",
           "datasource": {
             "type": "prometheus",
@@ -393,34 +356,25 @@ data:
           ],
           "fieldConfig": {
             "defaults": {
-              "color": {
-                "mode": "palette-classic"
-              },
-              "mappings": [],
+              "min": 0,
+              "max": 5,
               "thresholds": {
                 "mode": "absolute",
                 "steps": [
                   {
-                    "color": "rgba(115, 115, 115, 1)",
+                    "color": "green",
                     "value": null
                   },
                   {
-                    "color": "green",
-                    "value": 1
+                    "color": "red",
+                    "value": 5
                   }
                 ]
-              },
-              "unit": "none",
-              "custom": {
-                "displayMode": "auto"
               }
             },
             "overrides": []
           },
           "options": {
-            "colorMode": "value",
-            "graphMode": "area",
-            "justifyMode": "center",
             "reduceOptions": {
               "calcs": [
                 "lastNotNull"
@@ -428,7 +382,9 @@ data:
               "fields": "",
               "values": false
             },
-            "textMode": "value"
+            "orientation": "auto",
+            "showThresholdMarkers": false,
+            "showThresholdLabels": false
           }
         },
         {
@@ -731,7 +687,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
+              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -773,7 +729,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)",
+              "expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -815,7 +771,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
+              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -1489,22 +1445,6 @@ data:
               "targetBlank": true
             }
           ]
-        },
-        {
-          "id": 25,
-          "type": "text",
-          "title": "About this dashboard",
-          "gridPos": {
-            "h": 5,
-            "w": 24,
-            "x": 0,
-            "y": 55
-          },
-          "datasource": null,
-          "options": {
-            "mode": "markdown",
-            "content": "### Atlas Overview\n- Anonymous users land here; follow the panel links for pod/node/storage/network drill-downs.\n- Control plane workload count flags any non-system pods that slipped onto the HA nodes.\n- Problem and stuck pods use kube-state-metrics so counts and detail tables match exactly."
-          }
         }
       ],
       "schemaVersion": 39,

From ff056551c7d6fb7cc64dfed73680f8b7a13421e5 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 18 Nov 2025 14:08:33 -0300
Subject: [PATCH 46/71] monitoring: refresh overview dashboards

---
 scripts/render_dashboards.py                  |  182 +--
 .../monitoring/dashboards/atlas-network.json  |   86 +-
 .../monitoring/dashboards/atlas-overview.json | 1150 +++++++++--------
 services/monitoring/dcgm-exporter.yaml        |   74 ++
 .../monitoring/grafana-dashboard-network.yaml |   86 +-
 .../grafana-dashboard-overview.yaml           | 1150 +++++++++--------
 services/monitoring/kustomization.yaml        |    1 +
 7 files changed, 1511 insertions(+), 1218 deletions(-)
 create mode 100644 services/monitoring/dcgm-exporter.yaml

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index 937dfb7..273090a 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -165,22 +165,22 @@ def node_io_expr(scope=""):
     return scoped_node_expr(base, scope)
 
 
-def namespace_cpu_share_expr():
-    selected = f"( {NAMESPACE_CPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
-    total = f"clamp_min(sum( {NAMESPACE_CPU_RAW} ), 1)"
+def namespace_share_expr(resource_expr):
+    selected = f"( {resource_expr} ) and on(namespace) ( {NAMESPACE_TOP_FILTER} )"
+    total = f"clamp_min(sum( {resource_expr} ), 1)"
     return f"100 * ( {selected} ) / {total}"
 
 
+def namespace_cpu_share_expr():
+    return namespace_share_expr(NAMESPACE_CPU_RAW)
+
+
 def namespace_ram_share_expr():
-    selected = f"( {NAMESPACE_RAM_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
-    total = f"clamp_min(sum( {NAMESPACE_RAM_RAW} ), 1)"
-    return f"100 * ( {selected} ) / {total}"
+    return namespace_share_expr(NAMESPACE_RAM_RAW)
 
 
 def namespace_gpu_share_expr():
-    selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
-    total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)"
-    return f"100 * ( {selected} ) / {total}"
+    return namespace_share_expr(NAMESPACE_GPU_RAW)
 
 
 PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
@@ -228,35 +228,47 @@ NAMESPACE_GPU_ALLOC = (
     'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}'
     ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
 )
-NAMESPACE_GPU_USAGE = (
-    'sum(rate(container_accelerator_duty_cycle{namespace!="",accelerator="nvidia.com/gpu"}[5m])) by (namespace)'
-)
+NAMESPACE_GPU_USAGE = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)'
 NAMESPACE_GPU_RAW = (
     "("
     + NAMESPACE_GPU_USAGE
     + ") or on(namespace) ("
-    + NAMESPACE_GPU_ALLOC
+    + NAMESPACE_CPU_RAW
     + " * 0)"
 )
-NAMESPACE_GPU_WEIGHT = NAMESPACE_GPU_ALLOC
-NAMESPACE_COMBINED_FILTER = (
-    'topk(10, ('
+NAMESPACE_GPU_WEIGHT = (
+    "("
+    + NAMESPACE_GPU_ALLOC
+    + ") or on(namespace) ("
     + NAMESPACE_CPU_RAW
-    + ") + ("
-    + NAMESPACE_RAM_RAW
-    + ' / 1e9) + ('
-    + NAMESPACE_GPU_WEIGHT
-    + " * 10))"
+    + " * 0)"
 )
+NAMESPACE_ACTIVITY_SCORE = (
+    "( "
+    + NAMESPACE_CPU_RAW
+    + " ) + ("
+    + NAMESPACE_RAM_RAW
+    + " / 1e9) + ("
+    + NAMESPACE_GPU_WEIGHT
+    + " * 100)"
+)
+NAMESPACE_TOP_FILTER = "(topk(10, " + NAMESPACE_ACTIVITY_SCORE + ") >= bool 0)"
 TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
-NET_INGRESS_EXPR = (
-    'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
+TRAEFIK_NET_INGRESS = (
+    'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
     " or on() vector(0)"
 )
-NET_EGRESS_EXPR = (
+TRAEFIK_NET_EGRESS = (
+    'sum(rate(container_network_transmit_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
+    " or on() vector(0)"
+)
+NET_TOTAL_EXPR = (
     'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
     " or on() vector(0)"
 )
+NET_INGRESS_EXPR = TRAEFIK_NET_INGRESS
+NET_EGRESS_EXPR = TRAEFIK_NET_EGRESS
+NET_INTERNAL_EXPR = f"clamp_min(({NET_TOTAL_EXPR}) - ({TRAEFIK_NET_EGRESS}), 0)"
 
 # ---------------------------------------------------------------------------
 # Panel factories
@@ -438,10 +450,20 @@ def pie_panel(panel_id, title, expr, grid):
         "datasource": PROM_DS,
         "gridPos": grid,
         "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}],
-        "fieldConfig": {"defaults": {"unit": "percent"}, "overrides": []},
+        "fieldConfig": {
+            "defaults": {
+                "unit": "percent",
+                "color": {"mode": "palette-classic"},
+            },
+            "overrides": [],
+        },
         "options": {
             "legend": {"displayMode": "list", "placement": "right"},
             "pieType": "pie",
+            "displayLabels": ["percent"],
+            "tooltip": {"mode": "single"},
+            "colorScheme": "interpolateSpectral",
+            "colorBy": "value",
             "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
         },
     }
@@ -511,7 +533,6 @@ def build_overview():
             1,
             link_to("atlas-pods"),
         ),
-        (6, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})', None, None, None),
     ]
     for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats):
         thresholds = None
@@ -591,12 +612,31 @@ def build_overview():
             )
         )
 
+    storage_panels = [
+        (23, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"),
+        (24, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"),
+        (25, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"),
+        (26, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"),
+    ]
+    for idx, (panel_id, title, expr, unit) in enumerate(storage_panels):
+        panels.append(
+            stat_panel(
+                panel_id,
+                title,
+                expr,
+                {"h": 6, "w": 6, "x": 6 * idx, "y": 10},
+                unit=unit,
+                thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
+                links=link_to("atlas-storage"),
+            )
+        )
+
     panels.append(
         pie_panel(
             11,
             "Namespace CPU share",
             namespace_cpu_share_expr(),
-            {"h": 9, "w": 8, "x": 0, "y": 10},
+            {"h": 9, "w": 8, "x": 0, "y": 16},
         )
     )
     panels.append(
@@ -604,7 +644,7 @@ def build_overview():
             12,
             "Namespace GPU share",
             namespace_gpu_share_expr(),
-            {"h": 9, "w": 8, "x": 8, "y": 10},
+            {"h": 9, "w": 8, "x": 8, "y": 16},
         )
     )
     panels.append(
@@ -612,7 +652,7 @@ def build_overview():
             13,
             "Namespace RAM share",
             namespace_ram_share_expr(),
-            {"h": 9, "w": 8, "x": 16, "y": 10},
+            {"h": 9, "w": 8, "x": 16, "y": 16},
         )
     )
 
@@ -622,7 +662,7 @@ def build_overview():
             14,
             "Worker node CPU",
             node_cpu_expr(worker_filter),
-            {"h": 8, "w": 12, "x": 0, "y": 19},
+            {"h": 8, "w": 12, "x": 0, "y": 25},
             unit="percent",
             legend="{{node}}",
             legend_calcs=["last"],
@@ -636,7 +676,7 @@ def build_overview():
             15,
             "Worker node RAM",
             node_mem_expr(worker_filter),
-            {"h": 8, "w": 12, "x": 12, "y": 19},
+            {"h": 8, "w": 12, "x": 12, "y": 25},
             unit="percent",
             legend="{{node}}",
             legend_calcs=["last"],
@@ -651,7 +691,7 @@ def build_overview():
             16,
             "Control plane CPU",
             node_cpu_expr(CONTROL_REGEX),
-            {"h": 7, "w": 12, "x": 0, "y": 27},
+            {"h": 7, "w": 12, "x": 0, "y": 33},
             unit="percent",
             legend="{{node}}",
             legend_display="table",
@@ -663,7 +703,7 @@ def build_overview():
             17,
             "Control plane RAM",
             node_mem_expr(CONTROL_REGEX),
-            {"h": 7, "w": 12, "x": 12, "y": 27},
+            {"h": 7, "w": 12, "x": 12, "y": 33},
             unit="percent",
             legend="{{node}}",
             legend_display="table",
@@ -676,9 +716,9 @@ def build_overview():
             18,
             "Cluster ingress throughput",
             NET_INGRESS_EXPR,
-            {"h": 7, "w": 12, "x": 0, "y": 34},
+            {"h": 7, "w": 8, "x": 0, "y": 40},
             unit="Bps",
-            legend="Ingress",
+            legend="Ingress (Traefik)",
             legend_display="list",
             legend_placement="bottom",
             links=link_to("atlas-network"),
@@ -689,9 +729,22 @@ def build_overview():
             19,
             "Cluster egress throughput",
             NET_EGRESS_EXPR,
-            {"h": 7, "w": 12, "x": 12, "y": 34},
+            {"h": 7, "w": 8, "x": 8, "y": 40},
             unit="Bps",
-            legend="Egress",
+            legend="Egress (Traefik)",
+            legend_display="list",
+            legend_placement="bottom",
+            links=link_to("atlas-network"),
+        )
+    )
+    panels.append(
+        timeseries_panel(
+            20,
+            "Intra-cluster throughput",
+            NET_INTERNAL_EXPR,
+            {"h": 7, "w": 8, "x": 16, "y": 40},
+            unit="Bps",
+            legend="Internal traffic",
             legend_display="list",
             legend_placement="bottom",
             links=link_to("atlas-network"),
@@ -700,10 +753,10 @@ def build_overview():
 
     panels.append(
         timeseries_panel(
-            20,
+            21,
             "Root filesystem usage",
             root_usage_expr(),
-            {"h": 8, "w": 12, "x": 0, "y": 41},
+            {"h": 8, "w": 12, "x": 0, "y": 47},
             unit="percent",
             legend="{{node}}",
             legend_calcs=["last"],
@@ -715,11 +768,11 @@ def build_overview():
     )
     panels.append(
         {
-            "id": 21,
+            "id": 22,
             "type": "bargauge",
             "title": "Nodes closest to full root disks",
             "datasource": PROM_DS,
-            "gridPos": {"h": 8, "w": 12, "x": 12, "y": 41},
+            "gridPos": {"h": 8, "w": 12, "x": 12, "y": 47},
             "targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A", "legendFormat": "{{node}}"}],
             "fieldConfig": {
                 "defaults": {
@@ -744,28 +797,10 @@ def build_overview():
                 "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
             },
             "links": link_to("atlas-storage"),
+            "transformations": [{"id": "labelsToFields", "options": {}}],
         }
     )
 
-    storage_panels = [
-        (21, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"),
-        (22, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"),
-        (23, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"),
-        (24, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"),
-    ]
-    for idx, (panel_id, title, expr, unit) in enumerate(storage_panels):
-        panels.append(
-            stat_panel(
-                panel_id,
-                title,
-                expr,
-                {"h": 6, "w": 6, "x": 6 * idx, "y": 49},
-                unit=unit,
-                thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
-                links=link_to("atlas-storage"),
-            )
-        )
-
     return {
         "uid": "atlas-overview",
         "title": "Atlas Overview",
@@ -1110,12 +1145,15 @@ def build_network_dashboard():
     panels.append(
         stat_panel(2, "Egress traffic", NET_EGRESS_EXPR, {"h": 4, "w": 8, "x": 8, "y": 0}, unit="Bps")
     )
+    panels.append(
+        stat_panel(3, "Intra-cluster traffic", NET_INTERNAL_EXPR, {"h": 4, "w": 8, "x": 16, "y": 0}, unit="Bps")
+    )
     panels.append(
         stat_panel(
-            3,
+            4,
             "Top router req/s",
             f"topk(1, {TRAEFIK_ROUTER_EXPR})",
-            {"h": 4, "w": 8, "x": 16, "y": 0},
+            {"h": 4, "w": 8, "x": 0, "y": 4},
             unit="req/s",
             legend="{{router}}",
             instant=True,
@@ -1123,10 +1161,10 @@ def build_network_dashboard():
     )
     panels.append(
         timeseries_panel(
-            4,
+            5,
             "Per-node throughput",
             node_net_expr(),
-            {"h": 8, "w": 24, "x": 0, "y": 4},
+            {"h": 8, "w": 24, "x": 0, "y": 8},
             unit="Bps",
             legend="{{node}}",
             legend_display="table",
@@ -1135,32 +1173,32 @@ def build_network_dashboard():
     )
     panels.append(
         table_panel(
-            5,
+            6,
             "Top namespaces",
             'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) '
             '+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))',
-            {"h": 9, "w": 12, "x": 0, "y": 12},
+            {"h": 9, "w": 12, "x": 0, "y": 16},
             unit="Bps",
             transformations=[{"id": "labelsToFields", "options": {}}],
         )
     )
     panels.append(
         table_panel(
-            6,
+            7,
             "Top pods",
             'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) '
             '+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))',
-            {"h": 9, "w": 12, "x": 12, "y": 12},
+            {"h": 9, "w": 12, "x": 12, "y": 16},
             unit="Bps",
             transformations=[{"id": "labelsToFields", "options": {}}],
         )
     )
     panels.append(
         timeseries_panel(
-            7,
+            8,
             "Traefik routers (req/s)",
             f"topk(10, {TRAEFIK_ROUTER_EXPR})",
-            {"h": 9, "w": 12, "x": 0, "y": 21},
+            {"h": 9, "w": 12, "x": 0, "y": 25},
             unit="req/s",
             legend="{{router}}",
             legend_display="table",
@@ -1169,10 +1207,10 @@ def build_network_dashboard():
     )
     panels.append(
         timeseries_panel(
-            8,
+            9,
             "Traefik entrypoints (req/s)",
             'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))',
-            {"h": 9, "w": 12, "x": 12, "y": 21},
+            {"h": 9, "w": 12, "x": 12, "y": 25},
             unit="req/s",
             legend="{{entrypoint}}",
             legend_display="table",
diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json
index 098e1db..1baec3a 100644
--- a/services/monitoring/dashboards/atlas-network.json
+++ b/services/monitoring/dashboards/atlas-network.json
@@ -20,7 +20,7 @@
       },
       "targets": [
         {
-          "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
+          "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
           "refId": "A"
         }
       ],
@@ -80,7 +80,7 @@
       },
       "targets": [
         {
-          "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
+          "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
           "refId": "A"
         }
       ],
@@ -127,7 +127,7 @@
     {
       "id": 3,
       "type": "stat",
-      "title": "Top router req/s",
+      "title": "Intra-cluster traffic",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -138,6 +138,66 @@
         "x": 16,
         "y": 0
       },
+      "targets": [
+        {
+          "expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "Bps",
+          "custom": {
+            "displayMode": "auto"
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 4,
+      "type": "stat",
+      "title": "Top router req/s",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 8,
+        "x": 0,
+        "y": 4
+      },
       "targets": [
         {
           "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
@@ -187,7 +247,7 @@
       }
     },
     {
-      "id": 4,
+      "id": 5,
       "type": "timeseries",
       "title": "Per-node throughput",
       "datasource": {
@@ -198,7 +258,7 @@
         "h": 8,
         "w": 24,
         "x": 0,
-        "y": 4
+        "y": 8
       },
       "targets": [
         {
@@ -224,7 +284,7 @@
       }
     },
     {
-      "id": 5,
+      "id": 6,
       "type": "table",
       "title": "Top namespaces",
       "datasource": {
@@ -235,7 +295,7 @@
         "h": 9,
         "w": 12,
         "x": 0,
-        "y": 12
+        "y": 16
       },
       "targets": [
         {
@@ -260,7 +320,7 @@
       ]
     },
     {
-      "id": 6,
+      "id": 7,
       "type": "table",
       "title": "Top pods",
       "datasource": {
@@ -271,7 +331,7 @@
         "h": 9,
         "w": 12,
         "x": 12,
-        "y": 12
+        "y": 16
       },
       "targets": [
         {
@@ -296,7 +356,7 @@
       ]
     },
     {
-      "id": 7,
+      "id": 8,
       "type": "timeseries",
       "title": "Traefik routers (req/s)",
       "datasource": {
@@ -307,7 +367,7 @@
         "h": 9,
         "w": 12,
         "x": 0,
-        "y": 21
+        "y": 25
       },
       "targets": [
         {
@@ -333,7 +393,7 @@
       }
     },
     {
-      "id": 8,
+      "id": 9,
       "type": "timeseries",
       "title": "Traefik entrypoints (req/s)",
       "datasource": {
@@ -344,7 +404,7 @@
         "h": 9,
         "w": 12,
         "x": 12,
-        "y": 21
+        "y": 25
       },
       "targets": [
         {
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index ad460bb..eba6466 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -325,59 +325,6 @@
         }
       ]
     },
-    {
-      "id": 6,
-      "type": "gauge",
-      "title": "Running pods",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 5,
-        "w": 4,
-        "x": 20,
-        "y": 0
-      },
-      "targets": [
-        {
-          "expr": "sum(kube_pod_status_phase{phase=\"Running\"})",
-          "refId": "A"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "min": 0,
-          "max": 5,
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 5
-              }
-            ]
-          }
-        },
-        "overrides": []
-      },
-      "options": {
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "orientation": "auto",
-        "showThresholdMarkers": false,
-        "showThresholdLabels": false
-      }
-    },
     {
       "id": 7,
       "type": "stat",
@@ -663,506 +610,7 @@
       ]
     },
     {
-      "id": 11,
-      "type": "piechart",
-      "title": "Namespace CPU share",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 9,
-        "w": 8,
-        "x": 0,
-        "y": 10
-      },
-      "targets": [
-        {
-          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
-          "refId": "A",
-          "legendFormat": "{{namespace}}"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "unit": "percent"
-        },
-        "overrides": []
-      },
-      "options": {
-        "legend": {
-          "displayMode": "list",
-          "placement": "right"
-        },
-        "pieType": "pie",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        }
-      }
-    },
-    {
-      "id": 12,
-      "type": "piechart",
-      "title": "Namespace GPU share",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 9,
-        "w": 8,
-        "x": 8,
-        "y": 10
-      },
-      "targets": [
-        {
-          "expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)",
-          "refId": "A",
-          "legendFormat": "{{namespace}}"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "unit": "percent"
-        },
-        "overrides": []
-      },
-      "options": {
-        "legend": {
-          "displayMode": "list",
-          "placement": "right"
-        },
-        "pieType": "pie",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        }
-      }
-    },
-    {
-      "id": 13,
-      "type": "piechart",
-      "title": "Namespace RAM share",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 9,
-        "w": 8,
-        "x": 16,
-        "y": 10
-      },
-      "targets": [
-        {
-          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
-          "refId": "A",
-          "legendFormat": "{{namespace}}"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "unit": "percent"
-        },
-        "overrides": []
-      },
-      "options": {
-        "legend": {
-          "displayMode": "list",
-          "placement": "right"
-        },
-        "pieType": "pie",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        }
-      }
-    },
-    {
-      "id": 14,
-      "type": "timeseries",
-      "title": "Worker node CPU",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 8,
-        "w": 12,
-        "x": 0,
-        "y": 19
-      },
-      "targets": [
-        {
-          "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
-          "refId": "A",
-          "legendFormat": "{{node}}"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "unit": "percent"
-        },
-        "overrides": []
-      },
-      "options": {
-        "legend": {
-          "displayMode": "table",
-          "placement": "right",
-          "calcs": [
-            "last"
-          ]
-        },
-        "tooltip": {
-          "mode": "multi"
-        }
-      },
-      "links": [
-        {
-          "title": "Open atlas-nodes dashboard",
-          "url": "/d/atlas-nodes",
-          "targetBlank": true
-        }
-      ]
-    },
-    {
-      "id": 15,
-      "type": "timeseries",
-      "title": "Worker node RAM",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 8,
-        "w": 12,
-        "x": 12,
-        "y": 19
-      },
-      "targets": [
-        {
-          "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
-          "refId": "A",
-          "legendFormat": "{{node}}"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "unit": "percent"
-        },
-        "overrides": []
-      },
-      "options": {
-        "legend": {
-          "displayMode": "table",
-          "placement": "right",
-          "calcs": [
-            "last"
-          ]
-        },
-        "tooltip": {
-          "mode": "multi"
-        }
-      },
-      "links": [
-        {
-          "title": "Open atlas-nodes dashboard",
-          "url": "/d/atlas-nodes",
-          "targetBlank": true
-        }
-      ]
-    },
-    {
-      "id": 16,
-      "type": "timeseries",
-      "title": "Control plane CPU",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 12,
-        "x": 0,
-        "y": 27
-      },
-      "targets": [
-        {
-          "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
-          "refId": "A",
-          "legendFormat": "{{node}}"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "unit": "percent"
-        },
-        "overrides": []
-      },
-      "options": {
-        "legend": {
-          "displayMode": "table",
-          "placement": "right"
-        },
-        "tooltip": {
-          "mode": "multi"
-        }
-      }
-    },
-    {
-      "id": 17,
-      "type": "timeseries",
-      "title": "Control plane RAM",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 12,
-        "x": 12,
-        "y": 27
-      },
-      "targets": [
-        {
-          "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
-          "refId": "A",
-          "legendFormat": "{{node}}"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "unit": "percent"
-        },
-        "overrides": []
-      },
-      "options": {
-        "legend": {
-          "displayMode": "table",
-          "placement": "right"
-        },
-        "tooltip": {
-          "mode": "multi"
-        }
-      }
-    },
-    {
-      "id": 18,
-      "type": "timeseries",
-      "title": "Cluster ingress throughput",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 12,
-        "x": 0,
-        "y": 34
-      },
-      "targets": [
-        {
-          "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
-          "refId": "A",
-          "legendFormat": "Ingress"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "unit": "Bps"
-        },
-        "overrides": []
-      },
-      "options": {
-        "legend": {
-          "displayMode": "list",
-          "placement": "bottom"
-        },
-        "tooltip": {
-          "mode": "multi"
-        }
-      },
-      "links": [
-        {
-          "title": "Open atlas-network dashboard",
-          "url": "/d/atlas-network",
-          "targetBlank": true
-        }
-      ]
-    },
-    {
-      "id": 19,
-      "type": "timeseries",
-      "title": "Cluster egress throughput",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 12,
-        "x": 12,
-        "y": 34
-      },
-      "targets": [
-        {
-          "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
-          "refId": "A",
-          "legendFormat": "Egress"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "unit": "Bps"
-        },
-        "overrides": []
-      },
-      "options": {
-        "legend": {
-          "displayMode": "list",
-          "placement": "bottom"
-        },
-        "tooltip": {
-          "mode": "multi"
-        }
-      },
-      "links": [
-        {
-          "title": "Open atlas-network dashboard",
-          "url": "/d/atlas-network",
-          "targetBlank": true
-        }
-      ]
-    },
-    {
-      "id": 20,
-      "type": "timeseries",
-      "title": "Root filesystem usage",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 8,
-        "w": 12,
-        "x": 0,
-        "y": 41
-      },
-      "targets": [
-        {
-          "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
-          "refId": "A",
-          "legendFormat": "{{node}}"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "unit": "percent"
-        },
-        "overrides": []
-      },
-      "options": {
-        "legend": {
-          "displayMode": "table",
-          "placement": "right",
-          "calcs": [
-            "last"
-          ]
-        },
-        "tooltip": {
-          "mode": "multi"
-        }
-      },
-      "timeFrom": "30d",
-      "links": [
-        {
-          "title": "Open atlas-storage dashboard",
-          "url": "/d/atlas-storage",
-          "targetBlank": true
-        }
-      ]
-    },
-    {
-      "id": 21,
-      "type": "bargauge",
-      "title": "Nodes closest to full root disks",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 8,
-        "w": 12,
-        "x": 12,
-        "y": 41
-      },
-      "targets": [
-        {
-          "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
-          "refId": "A",
-          "legendFormat": "{{node}}"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "unit": "percent",
-          "min": 0,
-          "max": 100,
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "yellow",
-                "value": 50
-              },
-              {
-                "color": "orange",
-                "value": 70
-              },
-              {
-                "color": "red",
-                "value": 85
-              }
-            ]
-          }
-        },
-        "overrides": []
-      },
-      "options": {
-        "displayMode": "gradient",
-        "orientation": "horizontal",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        }
-      },
-      "links": [
-        {
-          "title": "Open atlas-storage dashboard",
-          "url": "/d/atlas-storage",
-          "targetBlank": true
-        }
-      ]
-    },
-    {
-      "id": 21,
+      "id": 23,
       "type": "stat",
       "title": "Astreae usage",
       "datasource": {
@@ -1173,7 +621,7 @@
         "h": 6,
         "w": 6,
         "x": 0,
-        "y": 49
+        "y": 10
       },
       "targets": [
         {
@@ -1233,7 +681,7 @@
       ]
     },
     {
-      "id": 22,
+      "id": 24,
       "type": "stat",
       "title": "Asteria usage",
       "datasource": {
@@ -1244,7 +692,7 @@
         "h": 6,
         "w": 6,
         "x": 6,
-        "y": 49
+        "y": 10
       },
       "targets": [
         {
@@ -1304,7 +752,7 @@
       ]
     },
     {
-      "id": 23,
+      "id": 25,
       "type": "stat",
       "title": "Astreae free",
       "datasource": {
@@ -1315,7 +763,7 @@
         "h": 6,
         "w": 6,
         "x": 12,
-        "y": 49
+        "y": 10
       },
       "targets": [
         {
@@ -1371,7 +819,7 @@
       ]
     },
     {
-      "id": 24,
+      "id": 26,
       "type": "stat",
       "title": "Asteria free",
       "datasource": {
@@ -1382,7 +830,7 @@
         "h": 6,
         "w": 6,
         "x": 18,
-        "y": 49
+        "y": 10
       },
       "targets": [
         {
@@ -1436,6 +884,588 @@
           "targetBlank": true
         }
       ]
+    },
+    {
+      "id": 11,
+      "type": "piechart",
+      "title": "Namespace CPU share",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 8,
+        "x": 0,
+        "y": 16
+      },
+      "targets": [
+        {
+          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
+          "refId": "A",
+          "legendFormat": "{{namespace}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent",
+          "color": {
+            "mode": "palette-classic"
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "list",
+          "placement": "right"
+        },
+        "pieType": "pie",
+        "displayLabels": [
+          "percent"
+        ],
+        "tooltip": {
+          "mode": "single"
+        },
+        "colorScheme": "interpolateSpectral",
+        "colorBy": "value",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
+      }
+    },
+    {
+      "id": 12,
+      "type": "piechart",
+      "title": "Namespace GPU share",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 8,
+        "x": 8,
+        "y": 16
+      },
+      "targets": [
+        {
+          "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ), 1)",
+          "refId": "A",
+          "legendFormat": "{{namespace}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent",
+          "color": {
+            "mode": "palette-classic"
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "list",
+          "placement": "right"
+        },
+        "pieType": "pie",
+        "displayLabels": [
+          "percent"
+        ],
+        "tooltip": {
+          "mode": "single"
+        },
+        "colorScheme": "interpolateSpectral",
+        "colorBy": "value",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
+      }
+    },
+    {
+      "id": 13,
+      "type": "piechart",
+      "title": "Namespace RAM share",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 8,
+        "x": 16,
+        "y": 16
+      },
+      "targets": [
+        {
+          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
+          "refId": "A",
+          "legendFormat": "{{namespace}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent",
+          "color": {
+            "mode": "palette-classic"
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "list",
+          "placement": "right"
+        },
+        "pieType": "pie",
+        "displayLabels": [
+          "percent"
+        ],
+        "tooltip": {
+          "mode": "single"
+        },
+        "colorScheme": "interpolateSpectral",
+        "colorBy": "value",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
+      }
+    },
+    {
+      "id": 14,
+      "type": "timeseries",
+      "title": "Worker node CPU",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 25
+      },
+      "targets": [
+        {
+          "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
+          "refId": "A",
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "right",
+          "calcs": [
+            "last"
+          ]
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      },
+      "links": [
+        {
+          "title": "Open atlas-nodes dashboard",
+          "url": "/d/atlas-nodes",
+          "targetBlank": true
+        }
+      ]
+    },
+    {
+      "id": 15,
+      "type": "timeseries",
+      "title": "Worker node RAM",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 25
+      },
+      "targets": [
+        {
+          "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
+          "refId": "A",
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "right",
+          "calcs": [
+            "last"
+          ]
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      },
+      "links": [
+        {
+          "title": "Open atlas-nodes dashboard",
+          "url": "/d/atlas-nodes",
+          "targetBlank": true
+        }
+      ]
+    },
+    {
+      "id": 16,
+      "type": "timeseries",
+      "title": "Control plane CPU",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 7,
+        "w": 12,
+        "x": 0,
+        "y": 33
+      },
+      "targets": [
+        {
+          "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
+          "refId": "A",
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "right"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      }
+    },
+    {
+      "id": 17,
+      "type": "timeseries",
+      "title": "Control plane RAM",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 7,
+        "w": 12,
+        "x": 12,
+        "y": 33
+      },
+      "targets": [
+        {
+          "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
+          "refId": "A",
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "right"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      }
+    },
+    {
+      "id": 18,
+      "type": "timeseries",
+      "title": "Cluster ingress throughput",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 7,
+        "w": 8,
+        "x": 0,
+        "y": 40
+      },
+      "targets": [
+        {
+          "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
+          "refId": "A",
+          "legendFormat": "Ingress (Traefik)"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "Bps"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "list",
+          "placement": "bottom"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      },
+      "links": [
+        {
+          "title": "Open atlas-network dashboard",
+          "url": "/d/atlas-network",
+          "targetBlank": true
+        }
+      ]
+    },
+    {
+      "id": 19,
+      "type": "timeseries",
+      "title": "Cluster egress throughput",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 7,
+        "w": 8,
+        "x": 8,
+        "y": 40
+      },
+      "targets": [
+        {
+          "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
+          "refId": "A",
+          "legendFormat": "Egress (Traefik)"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "Bps"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "list",
+          "placement": "bottom"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      },
+      "links": [
+        {
+          "title": "Open atlas-network dashboard",
+          "url": "/d/atlas-network",
+          "targetBlank": true
+        }
+      ]
+    },
+    {
+      "id": 20,
+      "type": "timeseries",
+      "title": "Intra-cluster throughput",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 7,
+        "w": 8,
+        "x": 16,
+        "y": 40
+      },
+      "targets": [
+        {
+          "expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)",
+          "refId": "A",
+          "legendFormat": "Internal traffic"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "Bps"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "list",
+          "placement": "bottom"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      },
+      "links": [
+        {
+          "title": "Open atlas-network dashboard",
+          "url": "/d/atlas-network",
+          "targetBlank": true
+        }
+      ]
+    },
+    {
+      "id": 21,
+      "type": "timeseries",
+      "title": "Root filesystem usage",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 47
+      },
+      "targets": [
+        {
+          "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
+          "refId": "A",
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "right",
+          "calcs": [
+            "last"
+          ]
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      },
+      "timeFrom": "30d",
+      "links": [
+        {
+          "title": "Open atlas-storage dashboard",
+          "url": "/d/atlas-storage",
+          "targetBlank": true
+        }
+      ]
+    },
+    {
+      "id": 22,
+      "type": "bargauge",
+      "title": "Nodes closest to full root disks",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 47
+      },
+      "targets": [
+        {
+          "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+          "refId": "A",
+          "legendFormat": "{{node}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent",
+          "min": 0,
+          "max": 100,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 50
+              },
+              {
+                "color": "orange",
+                "value": 70
+              },
+              {
+                "color": "red",
+                "value": 85
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "displayMode": "gradient",
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
+      },
+      "links": [
+        {
+          "title": "Open atlas-storage dashboard",
+          "url": "/d/atlas-storage",
+          "targetBlank": true
+        }
+      ],
+      "transformations": [
+        {
+          "id": "labelsToFields",
+          "options": {}
+        }
+      ]
     }
   ],
   "schemaVersion": 39,
diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml
new file mode 100644
index 0000000..efd32c5
--- /dev/null
+++ b/services/monitoring/dcgm-exporter.yaml
@@ -0,0 +1,74 @@
+# services/monitoring/dcgm-exporter.yaml
+apiVersion: apps/v1
+kind: DaemonSet  # one dcgm-exporter pod per node allowed by the affinity below
+metadata:
+  name: dcgm-exporter
+  namespace: monitoring
+  labels:
+    app: dcgm-exporter
+spec:
+  selector:
+    matchLabels:
+      app: dcgm-exporter  # must equal the pod template label below
+  template:
+    metadata:
+      labels:
+        app: dcgm-exporter
+      annotations:
+        prometheus.io/scrape: "true"  # presumably consumed by annotation-based scrape discovery; confirm
+        prometheus.io/port: "9400"  # same number as the container's metrics port
+    spec:
+      serviceAccountName: default
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:  # hard rule, evaluated at scheduling time only
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: kubernetes.io/hostname
+                    operator: In
+                    values:  # presumably the GPU hosts; TODO confirm the list is current
+                      - titan-20
+                      - titan-21
+                      - titan-22
+                      - titan-24
+      tolerations:
+        - operator: Exists  # no key or effect given, so every taint is tolerated
+      containers:
+        - name: dcgm-exporter
+          image: nvcr.io/nvidia/k8s/dcgm-exporter:3.3.5-1
+          imagePullPolicy: IfNotPresent
+          ports:
+            - name: metrics  # port name referenced by the Service's targetPort
+              containerPort: 9400
+          env:
+            - name: DCGM_EXPORTER_KUBERNETES
+              value: "true"  # presumably adds pod/namespace labels to GPU metrics; confirm
+          securityContext:
+            privileged: true  # NOTE(review): full host privileges; check if a narrower capability set suffices
+          resources:
+            requests:  # requests only; no limits are set
+              cpu: 50m
+              memory: 64Mi
+          volumeMounts:
+            - name: pod-resources
+              mountPath: /var/lib/kubelet/pod-resources  # same path as on the host
+      volumes:
+        - name: pod-resources
+          hostPath:
+            path: /var/lib/kubelet/pod-resources
+            type: Directory  # path must already exist on the node as a directory
+---
+apiVersion: v1
+kind: Service  # gives the dcgm-exporter pods a stable in-cluster metrics endpoint
+metadata:
+  name: dcgm-exporter
+  namespace: monitoring
+  labels:
+    app: dcgm-exporter
+spec:  # no type set, so the Kubernetes default (ClusterIP) applies
+  selector:
+    app: dcgm-exporter  # matches the DaemonSet pod template label
+  ports:
+    - name: metrics
+      port: 9400
+      targetPort: metrics  # resolves to the pods' named container port (9400)
diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml
index a552793..ade7457 100644
--- a/services/monitoring/grafana-dashboard-network.yaml
+++ b/services/monitoring/grafana-dashboard-network.yaml
@@ -29,7 +29,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
+              "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
               "refId": "A"
             }
           ],
@@ -89,7 +89,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
+              "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
               "refId": "A"
             }
           ],
@@ -136,7 +136,7 @@ data:
         {
           "id": 3,
           "type": "stat",
-          "title": "Top router req/s",
+          "title": "Intra-cluster traffic",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -147,6 +147,66 @@ data:
             "x": 16,
             "y": 0
           },
+          "targets": [
+            {
+              "expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "Bps",
+              "custom": {
+                "displayMode": "auto"
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 4,
+          "type": "stat",
+          "title": "Top router req/s",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 4,
+            "w": 8,
+            "x": 0,
+            "y": 4
+          },
           "targets": [
             {
               "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
@@ -196,7 +256,7 @@ data:
           }
         },
         {
-          "id": 4,
+          "id": 5,
           "type": "timeseries",
           "title": "Per-node throughput",
           "datasource": {
@@ -207,7 +267,7 @@ data:
             "h": 8,
             "w": 24,
             "x": 0,
-            "y": 4
+            "y": 8
           },
           "targets": [
             {
@@ -233,7 +293,7 @@ data:
           }
         },
         {
-          "id": 5,
+          "id": 6,
           "type": "table",
           "title": "Top namespaces",
           "datasource": {
@@ -244,7 +304,7 @@ data:
             "h": 9,
             "w": 12,
             "x": 0,
-            "y": 12
+            "y": 16
           },
           "targets": [
             {
@@ -269,7 +329,7 @@ data:
           ]
         },
         {
-          "id": 6,
+          "id": 7,
           "type": "table",
           "title": "Top pods",
           "datasource": {
@@ -280,7 +340,7 @@ data:
             "h": 9,
             "w": 12,
             "x": 12,
-            "y": 12
+            "y": 16
           },
           "targets": [
             {
@@ -305,7 +365,7 @@ data:
           ]
         },
         {
-          "id": 7,
+          "id": 8,
           "type": "timeseries",
           "title": "Traefik routers (req/s)",
           "datasource": {
@@ -316,7 +376,7 @@ data:
             "h": 9,
             "w": 12,
             "x": 0,
-            "y": 21
+            "y": 25
           },
           "targets": [
             {
@@ -342,7 +402,7 @@ data:
           }
         },
         {
-          "id": 8,
+          "id": 9,
           "type": "timeseries",
           "title": "Traefik entrypoints (req/s)",
           "datasource": {
@@ -353,7 +413,7 @@ data:
             "h": 9,
             "w": 12,
             "x": 12,
-            "y": 21
+            "y": 25
           },
           "targets": [
             {
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 6503da9..d20a5a4 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -334,59 +334,6 @@ data:
             }
           ]
         },
-        {
-          "id": 6,
-          "type": "gauge",
-          "title": "Running pods",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 5,
-            "w": 4,
-            "x": 20,
-            "y": 0
-          },
-          "targets": [
-            {
-              "expr": "sum(kube_pod_status_phase{phase=\"Running\"})",
-              "refId": "A"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "min": 0,
-              "max": 5,
-              "thresholds": {
-                "mode": "absolute",
-                "steps": [
-                  {
-                    "color": "green",
-                    "value": null
-                  },
-                  {
-                    "color": "red",
-                    "value": 5
-                  }
-                ]
-              }
-            },
-            "overrides": []
-          },
-          "options": {
-            "reduceOptions": {
-              "calcs": [
-                "lastNotNull"
-              ],
-              "fields": "",
-              "values": false
-            },
-            "orientation": "auto",
-            "showThresholdMarkers": false,
-            "showThresholdLabels": false
-          }
-        },
         {
           "id": 7,
           "type": "stat",
@@ -672,506 +619,7 @@ data:
           ]
         },
         {
-          "id": 11,
-          "type": "piechart",
-          "title": "Namespace CPU share",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 9,
-            "w": 8,
-            "x": 0,
-            "y": 10
-          },
-          "targets": [
-            {
-              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
-              "refId": "A",
-              "legendFormat": "{{namespace}}"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "percent"
-            },
-            "overrides": []
-          },
-          "options": {
-            "legend": {
-              "displayMode": "list",
-              "placement": "right"
-            },
-            "pieType": "pie",
-            "reduceOptions": {
-              "calcs": [
-                "lastNotNull"
-              ],
-              "fields": "",
-              "values": false
-            }
-          }
-        },
-        {
-          "id": 12,
-          "type": "piechart",
-          "title": "Namespace GPU share",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 9,
-            "w": 8,
-            "x": 8,
-            "y": 10
-          },
-          "targets": [
-            {
-              "expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)",
-              "refId": "A",
-              "legendFormat": "{{namespace}}"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "percent"
-            },
-            "overrides": []
-          },
-          "options": {
-            "legend": {
-              "displayMode": "list",
-              "placement": "right"
-            },
-            "pieType": "pie",
-            "reduceOptions": {
-              "calcs": [
-                "lastNotNull"
-              ],
-              "fields": "",
-              "values": false
-            }
-          }
-        },
-        {
-          "id": 13,
-          "type": "piechart",
-          "title": "Namespace RAM share",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 9,
-            "w": 8,
-            "x": 16,
-            "y": 10
-          },
-          "targets": [
-            {
-              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
-              "refId": "A",
-              "legendFormat": "{{namespace}}"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "percent"
-            },
-            "overrides": []
-          },
-          "options": {
-            "legend": {
-              "displayMode": "list",
-              "placement": "right"
-            },
-            "pieType": "pie",
-            "reduceOptions": {
-              "calcs": [
-                "lastNotNull"
-              ],
-              "fields": "",
-              "values": false
-            }
-          }
-        },
-        {
-          "id": 14,
-          "type": "timeseries",
-          "title": "Worker node CPU",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 8,
-            "w": 12,
-            "x": 0,
-            "y": 19
-          },
-          "targets": [
-            {
-              "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
-              "refId": "A",
-              "legendFormat": "{{node}}"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "percent"
-            },
-            "overrides": []
-          },
-          "options": {
-            "legend": {
-              "displayMode": "table",
-              "placement": "right",
-              "calcs": [
-                "last"
-              ]
-            },
-            "tooltip": {
-              "mode": "multi"
-            }
-          },
-          "links": [
-            {
-              "title": "Open atlas-nodes dashboard",
-              "url": "/d/atlas-nodes",
-              "targetBlank": true
-            }
-          ]
-        },
-        {
-          "id": 15,
-          "type": "timeseries",
-          "title": "Worker node RAM",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 8,
-            "w": 12,
-            "x": 12,
-            "y": 19
-          },
-          "targets": [
-            {
-              "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
-              "refId": "A",
-              "legendFormat": "{{node}}"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "percent"
-            },
-            "overrides": []
-          },
-          "options": {
-            "legend": {
-              "displayMode": "table",
-              "placement": "right",
-              "calcs": [
-                "last"
-              ]
-            },
-            "tooltip": {
-              "mode": "multi"
-            }
-          },
-          "links": [
-            {
-              "title": "Open atlas-nodes dashboard",
-              "url": "/d/atlas-nodes",
-              "targetBlank": true
-            }
-          ]
-        },
-        {
-          "id": 16,
-          "type": "timeseries",
-          "title": "Control plane CPU",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 7,
-            "w": 12,
-            "x": 0,
-            "y": 27
-          },
-          "targets": [
-            {
-              "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
-              "refId": "A",
-              "legendFormat": "{{node}}"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "percent"
-            },
-            "overrides": []
-          },
-          "options": {
-            "legend": {
-              "displayMode": "table",
-              "placement": "right"
-            },
-            "tooltip": {
-              "mode": "multi"
-            }
-          }
-        },
-        {
-          "id": 17,
-          "type": "timeseries",
-          "title": "Control plane RAM",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 7,
-            "w": 12,
-            "x": 12,
-            "y": 27
-          },
-          "targets": [
-            {
-              "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
-              "refId": "A",
-              "legendFormat": "{{node}}"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "percent"
-            },
-            "overrides": []
-          },
-          "options": {
-            "legend": {
-              "displayMode": "table",
-              "placement": "right"
-            },
-            "tooltip": {
-              "mode": "multi"
-            }
-          }
-        },
-        {
-          "id": 18,
-          "type": "timeseries",
-          "title": "Cluster ingress throughput",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 7,
-            "w": 12,
-            "x": 0,
-            "y": 34
-          },
-          "targets": [
-            {
-              "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
-              "refId": "A",
-              "legendFormat": "Ingress"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "Bps"
-            },
-            "overrides": []
-          },
-          "options": {
-            "legend": {
-              "displayMode": "list",
-              "placement": "bottom"
-            },
-            "tooltip": {
-              "mode": "multi"
-            }
-          },
-          "links": [
-            {
-              "title": "Open atlas-network dashboard",
-              "url": "/d/atlas-network",
-              "targetBlank": true
-            }
-          ]
-        },
-        {
-          "id": 19,
-          "type": "timeseries",
-          "title": "Cluster egress throughput",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 7,
-            "w": 12,
-            "x": 12,
-            "y": 34
-          },
-          "targets": [
-            {
-              "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
-              "refId": "A",
-              "legendFormat": "Egress"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "Bps"
-            },
-            "overrides": []
-          },
-          "options": {
-            "legend": {
-              "displayMode": "list",
-              "placement": "bottom"
-            },
-            "tooltip": {
-              "mode": "multi"
-            }
-          },
-          "links": [
-            {
-              "title": "Open atlas-network dashboard",
-              "url": "/d/atlas-network",
-              "targetBlank": true
-            }
-          ]
-        },
-        {
-          "id": 20,
-          "type": "timeseries",
-          "title": "Root filesystem usage",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 8,
-            "w": 12,
-            "x": 0,
-            "y": 41
-          },
-          "targets": [
-            {
-              "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
-              "refId": "A",
-              "legendFormat": "{{node}}"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "percent"
-            },
-            "overrides": []
-          },
-          "options": {
-            "legend": {
-              "displayMode": "table",
-              "placement": "right",
-              "calcs": [
-                "last"
-              ]
-            },
-            "tooltip": {
-              "mode": "multi"
-            }
-          },
-          "timeFrom": "30d",
-          "links": [
-            {
-              "title": "Open atlas-storage dashboard",
-              "url": "/d/atlas-storage",
-              "targetBlank": true
-            }
-          ]
-        },
-        {
-          "id": 21,
-          "type": "bargauge",
-          "title": "Nodes closest to full root disks",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 8,
-            "w": 12,
-            "x": 12,
-            "y": 41
-          },
-          "targets": [
-            {
-              "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
-              "refId": "A",
-              "legendFormat": "{{node}}"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "percent",
-              "min": 0,
-              "max": 100,
-              "thresholds": {
-                "mode": "absolute",
-                "steps": [
-                  {
-                    "color": "green",
-                    "value": null
-                  },
-                  {
-                    "color": "yellow",
-                    "value": 50
-                  },
-                  {
-                    "color": "orange",
-                    "value": 70
-                  },
-                  {
-                    "color": "red",
-                    "value": 85
-                  }
-                ]
-              }
-            },
-            "overrides": []
-          },
-          "options": {
-            "displayMode": "gradient",
-            "orientation": "horizontal",
-            "reduceOptions": {
-              "calcs": [
-                "lastNotNull"
-              ],
-              "fields": "",
-              "values": false
-            }
-          },
-          "links": [
-            {
-              "title": "Open atlas-storage dashboard",
-              "url": "/d/atlas-storage",
-              "targetBlank": true
-            }
-          ]
-        },
-        {
-          "id": 21,
+          "id": 23,
           "type": "stat",
           "title": "Astreae usage",
           "datasource": {
@@ -1182,7 +630,7 @@ data:
             "h": 6,
             "w": 6,
             "x": 0,
-            "y": 49
+            "y": 10
           },
           "targets": [
             {
@@ -1242,7 +690,7 @@ data:
           ]
         },
         {
-          "id": 22,
+          "id": 24,
           "type": "stat",
           "title": "Asteria usage",
           "datasource": {
@@ -1253,7 +701,7 @@ data:
             "h": 6,
             "w": 6,
             "x": 6,
-            "y": 49
+            "y": 10
           },
           "targets": [
             {
@@ -1313,7 +761,7 @@ data:
           ]
         },
         {
-          "id": 23,
+          "id": 25,
           "type": "stat",
           "title": "Astreae free",
           "datasource": {
@@ -1324,7 +772,7 @@ data:
             "h": 6,
             "w": 6,
             "x": 12,
-            "y": 49
+            "y": 10
           },
           "targets": [
             {
@@ -1380,7 +828,7 @@ data:
           ]
         },
         {
-          "id": 24,
+          "id": 26,
           "type": "stat",
           "title": "Asteria free",
           "datasource": {
@@ -1391,7 +839,7 @@ data:
             "h": 6,
             "w": 6,
             "x": 18,
-            "y": 49
+            "y": 10
           },
           "targets": [
             {
@@ -1445,6 +893,588 @@ data:
               "targetBlank": true
             }
           ]
+        },
+        {
+          "id": 11,
+          "type": "piechart",
+          "title": "Namespace CPU share",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 8,
+            "x": 0,
+            "y": 16
+          },
+          "targets": [
+            {
+              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
+              "refId": "A",
+              "legendFormat": "{{namespace}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent",
+              "color": {
+                "mode": "palette-classic"
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "list",
+              "placement": "right"
+            },
+            "pieType": "pie",
+            "displayLabels": [
+              "percent"
+            ],
+            "tooltip": {
+              "mode": "single"
+            },
+            "colorScheme": "interpolateSpectral",
+            "colorBy": "value",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          }
+        },
+        {
+          "id": 12,
+          "type": "piechart",
+          "title": "Namespace GPU share",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 8,
+            "x": 8,
+            "y": 16
+          },
+          "targets": [
+            {
+              "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ), 1)",
+              "refId": "A",
+              "legendFormat": "{{namespace}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent",
+              "color": {
+                "mode": "palette-classic"
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "list",
+              "placement": "right"
+            },
+            "pieType": "pie",
+            "displayLabels": [
+              "percent"
+            ],
+            "tooltip": {
+              "mode": "single"
+            },
+            "colorScheme": "interpolateSpectral",
+            "colorBy": "value",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          }
+        },
+        {
+          "id": 13,
+          "type": "piechart",
+          "title": "Namespace RAM share",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 9,
+            "w": 8,
+            "x": 16,
+            "y": 16
+          },
+          "targets": [
+            {
+              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
+              "refId": "A",
+              "legendFormat": "{{namespace}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent",
+              "color": {
+                "mode": "palette-classic"
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "list",
+              "placement": "right"
+            },
+            "pieType": "pie",
+            "displayLabels": [
+              "percent"
+            ],
+            "tooltip": {
+              "mode": "single"
+            },
+            "colorScheme": "interpolateSpectral",
+            "colorBy": "value",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          }
+        },
+        {
+          "id": 14,
+          "type": "timeseries",
+          "title": "Worker node CPU",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 25
+          },
+          "targets": [
+            {
+              "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
+              "refId": "A",
+              "legendFormat": "{{node}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "right",
+              "calcs": [
+                "last"
+              ]
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          },
+          "links": [
+            {
+              "title": "Open atlas-nodes dashboard",
+              "url": "/d/atlas-nodes",
+              "targetBlank": true
+            }
+          ]
+        },
+        {
+          "id": 15,
+          "type": "timeseries",
+          "title": "Worker node RAM",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 25
+          },
+          "targets": [
+            {
+              "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
+              "refId": "A",
+              "legendFormat": "{{node}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "right",
+              "calcs": [
+                "last"
+              ]
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          },
+          "links": [
+            {
+              "title": "Open atlas-nodes dashboard",
+              "url": "/d/atlas-nodes",
+              "targetBlank": true
+            }
+          ]
+        },
+        {
+          "id": 16,
+          "type": "timeseries",
+          "title": "Control plane CPU",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 7,
+            "w": 12,
+            "x": 0,
+            "y": 33
+          },
+          "targets": [
+            {
+              "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
+              "refId": "A",
+              "legendFormat": "{{node}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "right"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 17,
+          "type": "timeseries",
+          "title": "Control plane RAM",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 7,
+            "w": 12,
+            "x": 12,
+            "y": 33
+          },
+          "targets": [
+            {
+              "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
+              "refId": "A",
+              "legendFormat": "{{node}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "right"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 18,
+          "type": "timeseries",
+          "title": "Cluster ingress throughput",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 7,
+            "w": 8,
+            "x": 0,
+            "y": 40
+          },
+          "targets": [
+            {
+              "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
+              "refId": "A",
+              "legendFormat": "Ingress (Traefik)"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "Bps"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "list",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          },
+          "links": [
+            {
+              "title": "Open atlas-network dashboard",
+              "url": "/d/atlas-network",
+              "targetBlank": true
+            }
+          ]
+        },
+        {
+          "id": 19,
+          "type": "timeseries",
+          "title": "Cluster egress throughput",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 7,
+            "w": 8,
+            "x": 8,
+            "y": 40
+          },
+          "targets": [
+            {
+              "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
+              "refId": "A",
+              "legendFormat": "Egress (Traefik)"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "Bps"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "list",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          },
+          "links": [
+            {
+              "title": "Open atlas-network dashboard",
+              "url": "/d/atlas-network",
+              "targetBlank": true
+            }
+          ]
+        },
+        {
+          "id": 20,
+          "type": "timeseries",
+          "title": "Intra-cluster throughput",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 7,
+            "w": 8,
+            "x": 16,
+            "y": 40
+          },
+          "targets": [
+            {
+              "expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)",
+              "refId": "A",
+              "legendFormat": "Internal traffic"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "Bps"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "list",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          },
+          "links": [
+            {
+              "title": "Open atlas-network dashboard",
+              "url": "/d/atlas-network",
+              "targetBlank": true
+            }
+          ]
+        },
+        {
+          "id": 21,
+          "type": "timeseries",
+          "title": "Root filesystem usage",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 47
+          },
+          "targets": [
+            {
+              "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
+              "refId": "A",
+              "legendFormat": "{{node}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "right",
+              "calcs": [
+                "last"
+              ]
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          },
+          "timeFrom": "30d",
+          "links": [
+            {
+              "title": "Open atlas-storage dashboard",
+              "url": "/d/atlas-storage",
+              "targetBlank": true
+            }
+          ]
+        },
+        {
+          "id": 22,
+          "type": "bargauge",
+          "title": "Nodes closest to full root disks",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 47
+          },
+          "targets": [
+            {
+              "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+              "refId": "A",
+              "legendFormat": "{{node}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent",
+              "min": 0,
+              "max": 100,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 50
+                  },
+                  {
+                    "color": "orange",
+                    "value": 70
+                  },
+                  {
+                    "color": "red",
+                    "value": 85
+                  }
+                ]
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "displayMode": "gradient",
+            "orientation": "horizontal",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          },
+          "links": [
+            {
+              "title": "Open atlas-storage dashboard",
+              "url": "/d/atlas-storage",
+              "targetBlank": true
+            }
+          ],
+          "transformations": [
+            {
+              "id": "labelsToFields",
+              "options": {}
+            }
+          ]
         }
       ],
       "schemaVersion": 39,
diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml
index 76263c1..3164862 100644
--- a/services/monitoring/kustomization.yaml
+++ b/services/monitoring/kustomization.yaml
@@ -10,5 +10,6 @@ resources:
   - grafana-dashboard-nodes.yaml
   - grafana-dashboard-storage.yaml
   - grafana-dashboard-network.yaml
+  - dcgm-exporter.yaml
   - grafana-folders.yaml
   - helmrelease.yaml

From 46410c9a9dff467c788112421df680b5b2c7b441 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 18 Nov 2025 14:19:23 -0300
Subject: [PATCH 47/71] monitoring: fix dcgm image

---
 services/monitoring/dcgm-exporter.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml
index efd32c5..eaa3930 100644
--- a/services/monitoring/dcgm-exporter.yaml
+++ b/services/monitoring/dcgm-exporter.yaml
@@ -35,7 +35,7 @@ spec:
         - operator: Exists
       containers:
         - name: dcgm-exporter
-          image: nvcr.io/nvidia/k8s/dcgm-exporter:3.3.5-1
+          image: docker.io/nvidia/dcgm-exporter:3.3.5-1-ubuntu22.04
           imagePullPolicy: IfNotPresent
           ports:
             - name: metrics

From 5a2575d54eff036ae3523180acdcc1acaab8269b Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 18 Nov 2025 14:33:24 -0300
Subject: [PATCH 48/71] flux: scope monitoring health checks

---
 .../platform/monitoring/kustomization.yaml      | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/clusters/atlas/flux-system/platform/monitoring/kustomization.yaml b/clusters/atlas/flux-system/platform/monitoring/kustomization.yaml
index 2899531..f684773 100644
--- a/clusters/atlas/flux-system/platform/monitoring/kustomization.yaml
+++ b/clusters/atlas/flux-system/platform/monitoring/kustomization.yaml
@@ -12,3 +12,20 @@ spec:
     kind: GitRepository
     name: flux-system
   wait: true
+  healthChecks:
+    - apiVersion: helm.toolkit.fluxcd.io/v2
+      kind: HelmRelease
+      name: grafana
+      namespace: monitoring
+    - apiVersion: helm.toolkit.fluxcd.io/v2
+      kind: HelmRelease
+      name: victoria-metrics-single
+      namespace: monitoring
+    - apiVersion: helm.toolkit.fluxcd.io/v2
+      kind: HelmRelease
+      name: node-exporter
+      namespace: monitoring
+    - apiVersion: helm.toolkit.fluxcd.io/v2
+      kind: HelmRelease
+      name: alertmanager
+      namespace: monitoring

From 909cb4ff26565da718099180ab784bd793d1a508 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 18 Nov 2025 15:04:18 -0300
Subject: [PATCH 49/71] flux: disable wait for monitoring

---
 .../platform/monitoring/kustomization.yaml    | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

diff --git a/clusters/atlas/flux-system/platform/monitoring/kustomization.yaml b/clusters/atlas/flux-system/platform/monitoring/kustomization.yaml
index f684773..82ad672 100644
--- a/clusters/atlas/flux-system/platform/monitoring/kustomization.yaml
+++ b/clusters/atlas/flux-system/platform/monitoring/kustomization.yaml
@@ -11,21 +11,4 @@ spec:
   sourceRef:
     kind: GitRepository
     name: flux-system
-  wait: true
-  healthChecks:
-    - apiVersion: helm.toolkit.fluxcd.io/v2
-      kind: HelmRelease
-      name: grafana
-      namespace: monitoring
-    - apiVersion: helm.toolkit.fluxcd.io/v2
-      kind: HelmRelease
-      name: victoria-metrics-single
-      namespace: monitoring
-    - apiVersion: helm.toolkit.fluxcd.io/v2
-      kind: HelmRelease
-      name: node-exporter
-      namespace: monitoring
-    - apiVersion: helm.toolkit.fluxcd.io/v2
-      kind: HelmRelease
-      name: alertmanager
-      namespace: monitoring
+  wait: false

From 7b2a69cfe3c2fdcb08aa123dc4dcb3a6c90f0925 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 18 Nov 2025 15:10:58 -0300
Subject: [PATCH 50/71] monitoring: disable dcgm exporter

---
 services/monitoring/kustomization.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml
index 3164862..76263c1 100644
--- a/services/monitoring/kustomization.yaml
+++ b/services/monitoring/kustomization.yaml
@@ -10,6 +10,5 @@ resources:
   - grafana-dashboard-nodes.yaml
   - grafana-dashboard-storage.yaml
   - grafana-dashboard-network.yaml
-  - dcgm-exporter.yaml
   - grafana-folders.yaml
   - helmrelease.yaml

From c7b7bc7a6db29567595878dd6d6786dd2a0a18bb Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 18 Nov 2025 15:55:24 -0300
Subject: [PATCH 51/71] monitoring: adjust overview spacing and net panels

---
 scripts/render_dashboards.py                  | 32 +++++++++++++------
 .../monitoring/dashboards/atlas-network.json  |  2 +-
 .../monitoring/dashboards/atlas-overview.json | 32 +++++++++----------
 .../monitoring/grafana-dashboard-network.yaml |  2 +-
 .../grafana-dashboard-overview.yaml           | 32 +++++++++----------
 5 files changed, 56 insertions(+), 44 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index 273090a..bf06d40 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -81,6 +81,7 @@ CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
 WORKER_SUFFIX = f"/{WORKER_TOTAL}"
 CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring"
 LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
+GAUGE_WIDTHS = [5, 5, 5, 5, 4]
 
 # ---------------------------------------------------------------------------
 # PromQL helpers
@@ -262,13 +263,18 @@ TRAEFIK_NET_EGRESS = (
     'sum(rate(container_network_transmit_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
     " or on() vector(0)"
 )
-NET_TOTAL_EXPR = (
+NET_CLUSTER_RX = (
+    'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
+    " or on() vector(0)"
+)
+NET_CLUSTER_TX = (
     'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
     " or on() vector(0)"
 )
+NET_TOTAL_EXPR = NET_CLUSTER_TX
 NET_INGRESS_EXPR = TRAEFIK_NET_INGRESS
 NET_EGRESS_EXPR = TRAEFIK_NET_EGRESS
-NET_INTERNAL_EXPR = f"clamp_min(({NET_TOTAL_EXPR}) - ({TRAEFIK_NET_EGRESS}), 0)"
+NET_INTERNAL_EXPR = f"clamp_min((({NET_CLUSTER_RX}) + ({NET_CLUSTER_TX})) - (({TRAEFIK_NET_INGRESS}) + ({TRAEFIK_NET_EGRESS})), 0)"
 
 # ---------------------------------------------------------------------------
 # Panel factories
@@ -534,6 +540,11 @@ def build_overview():
             link_to("atlas-pods"),
         ),
     ]
+    def gauge_grid(idx):
+        width = GAUGE_WIDTHS[idx] if idx < len(GAUGE_WIDTHS) else 4
+        x = sum(GAUGE_WIDTHS[:idx])
+        return width, x
+
     for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats):
         thresholds = None
         min_value = 0
@@ -577,12 +588,13 @@ def build_overview():
                     {"color": "red", "value": max_value},
                 ],
             }
+        width, x = gauge_grid(idx)
         panels.append(
             gauge_panel(
                 panel_id,
                 title,
                 expr,
-                {"h": 5, "w": 4, "x": 4 * idx, "y": 0},
+                {"h": 5, "w": width, "x": x, "y": 0},
                 min_value=min_value,
                 max_value=max_value,
                 thresholds=thresholds,
@@ -662,7 +674,7 @@ def build_overview():
             14,
             "Worker node CPU",
             node_cpu_expr(worker_filter),
-            {"h": 8, "w": 12, "x": 0, "y": 25},
+            {"h": 8, "w": 12, "x": 0, "y": 32},
             unit="percent",
             legend="{{node}}",
             legend_calcs=["last"],
@@ -676,7 +688,7 @@ def build_overview():
             15,
             "Worker node RAM",
             node_mem_expr(worker_filter),
-            {"h": 8, "w": 12, "x": 12, "y": 25},
+            {"h": 8, "w": 12, "x": 12, "y": 32},
             unit="percent",
             legend="{{node}}",
             legend_calcs=["last"],
@@ -691,7 +703,7 @@ def build_overview():
             16,
             "Control plane CPU",
             node_cpu_expr(CONTROL_REGEX),
-            {"h": 7, "w": 12, "x": 0, "y": 33},
+            {"h": 7, "w": 12, "x": 0, "y": 40},
             unit="percent",
             legend="{{node}}",
             legend_display="table",
@@ -703,7 +715,7 @@ def build_overview():
             17,
             "Control plane RAM",
             node_mem_expr(CONTROL_REGEX),
-            {"h": 7, "w": 12, "x": 12, "y": 33},
+            {"h": 7, "w": 12, "x": 12, "y": 40},
             unit="percent",
             legend="{{node}}",
             legend_display="table",
@@ -716,7 +728,7 @@ def build_overview():
             18,
             "Cluster ingress throughput",
             NET_INGRESS_EXPR,
-            {"h": 7, "w": 8, "x": 0, "y": 40},
+            {"h": 7, "w": 8, "x": 0, "y": 25},
             unit="Bps",
             legend="Ingress (Traefik)",
             legend_display="list",
@@ -729,7 +741,7 @@ def build_overview():
             19,
             "Cluster egress throughput",
             NET_EGRESS_EXPR,
-            {"h": 7, "w": 8, "x": 8, "y": 40},
+            {"h": 7, "w": 8, "x": 8, "y": 25},
             unit="Bps",
             legend="Egress (Traefik)",
             legend_display="list",
@@ -742,7 +754,7 @@ def build_overview():
             20,
             "Intra-cluster throughput",
             NET_INTERNAL_EXPR,
-            {"h": 7, "w": 8, "x": 16, "y": 40},
+            {"h": 7, "w": 8, "x": 16, "y": 25},
             unit="Bps",
             legend="Internal traffic",
             legend_display="list",
diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json
index 1baec3a..8a8b8f4 100644
--- a/services/monitoring/dashboards/atlas-network.json
+++ b/services/monitoring/dashboards/atlas-network.json
@@ -140,7 +140,7 @@
       },
       "targets": [
         {
-          "expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)",
+          "expr": "clamp_min(((sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0))) - ((sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0))), 0)",
           "refId": "A"
         }
       ],
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index eba6466..4cd4b29 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -17,7 +17,7 @@
       },
       "gridPos": {
         "h": 5,
-        "w": 4,
+        "w": 5,
         "x": 0,
         "y": 0
       },
@@ -78,8 +78,8 @@
       },
       "gridPos": {
         "h": 5,
-        "w": 4,
-        "x": 4,
+        "w": 5,
+        "x": 5,
         "y": 0
       },
       "targets": [
@@ -131,8 +131,8 @@
       },
       "gridPos": {
         "h": 5,
-        "w": 4,
-        "x": 8,
+        "w": 5,
+        "x": 10,
         "y": 0
       },
       "targets": [
@@ -199,8 +199,8 @@
       },
       "gridPos": {
         "h": 5,
-        "w": 4,
-        "x": 12,
+        "w": 5,
+        "x": 15,
         "y": 0
       },
       "targets": [
@@ -268,7 +268,7 @@
       "gridPos": {
         "h": 5,
         "w": 4,
-        "x": 16,
+        "x": 20,
         "y": 0
       },
       "targets": [
@@ -1056,7 +1056,7 @@
         "h": 8,
         "w": 12,
         "x": 0,
-        "y": 25
+        "y": 32
       },
       "targets": [
         {
@@ -1103,7 +1103,7 @@
         "h": 8,
         "w": 12,
         "x": 12,
-        "y": 25
+        "y": 32
       },
       "targets": [
         {
@@ -1150,7 +1150,7 @@
         "h": 7,
         "w": 12,
         "x": 0,
-        "y": 33
+        "y": 40
       },
       "targets": [
         {
@@ -1187,7 +1187,7 @@
         "h": 7,
         "w": 12,
         "x": 12,
-        "y": 33
+        "y": 40
       },
       "targets": [
         {
@@ -1224,7 +1224,7 @@
         "h": 7,
         "w": 8,
         "x": 0,
-        "y": 40
+        "y": 25
       },
       "targets": [
         {
@@ -1268,7 +1268,7 @@
         "h": 7,
         "w": 8,
         "x": 8,
-        "y": 40
+        "y": 25
       },
       "targets": [
         {
@@ -1312,11 +1312,11 @@
         "h": 7,
         "w": 8,
         "x": 16,
-        "y": 40
+        "y": 25
       },
       "targets": [
         {
-          "expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)",
+          "expr": "clamp_min(((sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0))) - ((sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0))), 0)",
           "refId": "A",
           "legendFormat": "Internal traffic"
         }
diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml
index ade7457..1727e6a 100644
--- a/services/monitoring/grafana-dashboard-network.yaml
+++ b/services/monitoring/grafana-dashboard-network.yaml
@@ -149,7 +149,7 @@ data:
           },
           "targets": [
             {
-              "expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)",
+              "expr": "clamp_min(((sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0))) - ((sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0))), 0)",
               "refId": "A"
             }
           ],
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index d20a5a4..99d6d46 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -26,7 +26,7 @@ data:
           },
           "gridPos": {
             "h": 5,
-            "w": 4,
+            "w": 5,
             "x": 0,
             "y": 0
           },
@@ -87,8 +87,8 @@ data:
           },
           "gridPos": {
             "h": 5,
-            "w": 4,
-            "x": 4,
+            "w": 5,
+            "x": 5,
             "y": 0
           },
           "targets": [
@@ -140,8 +140,8 @@ data:
           },
           "gridPos": {
             "h": 5,
-            "w": 4,
-            "x": 8,
+            "w": 5,
+            "x": 10,
             "y": 0
           },
           "targets": [
@@ -208,8 +208,8 @@ data:
           },
           "gridPos": {
             "h": 5,
-            "w": 4,
-            "x": 12,
+            "w": 5,
+            "x": 15,
             "y": 0
           },
           "targets": [
@@ -277,7 +277,7 @@ data:
           "gridPos": {
             "h": 5,
             "w": 4,
-            "x": 16,
+            "x": 20,
             "y": 0
           },
           "targets": [
@@ -1065,7 +1065,7 @@ data:
             "h": 8,
             "w": 12,
             "x": 0,
-            "y": 25
+            "y": 32
           },
           "targets": [
             {
@@ -1112,7 +1112,7 @@ data:
             "h": 8,
             "w": 12,
             "x": 12,
-            "y": 25
+            "y": 32
           },
           "targets": [
             {
@@ -1159,7 +1159,7 @@ data:
             "h": 7,
             "w": 12,
             "x": 0,
-            "y": 33
+            "y": 40
           },
           "targets": [
             {
@@ -1196,7 +1196,7 @@ data:
             "h": 7,
             "w": 12,
             "x": 12,
-            "y": 33
+            "y": 40
           },
           "targets": [
             {
@@ -1233,7 +1233,7 @@ data:
             "h": 7,
             "w": 8,
             "x": 0,
-            "y": 40
+            "y": 25
           },
           "targets": [
             {
@@ -1277,7 +1277,7 @@ data:
             "h": 7,
             "w": 8,
             "x": 8,
-            "y": 40
+            "y": 25
           },
           "targets": [
             {
@@ -1321,11 +1321,11 @@ data:
             "h": 7,
             "w": 8,
             "x": 16,
-            "y": 40
+            "y": 25
           },
           "targets": [
             {
-              "expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)",
+              "expr": "clamp_min(((sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0))) - ((sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0))), 0)",
               "refId": "A",
               "legendFormat": "Internal traffic"
             }

From f06be37f44e53c3f80870fca822b421f61e86901 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 18 Nov 2025 16:18:52 -0300
Subject: [PATCH 52/71] monitoring: refine network metrics and control-plane
 allowance

---
 scripts/render_dashboards.py                  | 21 ++++++++++++++-----
 .../monitoring/dashboards/atlas-network.json  |  6 +++---
 .../monitoring/dashboards/atlas-nodes.json    |  2 +-
 .../monitoring/dashboards/atlas-overview.json |  8 +++----
 .../monitoring/dashboards/atlas-pods.json     |  2 +-
 .../monitoring/grafana-dashboard-network.yaml |  6 +++---
 .../monitoring/grafana-dashboard-nodes.yaml   |  2 +-
 .../grafana-dashboard-overview.yaml           |  8 +++----
 .../monitoring/grafana-dashboard-pods.yaml    |  2 +-
 9 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index bf06d40..33b388d 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -79,7 +79,7 @@ CONTROL_TOTAL = len(CONTROL_PLANE_NODES)
 WORKER_TOTAL = len(WORKER_NODES)
 CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
 WORKER_SUFFIX = f"/{WORKER_TOTAL}"
-CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring"
+CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system"
 LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
 GAUGE_WIDTHS = [5, 5, 5, 5, 4]
 
@@ -271,10 +271,21 @@ NET_CLUSTER_TX = (
     'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
     " or on() vector(0)"
 )
-NET_TOTAL_EXPR = NET_CLUSTER_TX
-NET_INGRESS_EXPR = TRAEFIK_NET_INGRESS
-NET_EGRESS_EXPR = TRAEFIK_NET_EGRESS
-NET_INTERNAL_EXPR = f"clamp_min((({NET_CLUSTER_RX}) + ({NET_CLUSTER_TX})) - (({TRAEFIK_NET_INGRESS}) + ({TRAEFIK_NET_EGRESS})), 0)"
+PHYSICAL_NET_FILTER = 'device!~"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*"'
+NET_NODE_RX_PHYS = (
+    f'sum(rate(node_network_receive_bytes_total{{{PHYSICAL_NET_FILTER}}}[5m])) or on() vector(0)'
+)
+NET_NODE_TX_PHYS = (
+    f'sum(rate(node_network_transmit_bytes_total{{{PHYSICAL_NET_FILTER}}}[5m])) or on() vector(0)'
+)
+NET_TOTAL_EXPR = NET_NODE_TX_PHYS
+NET_INGRESS_EXPR = NET_NODE_RX_PHYS
+NET_EGRESS_EXPR = NET_NODE_TX_PHYS
+NET_INTERNAL_EXPR = (
+    'sum(rate(container_network_receive_bytes_total{namespace!="traefik",pod!="",container!=""}[5m]) '
+    '+ rate(container_network_transmit_bytes_total{namespace!="traefik",pod!="",container!=""}[5m]))'
+    ' or on() vector(0)'
+)
 
 # ---------------------------------------------------------------------------
 # Panel factories
diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json
index 8a8b8f4..ca671c8 100644
--- a/services/monitoring/dashboards/atlas-network.json
+++ b/services/monitoring/dashboards/atlas-network.json
@@ -20,7 +20,7 @@
       },
       "targets": [
         {
-          "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
+          "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
           "refId": "A"
         }
       ],
@@ -80,7 +80,7 @@
       },
       "targets": [
         {
-          "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
+          "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
           "refId": "A"
         }
       ],
@@ -140,7 +140,7 @@
       },
       "targets": [
         {
-          "expr": "clamp_min(((sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0))) - ((sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0))), 0)",
+          "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
           "refId": "A"
         }
       ],
diff --git a/services/monitoring/dashboards/atlas-nodes.json b/services/monitoring/dashboards/atlas-nodes.json
index e974d8a..3cf784f 100644
--- a/services/monitoring/dashboards/atlas-nodes.json
+++ b/services/monitoring/dashboards/atlas-nodes.json
@@ -142,7 +142,7 @@
       },
       "targets": [
         {
-          "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})",
+          "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})",
           "refId": "A"
         }
       ],
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 4cd4b29..156d96f 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -137,7 +137,7 @@
       },
       "targets": [
         {
-          "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})",
+          "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})",
           "refId": "A"
         }
       ],
@@ -1228,7 +1228,7 @@
       },
       "targets": [
         {
-          "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
+          "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
           "refId": "A",
           "legendFormat": "Ingress (Traefik)"
         }
@@ -1272,7 +1272,7 @@
       },
       "targets": [
         {
-          "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
+          "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
           "refId": "A",
           "legendFormat": "Egress (Traefik)"
         }
@@ -1316,7 +1316,7 @@
       },
       "targets": [
         {
-          "expr": "clamp_min(((sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0))) - ((sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0))), 0)",
+          "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
           "refId": "A",
           "legendFormat": "Internal traffic"
         }
diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json
index 8494e89..f519d14 100644
--- a/services/monitoring/dashboards/atlas-pods.json
+++ b/services/monitoring/dashboards/atlas-pods.json
@@ -200,7 +200,7 @@
       },
       "targets": [
         {
-          "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})",
+          "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})",
           "refId": "A"
         }
       ],
diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml
index 1727e6a..fa5b742 100644
--- a/services/monitoring/grafana-dashboard-network.yaml
+++ b/services/monitoring/grafana-dashboard-network.yaml
@@ -29,7 +29,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
+              "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
               "refId": "A"
             }
           ],
@@ -89,7 +89,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
+              "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
               "refId": "A"
             }
           ],
@@ -149,7 +149,7 @@ data:
           },
           "targets": [
             {
-              "expr": "clamp_min(((sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0))) - ((sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0))), 0)",
+              "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
               "refId": "A"
             }
           ],
diff --git a/services/monitoring/grafana-dashboard-nodes.yaml b/services/monitoring/grafana-dashboard-nodes.yaml
index afbeb3c..c78e994 100644
--- a/services/monitoring/grafana-dashboard-nodes.yaml
+++ b/services/monitoring/grafana-dashboard-nodes.yaml
@@ -151,7 +151,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})",
+              "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})",
               "refId": "A"
             }
           ],
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 99d6d46..957bb6a 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -146,7 +146,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})",
+              "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})",
               "refId": "A"
             }
           ],
@@ -1237,7 +1237,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
+              "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
               "refId": "A",
               "legendFormat": "Ingress (Traefik)"
             }
@@ -1281,7 +1281,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)",
+              "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)",
               "refId": "A",
               "legendFormat": "Egress (Traefik)"
             }
@@ -1325,7 +1325,7 @@ data:
           },
           "targets": [
             {
-              "expr": "clamp_min(((sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0))) - ((sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0))), 0)",
+              "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
               "refId": "A",
               "legendFormat": "Internal traffic"
             }
diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml
index e160eca..78beca5 100644
--- a/services/monitoring/grafana-dashboard-pods.yaml
+++ b/services/monitoring/grafana-dashboard-pods.yaml
@@ -209,7 +209,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})",
+              "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})",
               "refId": "A"
             }
           ],

From e4f93e85d25a57b8cb9521e9dcd80b34ca869e0b Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 18 Nov 2025 17:09:13 -0300
Subject: [PATCH 53/71] monitoring: control-plane stat and namespace share
 tweaks

---
 scripts/render_dashboards.py                  | 48 ++++++++++++-------
 .../monitoring/dashboards/atlas-network.json  |  2 +-
 .../monitoring/dashboards/atlas-overview.json | 29 ++++++-----
 .../monitoring/grafana-dashboard-network.yaml |  2 +-
 .../grafana-dashboard-overview.yaml           | 29 ++++++-----
 5 files changed, 70 insertions(+), 40 deletions(-)

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index 33b388d..812a931 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -82,6 +82,9 @@ WORKER_SUFFIX = f"/{WORKER_TOTAL}"
 CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system"
 LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
 GAUGE_WIDTHS = [5, 5, 5, 5, 4]
+CONTROL_WORKLOADS_EXPR = (
+    f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}}) or on() vector(0)'
+)
 
 # ---------------------------------------------------------------------------
 # PromQL helpers
@@ -168,7 +171,7 @@ def node_io_expr(scope=""):
 
 def namespace_share_expr(resource_expr):
     selected = f"( {resource_expr} ) and on(namespace) ( {NAMESPACE_TOP_FILTER} )"
-    total = f"clamp_min(sum( {resource_expr} ), 1)"
+    total = f"clamp_min(sum( {selected} ), 1)"
     return f"100 * ( {selected} ) / {total}"
 
 
@@ -282,8 +285,8 @@ NET_TOTAL_EXPR = NET_NODE_TX_PHYS
 NET_INGRESS_EXPR = NET_NODE_RX_PHYS
 NET_EGRESS_EXPR = NET_NODE_TX_PHYS
 NET_INTERNAL_EXPR = (
-    'sum(rate(container_network_receive_bytes_total{namespace!="traefik",pod!="",container!=""}[5m]) '
-    '+ rate(container_network_transmit_bytes_total{namespace!="traefik",pod!="",container!=""}[5m]))'
+    'sum(rate(container_network_receive_bytes_total{namespace!="traefik",pod!=""}[5m]) '
+    '+ rate(container_network_transmit_bytes_total{namespace!="traefik",pod!=""}[5m]))'
     ' or on() vector(0)'
 )
 
@@ -529,9 +532,9 @@ def build_overview():
         (
             3,
             "Control plane workloads",
-            f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
+            CONTROL_WORKLOADS_EXPR,
             None,
-            1,
+            4,
             link_to("atlas-pods"),
         ),
         (
@@ -600,18 +603,31 @@ def build_overview():
                 ],
             }
         width, x = gauge_grid(idx)
-        panels.append(
-            gauge_panel(
-                panel_id,
-                title,
-                expr,
-                {"h": 5, "w": width, "x": x, "y": 0},
-                min_value=min_value,
-                max_value=max_value,
-                thresholds=thresholds,
-                links=links,
+        if panel_id == 3:
+            panels.append(
+                stat_panel(
+                    panel_id,
+                    title,
+                    expr,
+                    {"h": 5, "w": width, "x": x, "y": 0},
+                    thresholds=thresholds,
+                    legend=None,
+                    links=links,
+                )
+            )
+        else:
+            panels.append(
+                gauge_panel(
+                    panel_id,
+                    title,
+                    expr,
+                    {"h": 5, "w": width, "x": x, "y": 0},
+                    min_value=min_value,
+                    max_value=max_value,
+                    thresholds=thresholds,
+                    links=links,
+                )
             )
-        )
 
     hottest = [
         (7, "Hottest node: CPU", topk_with_node(node_cpu_expr()), "percent"),
diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json
index ca671c8..9005eb9 100644
--- a/services/monitoring/dashboards/atlas-network.json
+++ b/services/monitoring/dashboards/atlas-network.json
@@ -140,7 +140,7 @@
       },
       "targets": [
         {
-          "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
+          "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)",
           "refId": "A"
         }
       ],
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 156d96f..93a246b 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -123,7 +123,7 @@
     },
     {
       "id": 3,
-      "type": "gauge",
+      "type": "stat",
       "title": "Control plane workloads",
       "datasource": {
         "type": "prometheus",
@@ -137,14 +137,16 @@
       },
       "targets": [
         {
-          "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})",
+          "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"}) or on() vector(0)",
           "refId": "A"
         }
       ],
       "fieldConfig": {
         "defaults": {
-          "min": 0,
-          "max": 4,
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
           "thresholds": {
             "mode": "absolute",
             "steps": [
@@ -165,11 +167,18 @@
                 "value": 3
               }
             ]
+          },
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto"
           }
         },
         "overrides": []
       },
       "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
         "reduceOptions": {
           "calcs": [
             "lastNotNull"
@@ -177,9 +186,7 @@
           "fields": "",
           "values": false
         },
-        "orientation": "auto",
-        "showThresholdMarkers": false,
-        "showThresholdLabels": false
+        "textMode": "value"
       },
       "links": [
         {
@@ -901,7 +908,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
+          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -954,7 +961,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ), 1)",
+          "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -1007,7 +1014,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
+          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -1316,7 +1323,7 @@
       },
       "targets": [
         {
-          "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
+          "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)",
           "refId": "A",
           "legendFormat": "Internal traffic"
         }
diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml
index fa5b742..d2372de 100644
--- a/services/monitoring/grafana-dashboard-network.yaml
+++ b/services/monitoring/grafana-dashboard-network.yaml
@@ -149,7 +149,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
+              "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)",
               "refId": "A"
             }
           ],
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 957bb6a..ebd9b2b 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -132,7 +132,7 @@ data:
         },
         {
           "id": 3,
-          "type": "gauge",
+          "type": "stat",
           "title": "Control plane workloads",
           "datasource": {
             "type": "prometheus",
@@ -146,14 +146,16 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})",
+              "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"}) or on() vector(0)",
               "refId": "A"
             }
           ],
           "fieldConfig": {
             "defaults": {
-              "min": 0,
-              "max": 4,
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
               "thresholds": {
                 "mode": "absolute",
                 "steps": [
@@ -174,11 +176,18 @@ data:
                     "value": 3
                   }
                 ]
+              },
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto"
               }
             },
             "overrides": []
           },
           "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
             "reduceOptions": {
               "calcs": [
                 "lastNotNull"
@@ -186,9 +195,7 @@ data:
               "fields": "",
               "values": false
             },
-            "orientation": "auto",
-            "showThresholdMarkers": false,
-            "showThresholdLabels": false
+            "textMode": "value"
           },
           "links": [
             {
@@ -910,7 +917,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
+              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -963,7 +970,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ), 1)",
+              "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -1016,7 +1023,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
+              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -1325,7 +1332,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
+              "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)",
               "refId": "A",
               "legendFormat": "Internal traffic"
             }

From 630f1f2a810c02f847474a2ff62a5c5079981a8f Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 18 Nov 2025 19:43:19 -0300
Subject: [PATCH 54/71] traefik: extend upload timeouts

---
 infrastructure/traefik/deployment.yaml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/infrastructure/traefik/deployment.yaml b/infrastructure/traefik/deployment.yaml
index ba16909..196954c 100644
--- a/infrastructure/traefik/deployment.yaml
+++ b/infrastructure/traefik/deployment.yaml
@@ -39,6 +39,14 @@ items:
           - --metrics.prometheus.addEntryPointsLabels=true
           - --metrics.prometheus.addRoutersLabels=true
           - --metrics.prometheus.addServicesLabels=true
+          - --entrypoints.web.forwardingTimeouts.dialTimeout=120s
+          - --entrypoints.web.forwardingTimeouts.responseHeaderTimeout=10m
+          - --entrypoints.web.transport.respondingTimeouts.readTimeout=0
+          - --entrypoints.web.transport.respondingTimeouts.idleTimeout=0
+          - --entrypoints.websecure.forwardingTimeouts.dialTimeout=120s
+          - --entrypoints.websecure.forwardingTimeouts.responseHeaderTimeout=10m
+          - --entrypoints.websecure.transport.respondingTimeouts.readTimeout=0
+          - --entrypoints.websecure.transport.respondingTimeouts.idleTimeout=0
           - --entrypoints.metrics.address=:9100
           - --metrics.prometheus.entryPoint=metrics
           image: traefik:v3.3.3

From 75f6a593160ec50258ff92dd0f8532b5a3137608 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 18 Nov 2025 20:01:16 -0300
Subject: [PATCH 55/71] traefik: use responding timeouts only

---
 infrastructure/traefik/deployment.yaml | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/infrastructure/traefik/deployment.yaml b/infrastructure/traefik/deployment.yaml
index 196954c..a34307a 100644
--- a/infrastructure/traefik/deployment.yaml
+++ b/infrastructure/traefik/deployment.yaml
@@ -39,14 +39,12 @@ items:
           - --metrics.prometheus.addEntryPointsLabels=true
           - --metrics.prometheus.addRoutersLabels=true
           - --metrics.prometheus.addServicesLabels=true
-          - --entrypoints.web.forwardingTimeouts.dialTimeout=120s
-          - --entrypoints.web.forwardingTimeouts.responseHeaderTimeout=10m
-          - --entrypoints.web.transport.respondingTimeouts.readTimeout=0
-          - --entrypoints.web.transport.respondingTimeouts.idleTimeout=0
-          - --entrypoints.websecure.forwardingTimeouts.dialTimeout=120s
-          - --entrypoints.websecure.forwardingTimeouts.responseHeaderTimeout=10m
-          - --entrypoints.websecure.transport.respondingTimeouts.readTimeout=0
-          - --entrypoints.websecure.transport.respondingTimeouts.idleTimeout=0
+          - --entrypoints.web.transport.respondingTimeouts.readTimeout=0s
+          - --entrypoints.web.transport.respondingTimeouts.writeTimeout=0s
+          - --entrypoints.web.transport.respondingTimeouts.idleTimeout=0s
+          - --entrypoints.websecure.transport.respondingTimeouts.readTimeout=0s
+          - --entrypoints.websecure.transport.respondingTimeouts.writeTimeout=0s
+          - --entrypoints.websecure.transport.respondingTimeouts.idleTimeout=0s
           - --entrypoints.metrics.address=:9100
           - --metrics.prometheus.entryPoint=metrics
           image: traefik:v3.3.3

From d99bb06eeb9474d3522c2f72e0fb7bc20d8be86b Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Thu, 20 Nov 2025 13:11:13 -0300
Subject: [PATCH 56/71] monitoring: reenable dcgm exporter

---
 services/monitoring/dcgm-exporter.yaml | 2 +-
 services/monitoring/kustomization.yaml | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml
index eaa3930..9a4a1d4 100644
--- a/services/monitoring/dcgm-exporter.yaml
+++ b/services/monitoring/dcgm-exporter.yaml
@@ -35,7 +35,7 @@ spec:
         - operator: Exists
       containers:
         - name: dcgm-exporter
-          image: docker.io/nvidia/dcgm-exporter:3.3.5-1-ubuntu22.04
+          image: registry.bstein.dev/monitoring/dcgm:4.4.2-1-ubuntu22.04
           imagePullPolicy: IfNotPresent
           ports:
             - name: metrics
diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml
index 76263c1..3164862 100644
--- a/services/monitoring/kustomization.yaml
+++ b/services/monitoring/kustomization.yaml
@@ -10,5 +10,6 @@ resources:
   - grafana-dashboard-nodes.yaml
   - grafana-dashboard-storage.yaml
   - grafana-dashboard-network.yaml
+  - dcgm-exporter.yaml
   - grafana-folders.yaml
   - helmrelease.yaml

From 5b89b0533e96cf37f87591afd356f3da627b341a Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 2 Dec 2025 11:54:53 -0300
Subject: [PATCH 57/71] monitoring: use mirrored dcgm-exporter tag

---
 services/monitoring/README.md          | 12 ++++++++++++
 services/monitoring/dcgm-exporter.yaml |  2 +-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/services/monitoring/README.md b/services/monitoring/README.md
index 74baf08..0e8885a 100644
--- a/services/monitoring/README.md
+++ b/services/monitoring/README.md
@@ -13,3 +13,15 @@ kubectl create secret generic grafana-admin \
 ```
 
 Update the password whenever you rotate credentials.
+
+## DCGM exporter image
+
+The NVIDIA GPU metrics DaemonSet expects `registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04`, mirrored from `docker.io/nvidia/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04`. Refresh it in Zot when bumping versions:
+
+```bash
+skopeo copy \
+  docker://docker.io/nvidia/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04 \
+  docker://registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04
+```
+
+Once you have finished mirroring from the control-plane, you can remove the temporary tooling with `sudo apt-get purge -y skopeo && sudo apt-get autoremove -y` and clear any registry credentials saved in `~/.config/containers/auth.json`.
diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml
index 9a4a1d4..766cf7b 100644
--- a/services/monitoring/dcgm-exporter.yaml
+++ b/services/monitoring/dcgm-exporter.yaml
@@ -35,7 +35,7 @@ spec:
         - operator: Exists
       containers:
         - name: dcgm-exporter
-          image: registry.bstein.dev/monitoring/dcgm:4.4.2-1-ubuntu22.04
+          image: registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04
           imagePullPolicy: IfNotPresent
           ports:
             - name: metrics

From d87a1dbc473230e57d6fcf229121ad7c8b20b302 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 2 Dec 2025 11:59:55 -0300
Subject: [PATCH 58/71] monitoring: allow dcgm rollout with unavailable node

---
 services/monitoring/dcgm-exporter.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml
index 766cf7b..53f4e28 100644
--- a/services/monitoring/dcgm-exporter.yaml
+++ b/services/monitoring/dcgm-exporter.yaml
@@ -10,6 +10,9 @@ spec:
   selector:
     matchLabels:
       app: dcgm-exporter
+  updateStrategy:
+    rollingUpdate:
+      maxUnavailable: 2
   template:
     metadata:
       labels:

From ee923df567e1e79986f1ccde2b03922b0927a542 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 2 Dec 2025 12:07:11 -0300
Subject: [PATCH 59/71] monitoring: add registry pull secret for dcgm-exporter

---
 services/monitoring/dcgm-exporter.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml
index 53f4e28..f23d742 100644
--- a/services/monitoring/dcgm-exporter.yaml
+++ b/services/monitoring/dcgm-exporter.yaml
@@ -55,6 +55,8 @@ spec:
           volumeMounts:
             - name: pod-resources
               mountPath: /var/lib/kubelet/pod-resources
+      imagePullSecrets:
+        - name: zot-regcred
       volumes:
         - name: pod-resources
           hostPath:

From a18c3e6f67acb19d9104b46ea21f1fd7283029a4 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 2 Dec 2025 12:19:16 -0300
Subject: [PATCH 60/71] monitoring: always pull dcgm-exporter tag

---
 services/monitoring/dcgm-exporter.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml
index f23d742..3dd6f34 100644
--- a/services/monitoring/dcgm-exporter.yaml
+++ b/services/monitoring/dcgm-exporter.yaml
@@ -39,7 +39,7 @@ spec:
       containers:
         - name: dcgm-exporter
           image: registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04
-          imagePullPolicy: IfNotPresent
+          imagePullPolicy: Always
           ports:
             - name: metrics
               containerPort: 9400

From e26ef44d1a6072d395a231c741a5d229e611a576 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 2 Dec 2025 12:25:30 -0300
Subject: [PATCH 61/71] monitoring: run dcgm-exporter with nvidia runtime

---
 services/monitoring/dcgm-exporter.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml
index 3dd6f34..06152e7 100644
--- a/services/monitoring/dcgm-exporter.yaml
+++ b/services/monitoring/dcgm-exporter.yaml
@@ -22,6 +22,7 @@ spec:
         prometheus.io/port: "9400"
     spec:
       serviceAccountName: default
+      runtimeClassName: nvidia
       affinity:
         nodeAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:

From f1d0970aa05300f1704e0b415188c2f0abb9ee8c Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 2 Dec 2025 12:36:24 -0300
Subject: [PATCH 62/71] monitoring: mirror dcgm-exporter as multi-arch

---
 services/monitoring/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/services/monitoring/README.md b/services/monitoring/README.md
index 0e8885a..835ae1d 100644
--- a/services/monitoring/README.md
+++ b/services/monitoring/README.md
@@ -20,6 +20,7 @@ The NVIDIA GPU metrics DaemonSet expects `registry.bstein.dev/monitoring/dcgm-ex
 
 ```bash
 skopeo copy \
+  --all \
   docker://docker.io/nvidia/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04 \
   docker://registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04
 ```

From eed67b3db01c493bce6ba5aa6ef8e70d1ec4a041 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 2 Dec 2025 13:16:00 -0300
Subject: [PATCH 63/71] monitoring: regen dashboards with gpu details

---
 AGENTS.md                                     |  42 ++++
 ...shboards.py => dashboards_render_atlas.py} | 177 ++++++++++++----
 services/monitoring/dashboards/atlas-gpu.json | 184 +++++++++++++++++
 .../monitoring/dashboards/atlas-network.json  |   5 +-
 .../monitoring/dashboards/atlas-overview.json |  14 +-
 .../monitoring/grafana-dashboard-gpu.yaml     | 193 ++++++++++++++++++
 .../monitoring/grafana-dashboard-network.yaml |   5 +-
 .../grafana-dashboard-overview.yaml           |  14 +-
 services/monitoring/helmrelease.yaml          |   9 +
 services/monitoring/kustomization.yaml        |   1 +
 10 files changed, 584 insertions(+), 60 deletions(-)
 create mode 100644 AGENTS.md
 rename scripts/{render_dashboards.py => dashboards_render_atlas.py} (90%)
 create mode 100644 services/monitoring/dashboards/atlas-gpu.json
 create mode 100644 services/monitoring/grafana-dashboard-gpu.yaml

diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..05838aa
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,42 @@
+
+
+# Repository Guidelines
+
+## Project Structure & Module Organization
+- `infrastructure/`: cluster-scoped building blocks (core, flux-system, traefik, longhorn). Add new platform features by mirroring this layout.
+- `services/`: workload manifests per app (`services/gitea/`, etc.) with `kustomization.yaml` plus one file per kind; keep diffs small and focused.
+- `dockerfiles/` hosts bespoke images, while `scripts/` stores operational Fish/Bash helpers—extend these directories instead of relying on ad-hoc commands.
+
+## Build, Test, and Development Commands
+- `kustomize build services/<app>` (or `kubectl kustomize ...`) renders manifests exactly as Flux will.
+- `kubectl apply --server-side --dry-run=client -k services/<app>` checks schema compatibility without touching the cluster.
+- `flux reconcile kustomization <name> --namespace flux-system --with-source` pulls the latest Git state after merges or hotfixes.
+- `fish scripts/flux_hammer.fish --help` explains the recovery tool; read it before running against production workloads.
+
+## Coding Style & Naming Conventions
+- YAML uses two-space indents; retain the leading path comment (e.g. `# services/gitea/deployment.yaml`) to speed code review.
+- Keep resource names lowercase kebab-case, align labels/selectors, and mirror namespaces with directory names.
+- List resources in `kustomization.yaml` from namespace/config, through storage, then workloads and networking for predictable diffs.
+- Scripts start with `#!/usr/bin/env fish` or bash, stay executable, and follow snake_case names such as `flux_hammer.fish`.
+
+## Testing Guidelines
+- Run `kustomize build` and the dry-run apply for every service you touch; capture failures before opening a PR.
+- `flux diff kustomization <name> --path services/<app>` previews reconciliations—link notable output when behavior shifts.
+- Docker edits: `docker build -f dockerfiles/Dockerfile.monerod .` (swap the file you changed) to verify image builds.
+
+## Commit & Pull Request Guidelines
+- Keep commit subjects short, present-tense, and optionally scoped (`gpu(titan-24): add RuntimeClass`); squash fixups before review.
+- Describe linked issues, affected services, and required operator steps (e.g. `flux reconcile kustomization services-gitea`) in the PR body.
+- Focus each PR on one kustomization or service and update `infrastructure/flux-system` when Flux must track new folders.
+- Record the validation you ran (dry-runs, diffs, builds) and add screenshots only when ingress or UI behavior changes.
+
+## Security & Configuration Tips
+- Never commit credentials; use Vault workflows (`services/vault/`) or SOPS-encrypted manifests wired through `infrastructure/flux-system`.
+- Node selectors and tolerations gate workloads to hardware like `hardware: rpi4`; confirm labels before scaling or renaming nodes.
+- Pin external images by digest or rely on Flux image automation to follow approved tags and avoid drift.
+
+## Dashboard roadmap / context (2025-12-02)
+- Atlas dashboards are generated via `scripts/dashboards_render_atlas.py --build`, which writes JSON under `services/monitoring/dashboards/` and ConfigMaps under `services/monitoring/`. Keep the Grafana manifests in sync by regenerating after edits.
+- Atlas Overview panels are paired with internal dashboards (pods, nodes, storage, network, GPU). A new `atlas-gpu` internal dashboard holds the detailed GPU metrics that feed the overview share pie.
+- Old Grafana folders (`Atlas Storage`, `Atlas SRE`, `Atlas Public`, `Atlas Nodes`) should be removed in the Grafana UI when convenient; only `Atlas Overview` and `Atlas Internal` should remain provisioned.
+- Future work: add a separate generator (e.g., `dashboards_render_oceanus.py`) for SUI/oceanus validation dashboards, mirroring the atlas pattern of internal dashboards feeding a public overview.
diff --git a/scripts/render_dashboards.py b/scripts/dashboards_render_atlas.py
similarity index 90%
rename from scripts/render_dashboards.py
rename to scripts/dashboards_render_atlas.py
index 812a931..97070d2 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/dashboards_render_atlas.py
@@ -2,8 +2,8 @@
 """Generate Atlas Grafana dashboards and render them into ConfigMaps.
 
 Usage:
-  scripts/render_dashboards.py --build   # rebuild JSON + ConfigMaps
-  scripts/render_dashboards.py           # re-render ConfigMaps from JSON
+  scripts/dashboards_render_atlas.py --build   # rebuild JSON + ConfigMaps
+  scripts/dashboards_render_atlas.py           # re-render ConfigMaps from JSON
 """
 
 import argparse
@@ -198,7 +198,6 @@ STUCK_TERMINATING_EXPR = (
     ' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)'
     '))'
 )
-
 PROBLEM_TABLE_EXPR = (
     "(time() - kube_pod_created{pod!=\"\"}) "
     "* on(namespace,pod) group_left(node) kube_pod_info "
@@ -489,6 +488,47 @@ def pie_panel(panel_id, title, expr, grid):
     }
 
 
+def bargauge_panel(panel_id, title, expr, grid, *, unit="none", links=None):  # unit and links are keyword-only
+    """Return a bar gauge panel with label-aware reduction."""
+    panel = {
+        "id": panel_id,
+        "type": "bargauge",
+        "title": title,
+        "datasource": PROM_DS,
+        "gridPos": grid,
+        "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{node}}"}],  # single query; legend is fixed to the node label
+        "fieldConfig": {
+            "defaults": {
+                "unit": unit,
+                "min": 0,
+                "max": 100 if unit == "percent" else None,  # upper bound is set only for percent; otherwise left as None
+                "thresholds": {
+                    "mode": "absolute",  # the steps below are fixed at 50/70/85 regardless of the unit passed in
+                    "steps": [
+                        {"color": "green", "value": None},
+                        {"color": "yellow", "value": 50},
+                        {"color": "orange", "value": 70},
+                        {"color": "red", "value": 85},
+                    ],
+                },
+            },
+            "overrides": [],
+        },
+        "options": {
+            "displayMode": "gradient",
+            "orientation": "horizontal",
+            "reduceOptions": {
+                "calcs": ["lastNotNull"],
+                "fields": "/.*/",  # NOTE(review): presumably a match-all field pattern; confirm against Grafana docs
+                "values": False,
+            },
+        },
+    }
+    if links:  # the "links" key is added only when a truthy value is passed
+        panel["links"] = links
+    return panel
+
+
 def text_panel(panel_id, title, content, grid):
     return {
         "id": panel_id,
@@ -554,6 +594,7 @@ def build_overview():
             link_to("atlas-pods"),
         ),
     ]
+
     def gauge_grid(idx):
         width = GAUGE_WIDTHS[idx] if idx < len(GAUGE_WIDTHS) else 4
         x = sum(GAUGE_WIDTHS[:idx])
@@ -806,38 +847,14 @@ def build_overview():
         )
     )
     panels.append(
-        {
-            "id": 22,
-            "type": "bargauge",
-            "title": "Nodes closest to full root disks",
-            "datasource": PROM_DS,
-            "gridPos": {"h": 8, "w": 12, "x": 12, "y": 47},
-            "targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A", "legendFormat": "{{node}}"}],
-            "fieldConfig": {
-                "defaults": {
-                    "unit": "percent",
-                    "min": 0,
-                    "max": 100,
-                    "thresholds": {
-                        "mode": "absolute",
-                        "steps": [
-                            {"color": "green", "value": None},
-                            {"color": "yellow", "value": 50},
-                            {"color": "orange", "value": 70},
-                            {"color": "red", "value": 85},
-                        ],
-                    },
-                },
-                "overrides": [],
-            },
-            "options": {
-                "displayMode": "gradient",
-                "orientation": "horizontal",
-                "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
-            },
-            "links": link_to("atlas-storage"),
-            "transformations": [{"id": "labelsToFields", "options": {}}],
-        }
+        bargauge_panel(
+            22,
+            "Nodes closest to full root disks",
+            f"topk(8, {root_usage_expr()})",
+            {"h": 8, "w": 12, "x": 12, "y": 47},
+            unit="percent",
+            links=link_to("atlas-storage"),
+        )
     )
 
     return {
@@ -857,6 +874,7 @@ def build_overview():
             {"title": "Atlas Nodes", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False},
             {"title": "Atlas Storage", "type": "dashboard", "dashboardUid": "atlas-storage", "keepTime": False},
             {"title": "Atlas Network", "type": "dashboard", "dashboardUid": "atlas-network", "keepTime": False},
+            {"title": "Atlas GPU", "type": "dashboard", "dashboardUid": "atlas-gpu", "keepTime": False},
         ],
     }
 
@@ -1179,13 +1197,31 @@ def build_storage_dashboard():
 def build_network_dashboard():
     panels = []
     panels.append(
-        stat_panel(1, "Ingress traffic", NET_INGRESS_EXPR, {"h": 4, "w": 8, "x": 0, "y": 0}, unit="Bps")
+        stat_panel(
+            1,
+            "Ingress traffic",
+            NET_INGRESS_EXPR,
+            {"h": 4, "w": 8, "x": 0, "y": 0},
+            unit="Bps",
+        )
     )
     panels.append(
-        stat_panel(2, "Egress traffic", NET_EGRESS_EXPR, {"h": 4, "w": 8, "x": 8, "y": 0}, unit="Bps")
+        stat_panel(
+            2,
+            "Egress traffic",
+            NET_EGRESS_EXPR,
+            {"h": 4, "w": 8, "x": 8, "y": 0},
+            unit="Bps",
+        )
     )
     panels.append(
-        stat_panel(3, "Intra-cluster traffic", NET_INTERNAL_EXPR, {"h": 4, "w": 8, "x": 16, "y": 0}, unit="Bps")
+        stat_panel(
+            3,
+            "Intra-cluster traffic",
+            NET_INTERNAL_EXPR,
+            {"h": 4, "w": 8, "x": 16, "y": 0},
+            unit="Bps",
+        )
     )
     panels.append(
         stat_panel(
@@ -1195,14 +1231,13 @@ def build_network_dashboard():
             {"h": 4, "w": 8, "x": 0, "y": 4},
             unit="req/s",
             legend="{{router}}",
-            instant=True,
         )
     )
     panels.append(
         timeseries_panel(
             5,
             "Per-node throughput",
-            node_net_expr(),
+            f'avg by (node) (({NET_NODE_TX_PHYS} + {NET_NODE_RX_PHYS}) * on(instance) group_left(node) {NODE_INFO})',
             {"h": 8, "w": 24, "x": 0, "y": 8},
             unit="Bps",
             legend="{{node}}",
@@ -1270,6 +1305,64 @@ def build_network_dashboard():
     }
 
 
+def build_gpu_dashboard():  # build the "Atlas GPU" dashboard dict: four 12-wide, 8-high panels in two rows
+    panels = []
+    panels.append(
+        pie_panel(
+            1,
+            "Namespace GPU share",
+            namespace_gpu_share_expr(),
+            {"h": 8, "w": 12, "x": 0, "y": 0},
+        )
+    )
+    panels.append(
+        timeseries_panel(
+            2,
+            "GPU util by namespace",
+            NAMESPACE_GPU_USAGE,
+            {"h": 8, "w": 12, "x": 12, "y": 0},
+            unit="percent",
+            legend="{{namespace}}",
+            legend_display="table",
+            legend_placement="right",
+        )
+    )
+    panels.append(
+        timeseries_panel(
+            3,
+            "GPU util by node",
+            'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})',  # grouped by the Hostname label, matching the legend below
+            {"h": 8, "w": 12, "x": 0, "y": 8},
+            unit="percent",
+            legend="{{Hostname}}",
+            legend_display="table",
+            legend_placement="right",
+        )
+    )
+    panels.append(
+        table_panel(
+            4,
+            "Top pods by GPU util",
+            'topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=""}) by (namespace,pod,Hostname))',  # at most ten rows
+            {"h": 8, "w": 12, "x": 12, "y": 8},
+            unit="percent",
+            transformations=[{"id": "labelsToFields", "options": {}}],
+        )
+    )
+    return {
+        "uid": "atlas-gpu",  # same uid the overview dashboard's "Atlas GPU" link points at
+        "title": "Atlas GPU",
+        "folderUid": PRIVATE_FOLDER,
+        "editable": True,
+        "panels": panels,
+        "time": {"from": "now-12h", "to": "now"},  # default time range: the last 12 hours
+        "annotations": {"list": []},
+        "schemaVersion": 39,
+        "style": "dark",
+        "tags": ["atlas", "gpu"],
+    }
+
+
 DASHBOARDS = {
     "atlas-overview": {
         "builder": build_overview,
@@ -1291,6 +1384,10 @@ DASHBOARDS = {
         "builder": build_network_dashboard,
         "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-network.yaml",
     },
+    "atlas-gpu": {
+        "builder": build_gpu_dashboard,
+        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gpu.yaml",
+    },
 }
 
 
diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json
new file mode 100644
index 0000000..da235a5
--- /dev/null
+++ b/services/monitoring/dashboards/atlas-gpu.json
@@ -0,0 +1,184 @@
+{
+  "uid": "atlas-gpu",
+  "title": "Atlas GPU",
+  "folderUid": "atlas-internal",
+  "editable": true,
+  "panels": [
+    {
+      "id": 1,
+      "type": "piechart",
+      "title": "Namespace GPU share",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+          "refId": "A",
+          "legendFormat": "{{namespace}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent",
+          "color": {
+            "mode": "palette-classic"
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "list",
+          "placement": "right"
+        },
+        "pieType": "pie",
+        "displayLabels": [
+          "percent"
+        ],
+        "tooltip": {
+          "mode": "single"
+        },
+        "colorScheme": "interpolateSpectral",
+        "colorBy": "value",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
+      }
+    },
+    {
+      "id": 2,
+      "type": "timeseries",
+      "title": "GPU util by namespace",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)",
+          "refId": "A",
+          "legendFormat": "{{namespace}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "right"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      }
+    },
+    {
+      "id": 3,
+      "type": "timeseries",
+      "title": "GPU util by node",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 8
+      },
+      "targets": [
+        {
+          "expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})",
+          "refId": "A",
+          "legendFormat": "{{Hostname}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "right"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      }
+    },
+    {
+      "id": 4,
+      "type": "table",
+      "title": "Top pods by GPU util",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 8
+      },
+      "targets": [
+        {
+          "expr": "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "showHeader": true
+      },
+      "transformations": [
+        {
+          "id": "labelsToFields",
+          "options": {}
+        }
+      ]
+    }
+  ],
+  "time": {
+    "from": "now-12h",
+    "to": "now"
+  },
+  "annotations": {
+    "list": []
+  },
+  "schemaVersion": 39,
+  "style": "dark",
+  "tags": [
+    "atlas",
+    "gpu"
+  ]
+}
diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json
index 9005eb9..f2291b7 100644
--- a/services/monitoring/dashboards/atlas-network.json
+++ b/services/monitoring/dashboards/atlas-network.json
@@ -202,8 +202,7 @@
         {
           "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
           "refId": "A",
-          "legendFormat": "{{router}}",
-          "instant": true
+          "legendFormat": "{{router}}"
         }
       ],
       "fieldConfig": {
@@ -262,7 +261,7 @@
       },
       "targets": [
         {
-          "expr": "avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
+          "expr": "avg by (node) ((sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0) + sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
           "refId": "A",
           "legendFormat": "{{node}}"
         }
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 93a246b..4e3c357 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -1456,7 +1456,7 @@
           "calcs": [
             "lastNotNull"
           ],
-          "fields": "",
+          "fields": "/.*/",
           "values": false
         }
       },
@@ -1466,12 +1466,6 @@
           "url": "/d/atlas-storage",
           "targetBlank": true
         }
-      ],
-      "transformations": [
-        {
-          "id": "labelsToFields",
-          "options": {}
-        }
       ]
     }
   ],
@@ -1512,6 +1506,12 @@
       "type": "dashboard",
       "dashboardUid": "atlas-network",
       "keepTime": false
+    },
+    {
+      "title": "Atlas GPU",
+      "type": "dashboard",
+      "dashboardUid": "atlas-gpu",
+      "keepTime": false
     }
   ]
 }
diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml
new file mode 100644
index 0000000..13262d6
--- /dev/null
+++ b/services/monitoring/grafana-dashboard-gpu.yaml
@@ -0,0 +1,193 @@
+# services/monitoring/grafana-dashboard-gpu.yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-gpu
+  labels:
+    grafana_dashboard: "1"
+data:
+  atlas-gpu.json: |
+    {
+      "uid": "atlas-gpu",
+      "title": "Atlas GPU",
+      "folderUid": "atlas-internal",
+      "editable": true,
+      "panels": [
+        {
+          "id": 1,
+          "type": "piechart",
+          "title": "Namespace GPU share",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+              "refId": "A",
+              "legendFormat": "{{namespace}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent",
+              "color": {
+                "mode": "palette-classic"
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "list",
+              "placement": "right"
+            },
+            "pieType": "pie",
+            "displayLabels": [
+              "percent"
+            ],
+            "tooltip": {
+              "mode": "single"
+            },
+            "colorScheme": "interpolateSpectral",
+            "colorBy": "value",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          }
+        },
+        {
+          "id": 2,
+          "type": "timeseries",
+          "title": "GPU util by namespace",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)",
+              "refId": "A",
+              "legendFormat": "{{namespace}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "right"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 3,
+          "type": "timeseries",
+          "title": "GPU util by node",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 8
+          },
+          "targets": [
+            {
+              "expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})",
+              "refId": "A",
+              "legendFormat": "{{Hostname}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "right"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 4,
+          "type": "table",
+          "title": "Top pods by GPU util",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 8
+          },
+          "targets": [
+            {
+              "expr": "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "showHeader": true
+          },
+          "transformations": [
+            {
+              "id": "labelsToFields",
+              "options": {}
+            }
+          ]
+        }
+      ],
+      "time": {
+        "from": "now-12h",
+        "to": "now"
+      },
+      "annotations": {
+        "list": []
+      },
+      "schemaVersion": 39,
+      "style": "dark",
+      "tags": [
+        "atlas",
+        "gpu"
+      ]
+    }
diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml
index d2372de..4b78fb9 100644
--- a/services/monitoring/grafana-dashboard-network.yaml
+++ b/services/monitoring/grafana-dashboard-network.yaml
@@ -211,8 +211,7 @@ data:
             {
               "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
               "refId": "A",
-              "legendFormat": "{{router}}",
-              "instant": true
+              "legendFormat": "{{router}}"
             }
           ],
           "fieldConfig": {
@@ -271,7 +270,7 @@ data:
           },
           "targets": [
             {
-              "expr": "avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
+              "expr": "avg by (node) ((sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0) + sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
               "refId": "A",
               "legendFormat": "{{node}}"
             }
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index ebd9b2b..512adf9 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -1465,7 +1465,7 @@ data:
               "calcs": [
                 "lastNotNull"
               ],
-              "fields": "",
+              "fields": "/.*/",
               "values": false
             }
           },
@@ -1475,12 +1475,6 @@ data:
               "url": "/d/atlas-storage",
               "targetBlank": true
             }
-          ],
-          "transformations": [
-            {
-              "id": "labelsToFields",
-              "options": {}
-            }
           ]
         }
       ],
@@ -1521,6 +1515,12 @@ data:
           "type": "dashboard",
           "dashboardUid": "atlas-network",
           "keepTime": false
+        },
+        {
+          "title": "Atlas GPU",
+          "type": "dashboard",
+          "dashboardUid": "atlas-gpu",
+          "keepTime": false
         }
       ]
     }
diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index 5a8f1ba..cf56b27 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -320,6 +320,14 @@ spec:
             editable: true
             options:
               path: /var/lib/grafana/dashboards/storage
+          - name: gpu
+            orgId: 1
+            folder: Atlas Internal
+            type: file
+            disableDeletion: false
+            editable: true
+            options:
+              path: /var/lib/grafana/dashboards/gpu
           - name: network
             orgId: 1
             folder: Atlas Internal
@@ -333,6 +341,7 @@ spec:
       pods: grafana-dashboard-pods
       nodes: grafana-dashboard-nodes
       storage: grafana-dashboard-storage
+      gpu: grafana-dashboard-gpu
       network: grafana-dashboard-network
     extraConfigmapMounts:
       - name: grafana-folders
diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml
index 3164862..a50a1c1 100644
--- a/services/monitoring/kustomization.yaml
+++ b/services/monitoring/kustomization.yaml
@@ -10,6 +10,7 @@ resources:
   - grafana-dashboard-nodes.yaml
   - grafana-dashboard-storage.yaml
   - grafana-dashboard-network.yaml
+  - grafana-dashboard-gpu.yaml
   - dcgm-exporter.yaml
   - grafana-folders.yaml
   - helmrelease.yaml

From a3dc9391eef5011902e410cab32246ee9d354ca9 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 2 Dec 2025 14:41:39 -0300
Subject: [PATCH 64/71] monitoring: polish dashboards and folders

---
 scripts/dashboards_render_atlas.py            | 119 +++++++++---------
 services/monitoring/dashboards/atlas-gpu.json |  10 +-
 .../monitoring/dashboards/atlas-network.json  |  18 +--
 .../monitoring/dashboards/atlas-nodes.json    |  12 +-
 .../monitoring/dashboards/atlas-overview.json |  85 +++++++------
 .../monitoring/dashboards/atlas-pods.json     |   8 +-
 .../monitoring/dashboards/atlas-storage.json  |  16 +--
 .../monitoring/grafana-dashboard-gpu.yaml     |  10 +-
 .../monitoring/grafana-dashboard-network.yaml |  18 +--
 .../monitoring/grafana-dashboard-nodes.yaml   |  12 +-
 .../grafana-dashboard-overview.yaml           |  85 +++++++------
 .../monitoring/grafana-dashboard-pods.yaml    |   8 +-
 .../monitoring/grafana-dashboard-storage.yaml |  16 +--
 services/monitoring/grafana-folders.yaml      |  11 +-
 services/monitoring/helmrelease.yaml          |   4 +-
 15 files changed, 238 insertions(+), 194 deletions(-)

diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index 97070d2..11bd2c8 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -32,7 +32,7 @@ data:
 )
 
 PROM_DS = {"type": "prometheus", "uid": "atlas-vm"}
-PUBLIC_FOLDER = "atlas-overview"
+PUBLIC_FOLDER = "overview"
 PRIVATE_FOLDER = "atlas-internal"
 
 PERCENT_THRESHOLDS = {
@@ -231,10 +231,13 @@ NAMESPACE_GPU_ALLOC = (
     'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}'
     ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
 )
-NAMESPACE_GPU_USAGE = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)'
+NAMESPACE_GPU_USAGE_SHARE = (  # per-namespace sum of 1h-averaged GPU util; "by" is only valid on an aggregation, so sum() wraps avg_over_time()
+    'sum(avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h])) by (namespace)'
+)
+NAMESPACE_GPU_USAGE_INSTANT = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)'
 NAMESPACE_GPU_RAW = (
     "("
-    + NAMESPACE_GPU_USAGE
+    + NAMESPACE_GPU_USAGE_SHARE
     + ") or on(namespace) ("
     + NAMESPACE_CPU_RAW
     + " * 0)"
@@ -519,7 +522,7 @@ def bargauge_panel(panel_id, title, expr, grid, *, unit="none", links=None):
             "orientation": "horizontal",
             "reduceOptions": {
                 "calcs": ["lastNotNull"],
-                "fields": "/.*/",
+                "fields": "Value",
                 "values": False,
             },
         },
@@ -555,7 +558,7 @@ def build_overview():
     row1_stats = [
         (
             1,
-            "Workers ready",
+            "Workers Ready",
             f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
             WORKER_SUFFIX,
             WORKER_TOTAL,
@@ -563,7 +566,7 @@ def build_overview():
         ),
         (
             2,
-            "Control plane ready",
+            "Control Plane Ready",
             f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
             CONTROL_SUFFIX,
             CONTROL_TOTAL,
@@ -571,7 +574,7 @@ def build_overview():
         ),
         (
             3,
-            "Control plane workloads",
+            "Control Plane Workloads",
             CONTROL_WORKLOADS_EXPR,
             None,
             4,
@@ -579,7 +582,7 @@ def build_overview():
         ),
         (
             4,
-            "Problem pods",
+            "Problem Pods",
             PROBLEM_PODS_EXPR,
             None,
             1,
@@ -587,7 +590,7 @@ def build_overview():
         ),
         (
             5,
-            "Stuck terminating",
+            "Stuck Terminating",
             STUCK_TERMINATING_EXPR,
             None,
             1,
@@ -644,7 +647,7 @@ def build_overview():
                 ],
             }
         width, x = gauge_grid(idx)
-        if panel_id == 3:
+        if panel_id in (3, 4, 5):
             panels.append(
                 stat_panel(
                     panel_id,
@@ -654,6 +657,7 @@ def build_overview():
                     thresholds=thresholds,
                     legend=None,
                     links=links,
+                    text_mode="value",
                 )
             )
         else:
@@ -693,10 +697,10 @@ def build_overview():
         )
 
     storage_panels = [
-        (23, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"),
-        (24, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"),
-        (25, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"),
-        (26, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"),
+        (23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"),
+        (24, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), "percent"),
+        (25, "Astreae Free", astreae_free_expr("/mnt/astreae"), "decbytes"),
+        (26, "Asteria Free", astreae_free_expr("/mnt/asteria"), "decbytes"),
     ]
     for idx, (panel_id, title, expr, unit) in enumerate(storage_panels):
         panels.append(
@@ -714,7 +718,7 @@ def build_overview():
     panels.append(
         pie_panel(
             11,
-            "Namespace CPU share",
+            "Namespace CPU Share",
             namespace_cpu_share_expr(),
             {"h": 9, "w": 8, "x": 0, "y": 16},
         )
@@ -722,7 +726,7 @@ def build_overview():
     panels.append(
         pie_panel(
             12,
-            "Namespace GPU share",
+            "Namespace GPU Share",
             namespace_gpu_share_expr(),
             {"h": 9, "w": 8, "x": 8, "y": 16},
         )
@@ -730,7 +734,7 @@ def build_overview():
     panels.append(
         pie_panel(
             13,
-            "Namespace RAM share",
+            "Namespace RAM Share",
             namespace_ram_share_expr(),
             {"h": 9, "w": 8, "x": 16, "y": 16},
         )
@@ -740,7 +744,7 @@ def build_overview():
     panels.append(
         timeseries_panel(
             14,
-            "Worker node CPU",
+            "Worker Node CPU",
             node_cpu_expr(worker_filter),
             {"h": 8, "w": 12, "x": 0, "y": 32},
             unit="percent",
@@ -754,7 +758,7 @@ def build_overview():
     panels.append(
         timeseries_panel(
             15,
-            "Worker node RAM",
+            "Worker Node RAM",
             node_mem_expr(worker_filter),
             {"h": 8, "w": 12, "x": 12, "y": 32},
             unit="percent",
@@ -794,7 +798,7 @@ def build_overview():
     panels.append(
         timeseries_panel(
             18,
-            "Cluster ingress throughput",
+            "Cluster Ingress Throughput",
             NET_INGRESS_EXPR,
             {"h": 7, "w": 8, "x": 0, "y": 25},
             unit="Bps",
@@ -807,7 +811,7 @@ def build_overview():
     panels.append(
         timeseries_panel(
             19,
-            "Cluster egress throughput",
+            "Cluster Egress Throughput",
             NET_EGRESS_EXPR,
             {"h": 7, "w": 8, "x": 8, "y": 25},
             unit="Bps",
@@ -820,7 +824,7 @@ def build_overview():
     panels.append(
         timeseries_panel(
             20,
-            "Intra-cluster throughput",
+            "Intra-Cluster Throughput",
             NET_INTERNAL_EXPR,
             {"h": 7, "w": 8, "x": 16, "y": 25},
             unit="Bps",
@@ -834,7 +838,7 @@ def build_overview():
     panels.append(
         timeseries_panel(
             21,
-            "Root filesystem usage",
+            "Root Filesystem Usage",
             root_usage_expr(),
             {"h": 8, "w": 12, "x": 0, "y": 47},
             unit="percent",
@@ -849,7 +853,7 @@ def build_overview():
     panels.append(
         bargauge_panel(
             22,
-            "Nodes closest to full root disks",
+            "Nodes Closest to Full Root Disks",
             f"topk(8, {root_usage_expr()})",
             {"h": 8, "w": 12, "x": 12, "y": 47},
             unit="percent",
@@ -868,7 +872,8 @@ def build_overview():
         "style": "dark",
         "tags": ["atlas", "overview"],
         "templating": {"list": []},
-        "time": {"from": "now-12h", "to": "now"},
+        "time": {"from": "now-1h", "to": "now"},
+        "refresh": "1m",
         "links": [
             {"title": "Atlas Pods", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": False},
             {"title": "Atlas Nodes", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False},
@@ -884,7 +889,7 @@ def build_pods_dashboard():
     panels.append(
         stat_panel(
             1,
-            "Problem pods",
+            "Problem Pods",
             PROBLEM_PODS_EXPR,
             {"h": 4, "w": 6, "x": 0, "y": 0},
             thresholds={
@@ -914,7 +919,7 @@ def build_pods_dashboard():
     panels.append(
         stat_panel(
             3,
-            "Stuck terminating (>10m)",
+            "Stuck Terminating (>10m)",
             STUCK_TERMINATING_EXPR,
             {"h": 4, "w": 6, "x": 12, "y": 0},
             thresholds={
@@ -929,7 +934,7 @@ def build_pods_dashboard():
     panels.append(
         stat_panel(
             4,
-            "Control plane workloads",
+            "Control Plane Workloads",
             f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
             {"h": 4, "w": 6, "x": 18, "y": 0},
             thresholds={
@@ -945,7 +950,7 @@ def build_pods_dashboard():
     panels.append(
         table_panel(
             5,
-            "Pods not running",
+            "Pods Not Running",
             PROBLEM_TABLE_EXPR,
             {"h": 10, "w": 24, "x": 0, "y": 4},
             unit="s",
@@ -994,7 +999,7 @@ def build_nodes_dashboard():
     panels.append(
         stat_panel(
             1,
-            "Worker nodes ready",
+            "Worker Nodes Ready",
             f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
             {"h": 4, "w": 8, "x": 0, "y": 0},
             value_suffix=WORKER_SUFFIX,
@@ -1003,7 +1008,7 @@ def build_nodes_dashboard():
     panels.append(
         stat_panel(
             2,
-            "Control plane ready",
+            "Control Plane Ready",
             f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
             {"h": 4, "w": 8, "x": 8, "y": 0},
             value_suffix=CONTROL_SUFFIX,
@@ -1012,7 +1017,7 @@ def build_nodes_dashboard():
     panels.append(
         stat_panel(
             3,
-            "Control plane workloads",
+            "Control Plane Workloads",
             f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
             {"h": 4, "w": 8, "x": 16, "y": 0},
         )
@@ -1046,7 +1051,7 @@ def build_nodes_dashboard():
     panels.append(
         timeseries_panel(
             6,
-            "Control plane (incl. titan-db) CPU",
+            "Control Plane (incl. titan-db) CPU",
             node_cpu_expr(CONTROL_ALL_REGEX),
             {"h": 9, "w": 12, "x": 0, "y": 22},
             unit="percent",
@@ -1058,7 +1063,7 @@ def build_nodes_dashboard():
     panels.append(
         timeseries_panel(
             7,
-            "Control plane (incl. titan-db) RAM",
+            "Control Plane (incl. titan-db) RAM",
             node_mem_expr(CONTROL_ALL_REGEX),
             {"h": 9, "w": 12, "x": 12, "y": 22},
             unit="percent",
@@ -1070,7 +1075,7 @@ def build_nodes_dashboard():
     panels.append(
         timeseries_panel(
             8,
-            "Root filesystem usage",
+            "Root Filesystem Usage",
             root_usage_expr(),
             {"h": 9, "w": 24, "x": 0, "y": 31},
             unit="percent",
@@ -1099,7 +1104,7 @@ def build_storage_dashboard():
     panels.append(
         stat_panel(
             1,
-            "Astreae usage",
+            "Astreae Usage",
             astreae_usage_expr("/mnt/astreae"),
             {"h": 5, "w": 6, "x": 0, "y": 0},
             unit="percent",
@@ -1109,7 +1114,7 @@ def build_storage_dashboard():
     panels.append(
         stat_panel(
             2,
-            "Asteria usage",
+            "Asteria Usage",
             astreae_usage_expr("/mnt/asteria"),
             {"h": 5, "w": 6, "x": 6, "y": 0},
             unit="percent",
@@ -1119,7 +1124,7 @@ def build_storage_dashboard():
     panels.append(
         stat_panel(
             3,
-            "Astreae free",
+            "Astreae Free",
             astreae_free_expr("/mnt/astreae"),
             {"h": 5, "w": 6, "x": 12, "y": 0},
             unit="decbytes",
@@ -1128,7 +1133,7 @@ def build_storage_dashboard():
     panels.append(
         stat_panel(
             4,
-            "Asteria free",
+            "Asteria Free",
             astreae_free_expr("/mnt/asteria"),
             {"h": 5, "w": 6, "x": 18, "y": 0},
             unit="decbytes",
@@ -1137,7 +1142,7 @@ def build_storage_dashboard():
     panels.append(
         timeseries_panel(
             5,
-            "Astreae per-node usage",
+            "Astreae Per-Node Usage",
             filesystem_usage_expr("/mnt/astreae", LONGHORN_NODE_REGEX),
             {"h": 9, "w": 12, "x": 0, "y": 5},
             unit="percent",
@@ -1150,7 +1155,7 @@ def build_storage_dashboard():
     panels.append(
         timeseries_panel(
             6,
-            "Asteria per-node usage",
+            "Asteria Per-Node Usage",
             filesystem_usage_expr("/mnt/asteria", LONGHORN_NODE_REGEX),
             {"h": 9, "w": 12, "x": 12, "y": 5},
             unit="percent",
@@ -1163,7 +1168,7 @@ def build_storage_dashboard():
     panels.append(
         timeseries_panel(
             7,
-            "Astreae usage history",
+            "Astreae Usage History",
             astreae_usage_expr("/mnt/astreae"),
             {"h": 9, "w": 12, "x": 0, "y": 14},
             unit="percent",
@@ -1173,7 +1178,7 @@ def build_storage_dashboard():
     panels.append(
         timeseries_panel(
             8,
-            "Asteria usage history",
+            "Asteria Usage History",
             astreae_usage_expr("/mnt/asteria"),
             {"h": 9, "w": 12, "x": 12, "y": 14},
             unit="percent",
@@ -1199,7 +1204,7 @@ def build_network_dashboard():
     panels.append(
         stat_panel(
             1,
-            "Ingress traffic",
+            "Ingress Traffic",
             NET_INGRESS_EXPR,
             {"h": 4, "w": 8, "x": 0, "y": 0},
             unit="Bps",
@@ -1208,7 +1213,7 @@ def build_network_dashboard():
     panels.append(
         stat_panel(
             2,
-            "Egress traffic",
+            "Egress Traffic",
             NET_EGRESS_EXPR,
             {"h": 4, "w": 8, "x": 8, "y": 0},
             unit="Bps",
@@ -1217,7 +1222,7 @@ def build_network_dashboard():
     panels.append(
         stat_panel(
             3,
-            "Intra-cluster traffic",
+            "Intra-Cluster Traffic",
             NET_INTERNAL_EXPR,
             {"h": 4, "w": 8, "x": 16, "y": 0},
             unit="Bps",
@@ -1226,7 +1231,7 @@ def build_network_dashboard():
     panels.append(
         stat_panel(
             4,
-            "Top router req/s",
+            "Top Router req/s",
             f"topk(1, {TRAEFIK_ROUTER_EXPR})",
             {"h": 4, "w": 8, "x": 0, "y": 4},
             unit="req/s",
@@ -1236,7 +1241,7 @@ def build_network_dashboard():
     panels.append(
         timeseries_panel(
             5,
-            "Per-node throughput",
+            "Per-Node Throughput",
             f'avg by (node) (({NET_NODE_TX_PHYS} + {NET_NODE_RX_PHYS}) * on(instance) group_left(node) {NODE_INFO})',
             {"h": 8, "w": 24, "x": 0, "y": 8},
             unit="Bps",
@@ -1248,7 +1253,7 @@ def build_network_dashboard():
     panels.append(
         table_panel(
             6,
-            "Top namespaces",
+            "Top Namespaces",
             'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) '
             '+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))',
             {"h": 9, "w": 12, "x": 0, "y": 16},
@@ -1259,7 +1264,7 @@ def build_network_dashboard():
     panels.append(
         table_panel(
             7,
-            "Top pods",
+            "Top Pods",
             'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) '
             '+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))',
             {"h": 9, "w": 12, "x": 12, "y": 16},
@@ -1270,7 +1275,7 @@ def build_network_dashboard():
     panels.append(
         timeseries_panel(
             8,
-            "Traefik routers (req/s)",
+            "Traefik Routers (req/s)",
             f"topk(10, {TRAEFIK_ROUTER_EXPR})",
             {"h": 9, "w": 12, "x": 0, "y": 25},
             unit="req/s",
@@ -1282,7 +1287,7 @@ def build_network_dashboard():
     panels.append(
         timeseries_panel(
             9,
-            "Traefik entrypoints (req/s)",
+            "Traefik Entrypoints (req/s)",
             'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))',
             {"h": 9, "w": 12, "x": 12, "y": 25},
             unit="req/s",
@@ -1310,7 +1315,7 @@ def build_gpu_dashboard():
     panels.append(
         pie_panel(
             1,
-            "Namespace GPU share",
+            "Namespace GPU Share",
             namespace_gpu_share_expr(),
             {"h": 8, "w": 12, "x": 0, "y": 0},
         )
@@ -1318,8 +1323,8 @@ def build_gpu_dashboard():
     panels.append(
         timeseries_panel(
             2,
-            "GPU util by namespace",
-            NAMESPACE_GPU_USAGE,
+            "GPU Util by Namespace",
+            NAMESPACE_GPU_USAGE_INSTANT,
             {"h": 8, "w": 12, "x": 12, "y": 0},
             unit="percent",
             legend="{{namespace}}",
@@ -1330,7 +1335,7 @@ def build_gpu_dashboard():
     panels.append(
         timeseries_panel(
             3,
-            "GPU util by node",
+            "GPU Util by Node",
             'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})',
             {"h": 8, "w": 12, "x": 0, "y": 8},
             unit="percent",
@@ -1342,7 +1347,7 @@ def build_gpu_dashboard():
     panels.append(
         table_panel(
             4,
-            "Top pods by GPU util",
+            "Top Pods by GPU Util",
             'topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=""}) by (namespace,pod,Hostname))',
             {"h": 8, "w": 12, "x": 12, "y": 8},
             unit="percent",
diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json
index da235a5..8c1367b 100644
--- a/services/monitoring/dashboards/atlas-gpu.json
+++ b/services/monitoring/dashboards/atlas-gpu.json
@@ -7,7 +7,7 @@
     {
       "id": 1,
       "type": "piechart",
-      "title": "Namespace GPU share",
+      "title": "Namespace GPU Share",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -20,7 +20,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+          "expr": "100 * ( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -60,7 +60,7 @@
     {
       "id": 2,
       "type": "timeseries",
-      "title": "GPU util by namespace",
+      "title": "GPU Util by Namespace",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -97,7 +97,7 @@
     {
       "id": 3,
       "type": "timeseries",
-      "title": "GPU util by node",
+      "title": "GPU Util by Node",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -134,7 +134,7 @@
     {
       "id": 4,
       "type": "table",
-      "title": "Top pods by GPU util",
+      "title": "Top Pods by GPU Util",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json
index f2291b7..ff0af9b 100644
--- a/services/monitoring/dashboards/atlas-network.json
+++ b/services/monitoring/dashboards/atlas-network.json
@@ -7,7 +7,7 @@
     {
       "id": 1,
       "type": "stat",
-      "title": "Ingress traffic",
+      "title": "Ingress Traffic",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -67,7 +67,7 @@
     {
       "id": 2,
       "type": "stat",
-      "title": "Egress traffic",
+      "title": "Egress Traffic",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -127,7 +127,7 @@
     {
       "id": 3,
       "type": "stat",
-      "title": "Intra-cluster traffic",
+      "title": "Intra-Cluster Traffic",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -187,7 +187,7 @@
     {
       "id": 4,
       "type": "stat",
-      "title": "Top router req/s",
+      "title": "Top Router req/s",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -248,7 +248,7 @@
     {
       "id": 5,
       "type": "timeseries",
-      "title": "Per-node throughput",
+      "title": "Per-Node Throughput",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -285,7 +285,7 @@
     {
       "id": 6,
       "type": "table",
-      "title": "Top namespaces",
+      "title": "Top Namespaces",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -321,7 +321,7 @@
     {
       "id": 7,
       "type": "table",
-      "title": "Top pods",
+      "title": "Top Pods",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -357,7 +357,7 @@
     {
       "id": 8,
       "type": "timeseries",
-      "title": "Traefik routers (req/s)",
+      "title": "Traefik Routers (req/s)",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -394,7 +394,7 @@
     {
       "id": 9,
       "type": "timeseries",
-      "title": "Traefik entrypoints (req/s)",
+      "title": "Traefik Entrypoints (req/s)",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
diff --git a/services/monitoring/dashboards/atlas-nodes.json b/services/monitoring/dashboards/atlas-nodes.json
index 3cf784f..802fe5a 100644
--- a/services/monitoring/dashboards/atlas-nodes.json
+++ b/services/monitoring/dashboards/atlas-nodes.json
@@ -7,7 +7,7 @@
     {
       "id": 1,
       "type": "stat",
-      "title": "Worker nodes ready",
+      "title": "Worker Nodes Ready",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -68,7 +68,7 @@
     {
       "id": 2,
       "type": "stat",
-      "title": "Control plane ready",
+      "title": "Control Plane Ready",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -129,7 +129,7 @@
     {
       "id": 3,
       "type": "stat",
-      "title": "Control plane workloads",
+      "title": "Control Plane Workloads",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -269,7 +269,7 @@
     {
       "id": 6,
       "type": "timeseries",
-      "title": "Control plane (incl. titan-db) CPU",
+      "title": "Control Plane (incl. titan-db) CPU",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -306,7 +306,7 @@
     {
       "id": 7,
       "type": "timeseries",
-      "title": "Control plane (incl. titan-db) RAM",
+      "title": "Control Plane (incl. titan-db) RAM",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -343,7 +343,7 @@
     {
       "id": 8,
       "type": "timeseries",
-      "title": "Root filesystem usage",
+      "title": "Root Filesystem Usage",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 4e3c357..b556594 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -1,7 +1,7 @@
 {
   "uid": "atlas-overview",
   "title": "Atlas Overview",
-  "folderUid": "atlas-overview",
+  "folderUid": "overview",
   "editable": false,
   "annotations": {
     "list": []
@@ -10,7 +10,7 @@
     {
       "id": 1,
       "type": "gauge",
-      "title": "Workers ready",
+      "title": "Workers Ready",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -71,7 +71,7 @@
     {
       "id": 2,
       "type": "gauge",
-      "title": "Control plane ready",
+      "title": "Control Plane Ready",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -124,7 +124,7 @@
     {
       "id": 3,
       "type": "stat",
-      "title": "Control plane workloads",
+      "title": "Control Plane Workloads",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -198,8 +198,8 @@
     },
     {
       "id": 4,
-      "type": "gauge",
-      "title": "Problem pods",
+      "type": "stat",
+      "title": "Problem Pods",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -218,8 +218,10 @@
       ],
       "fieldConfig": {
         "defaults": {
-          "min": 0,
-          "max": 4,
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
           "thresholds": {
             "mode": "absolute",
             "steps": [
@@ -240,11 +242,18 @@
                 "value": 3
               }
             ]
+          },
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto"
           }
         },
         "overrides": []
       },
       "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
         "reduceOptions": {
           "calcs": [
             "lastNotNull"
@@ -252,9 +261,7 @@
           "fields": "",
           "values": false
         },
-        "orientation": "auto",
-        "showThresholdMarkers": false,
-        "showThresholdLabels": false
+        "textMode": "value"
       },
       "links": [
         {
@@ -266,8 +273,8 @@
     },
     {
       "id": 5,
-      "type": "gauge",
-      "title": "Stuck terminating",
+      "type": "stat",
+      "title": "Stuck Terminating",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -286,8 +293,10 @@
       ],
       "fieldConfig": {
         "defaults": {
-          "min": 0,
-          "max": 4,
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
           "thresholds": {
             "mode": "absolute",
             "steps": [
@@ -308,11 +317,18 @@
                 "value": 3
               }
             ]
+          },
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto"
           }
         },
         "overrides": []
       },
       "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
         "reduceOptions": {
           "calcs": [
             "lastNotNull"
@@ -320,9 +336,7 @@
           "fields": "",
           "values": false
         },
-        "orientation": "auto",
-        "showThresholdMarkers": false,
-        "showThresholdLabels": false
+        "textMode": "value"
       },
       "links": [
         {
@@ -619,7 +633,7 @@
     {
       "id": 23,
       "type": "stat",
-      "title": "Astreae usage",
+      "title": "Astreae Usage",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -690,7 +704,7 @@
     {
       "id": 24,
       "type": "stat",
-      "title": "Asteria usage",
+      "title": "Asteria Usage",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -761,7 +775,7 @@
     {
       "id": 25,
       "type": "stat",
-      "title": "Astreae free",
+      "title": "Astreae Free",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -828,7 +842,7 @@
     {
       "id": 26,
       "type": "stat",
-      "title": "Asteria free",
+      "title": "Asteria Free",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -895,7 +909,7 @@
     {
       "id": 11,
       "type": "piechart",
-      "title": "Namespace CPU share",
+      "title": "Namespace CPU Share",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -948,7 +962,7 @@
     {
       "id": 12,
       "type": "piechart",
-      "title": "Namespace GPU share",
+      "title": "Namespace GPU Share",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -961,7 +975,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+          "expr": "100 * ( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -1001,7 +1015,7 @@
     {
       "id": 13,
       "type": "piechart",
-      "title": "Namespace RAM share",
+      "title": "Namespace RAM Share",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -1054,7 +1068,7 @@
     {
       "id": 14,
       "type": "timeseries",
-      "title": "Worker node CPU",
+      "title": "Worker Node CPU",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -1101,7 +1115,7 @@
     {
       "id": 15,
       "type": "timeseries",
-      "title": "Worker node RAM",
+      "title": "Worker Node RAM",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -1222,7 +1236,7 @@
     {
       "id": 18,
       "type": "timeseries",
-      "title": "Cluster ingress throughput",
+      "title": "Cluster Ingress Throughput",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -1266,7 +1280,7 @@
     {
       "id": 19,
       "type": "timeseries",
-      "title": "Cluster egress throughput",
+      "title": "Cluster Egress Throughput",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -1310,7 +1324,7 @@
     {
       "id": 20,
       "type": "timeseries",
-      "title": "Intra-cluster throughput",
+      "title": "Intra-Cluster Throughput",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -1354,7 +1368,7 @@
     {
       "id": 21,
       "type": "timeseries",
-      "title": "Root filesystem usage",
+      "title": "Root Filesystem Usage",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -1402,7 +1416,7 @@
     {
       "id": 22,
       "type": "bargauge",
-      "title": "Nodes closest to full root disks",
+      "title": "Nodes Closest to Full Root Disks",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -1456,7 +1470,7 @@
           "calcs": [
             "lastNotNull"
           ],
-          "fields": "/.*/",
+          "fields": "Value",
           "values": false
         }
       },
@@ -1479,9 +1493,10 @@
     "list": []
   },
   "time": {
-    "from": "now-12h",
+    "from": "now-1h",
     "to": "now"
   },
+  "refresh": "1m",
   "links": [
     {
       "title": "Atlas Pods",
diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json
index f519d14..ef616e0 100644
--- a/services/monitoring/dashboards/atlas-pods.json
+++ b/services/monitoring/dashboards/atlas-pods.json
@@ -7,7 +7,7 @@
     {
       "id": 1,
       "type": "stat",
-      "title": "Problem pods",
+      "title": "Problem Pods",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -127,7 +127,7 @@
     {
       "id": 3,
       "type": "stat",
-      "title": "Stuck terminating (>10m)",
+      "title": "Stuck Terminating (>10m)",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -187,7 +187,7 @@
     {
       "id": 4,
       "type": "stat",
-      "title": "Control plane workloads",
+      "title": "Control Plane Workloads",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -247,7 +247,7 @@
     {
       "id": 5,
       "type": "table",
-      "title": "Pods not running",
+      "title": "Pods Not Running",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
diff --git a/services/monitoring/dashboards/atlas-storage.json b/services/monitoring/dashboards/atlas-storage.json
index 6585794..1d07040 100644
--- a/services/monitoring/dashboards/atlas-storage.json
+++ b/services/monitoring/dashboards/atlas-storage.json
@@ -7,7 +7,7 @@
     {
       "id": 1,
       "type": "stat",
-      "title": "Astreae usage",
+      "title": "Astreae Usage",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -71,7 +71,7 @@
     {
       "id": 2,
       "type": "stat",
-      "title": "Asteria usage",
+      "title": "Asteria Usage",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -135,7 +135,7 @@
     {
       "id": 3,
       "type": "stat",
-      "title": "Astreae free",
+      "title": "Astreae Free",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -195,7 +195,7 @@
     {
       "id": 4,
       "type": "stat",
-      "title": "Asteria free",
+      "title": "Asteria Free",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -255,7 +255,7 @@
     {
       "id": 5,
       "type": "timeseries",
-      "title": "Astreae per-node usage",
+      "title": "Astreae Per-Node Usage",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -293,7 +293,7 @@
     {
       "id": 6,
       "type": "timeseries",
-      "title": "Asteria per-node usage",
+      "title": "Asteria Per-Node Usage",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -331,7 +331,7 @@
     {
       "id": 7,
       "type": "timeseries",
-      "title": "Astreae usage history",
+      "title": "Astreae Usage History",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -368,7 +368,7 @@
     {
       "id": 8,
       "type": "timeseries",
-      "title": "Asteria usage history",
+      "title": "Asteria Usage History",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml
index 13262d6..1a86c73 100644
--- a/services/monitoring/grafana-dashboard-gpu.yaml
+++ b/services/monitoring/grafana-dashboard-gpu.yaml
@@ -16,7 +16,7 @@ data:
         {
           "id": 1,
           "type": "piechart",
-          "title": "Namespace GPU share",
+          "title": "Namespace GPU Share",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -29,7 +29,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+              "expr": "100 * ( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -69,7 +69,7 @@ data:
         {
           "id": 2,
           "type": "timeseries",
-          "title": "GPU util by namespace",
+          "title": "GPU Util by Namespace",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -106,7 +106,7 @@ data:
         {
           "id": 3,
           "type": "timeseries",
-          "title": "GPU util by node",
+          "title": "GPU Util by Node",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -143,7 +143,7 @@ data:
         {
           "id": 4,
           "type": "table",
-          "title": "Top pods by GPU util",
+          "title": "Top Pods by GPU Util",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml
index 4b78fb9..fd1f5d6 100644
--- a/services/monitoring/grafana-dashboard-network.yaml
+++ b/services/monitoring/grafana-dashboard-network.yaml
@@ -16,7 +16,7 @@ data:
         {
           "id": 1,
           "type": "stat",
-          "title": "Ingress traffic",
+          "title": "Ingress Traffic",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -76,7 +76,7 @@ data:
         {
           "id": 2,
           "type": "stat",
-          "title": "Egress traffic",
+          "title": "Egress Traffic",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -136,7 +136,7 @@ data:
         {
           "id": 3,
           "type": "stat",
-          "title": "Intra-cluster traffic",
+          "title": "Intra-Cluster Traffic",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -196,7 +196,7 @@ data:
         {
           "id": 4,
           "type": "stat",
-          "title": "Top router req/s",
+          "title": "Top Router req/s",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -257,7 +257,7 @@ data:
         {
           "id": 5,
           "type": "timeseries",
-          "title": "Per-node throughput",
+          "title": "Per-Node Throughput",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -294,7 +294,7 @@ data:
         {
           "id": 6,
           "type": "table",
-          "title": "Top namespaces",
+          "title": "Top Namespaces",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -330,7 +330,7 @@ data:
         {
           "id": 7,
           "type": "table",
-          "title": "Top pods",
+          "title": "Top Pods",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -366,7 +366,7 @@ data:
         {
           "id": 8,
           "type": "timeseries",
-          "title": "Traefik routers (req/s)",
+          "title": "Traefik Routers (req/s)",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -403,7 +403,7 @@ data:
         {
           "id": 9,
           "type": "timeseries",
-          "title": "Traefik entrypoints (req/s)",
+          "title": "Traefik Entrypoints (req/s)",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
diff --git a/services/monitoring/grafana-dashboard-nodes.yaml b/services/monitoring/grafana-dashboard-nodes.yaml
index c78e994..2facfed 100644
--- a/services/monitoring/grafana-dashboard-nodes.yaml
+++ b/services/monitoring/grafana-dashboard-nodes.yaml
@@ -16,7 +16,7 @@ data:
         {
           "id": 1,
           "type": "stat",
-          "title": "Worker nodes ready",
+          "title": "Worker Nodes Ready",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -77,7 +77,7 @@ data:
         {
           "id": 2,
           "type": "stat",
-          "title": "Control plane ready",
+          "title": "Control Plane Ready",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -138,7 +138,7 @@ data:
         {
           "id": 3,
           "type": "stat",
-          "title": "Control plane workloads",
+          "title": "Control Plane Workloads",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -278,7 +278,7 @@ data:
         {
           "id": 6,
           "type": "timeseries",
-          "title": "Control plane (incl. titan-db) CPU",
+          "title": "Control Plane (incl. titan-db) CPU",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -315,7 +315,7 @@ data:
         {
           "id": 7,
           "type": "timeseries",
-          "title": "Control plane (incl. titan-db) RAM",
+          "title": "Control Plane (incl. titan-db) RAM",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -352,7 +352,7 @@ data:
         {
           "id": 8,
           "type": "timeseries",
-          "title": "Root filesystem usage",
+          "title": "Root Filesystem Usage",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 512adf9..6fbf7c9 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -10,7 +10,7 @@ data:
     {
       "uid": "atlas-overview",
       "title": "Atlas Overview",
-      "folderUid": "atlas-overview",
+      "folderUid": "overview",
       "editable": false,
       "annotations": {
         "list": []
@@ -19,7 +19,7 @@ data:
         {
           "id": 1,
           "type": "gauge",
-          "title": "Workers ready",
+          "title": "Workers Ready",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -80,7 +80,7 @@ data:
         {
           "id": 2,
           "type": "gauge",
-          "title": "Control plane ready",
+          "title": "Control Plane Ready",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -133,7 +133,7 @@ data:
         {
           "id": 3,
           "type": "stat",
-          "title": "Control plane workloads",
+          "title": "Control Plane Workloads",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -207,8 +207,8 @@ data:
         },
         {
           "id": 4,
-          "type": "gauge",
-          "title": "Problem pods",
+          "type": "stat",
+          "title": "Problem Pods",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -227,8 +227,10 @@ data:
           ],
           "fieldConfig": {
             "defaults": {
-              "min": 0,
-              "max": 4,
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
               "thresholds": {
                 "mode": "absolute",
                 "steps": [
@@ -249,11 +251,18 @@ data:
                     "value": 3
                   }
                 ]
+              },
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto"
               }
             },
             "overrides": []
           },
           "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
             "reduceOptions": {
               "calcs": [
                 "lastNotNull"
@@ -261,9 +270,7 @@ data:
               "fields": "",
               "values": false
             },
-            "orientation": "auto",
-            "showThresholdMarkers": false,
-            "showThresholdLabels": false
+            "textMode": "value"
           },
           "links": [
             {
@@ -275,8 +282,8 @@ data:
         },
         {
           "id": 5,
-          "type": "gauge",
-          "title": "Stuck terminating",
+          "type": "stat",
+          "title": "Stuck Terminating",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -295,8 +302,10 @@ data:
           ],
           "fieldConfig": {
             "defaults": {
-              "min": 0,
-              "max": 4,
+              "color": {
+                "mode": "palette-classic"
+              },
+              "mappings": [],
               "thresholds": {
                 "mode": "absolute",
                 "steps": [
@@ -317,11 +326,18 @@ data:
                     "value": 3
                   }
                 ]
+              },
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto"
               }
             },
             "overrides": []
           },
           "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
             "reduceOptions": {
               "calcs": [
                 "lastNotNull"
@@ -329,9 +345,7 @@ data:
               "fields": "",
               "values": false
             },
-            "orientation": "auto",
-            "showThresholdMarkers": false,
-            "showThresholdLabels": false
+            "textMode": "value"
           },
           "links": [
             {
@@ -628,7 +642,7 @@ data:
         {
           "id": 23,
           "type": "stat",
-          "title": "Astreae usage",
+          "title": "Astreae Usage",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -699,7 +713,7 @@ data:
         {
           "id": 24,
           "type": "stat",
-          "title": "Asteria usage",
+          "title": "Asteria Usage",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -770,7 +784,7 @@ data:
         {
           "id": 25,
           "type": "stat",
-          "title": "Astreae free",
+          "title": "Astreae Free",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -837,7 +851,7 @@ data:
         {
           "id": 26,
           "type": "stat",
-          "title": "Asteria free",
+          "title": "Asteria Free",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -904,7 +918,7 @@ data:
         {
           "id": 11,
           "type": "piechart",
-          "title": "Namespace CPU share",
+          "title": "Namespace CPU Share",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -957,7 +971,7 @@ data:
         {
           "id": 12,
           "type": "piechart",
-          "title": "Namespace GPU share",
+          "title": "Namespace GPU Share",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -970,7 +984,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+              "expr": "100 * ( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -1010,7 +1024,7 @@ data:
         {
           "id": 13,
           "type": "piechart",
-          "title": "Namespace RAM share",
+          "title": "Namespace RAM Share",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -1063,7 +1077,7 @@ data:
         {
           "id": 14,
           "type": "timeseries",
-          "title": "Worker node CPU",
+          "title": "Worker Node CPU",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -1110,7 +1124,7 @@ data:
         {
           "id": 15,
           "type": "timeseries",
-          "title": "Worker node RAM",
+          "title": "Worker Node RAM",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -1231,7 +1245,7 @@ data:
         {
           "id": 18,
           "type": "timeseries",
-          "title": "Cluster ingress throughput",
+          "title": "Cluster Ingress Throughput",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -1275,7 +1289,7 @@ data:
         {
           "id": 19,
           "type": "timeseries",
-          "title": "Cluster egress throughput",
+          "title": "Cluster Egress Throughput",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -1319,7 +1333,7 @@ data:
         {
           "id": 20,
           "type": "timeseries",
-          "title": "Intra-cluster throughput",
+          "title": "Intra-Cluster Throughput",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -1363,7 +1377,7 @@ data:
         {
           "id": 21,
           "type": "timeseries",
-          "title": "Root filesystem usage",
+          "title": "Root Filesystem Usage",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -1411,7 +1425,7 @@ data:
         {
           "id": 22,
           "type": "bargauge",
-          "title": "Nodes closest to full root disks",
+          "title": "Nodes Closest to Full Root Disks",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -1465,7 +1479,7 @@ data:
               "calcs": [
                 "lastNotNull"
               ],
-              "fields": "/.*/",
+              "fields": "Value",
               "values": false
             }
           },
@@ -1488,9 +1502,10 @@ data:
         "list": []
       },
       "time": {
-        "from": "now-12h",
+        "from": "now-1h",
         "to": "now"
       },
+      "refresh": "1m",
       "links": [
         {
           "title": "Atlas Pods",
diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml
index 78beca5..f92adf1 100644
--- a/services/monitoring/grafana-dashboard-pods.yaml
+++ b/services/monitoring/grafana-dashboard-pods.yaml
@@ -16,7 +16,7 @@ data:
         {
           "id": 1,
           "type": "stat",
-          "title": "Problem pods",
+          "title": "Problem Pods",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -136,7 +136,7 @@ data:
         {
           "id": 3,
           "type": "stat",
-          "title": "Stuck terminating (>10m)",
+          "title": "Stuck Terminating (>10m)",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -196,7 +196,7 @@ data:
         {
           "id": 4,
           "type": "stat",
-          "title": "Control plane workloads",
+          "title": "Control Plane Workloads",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -256,7 +256,7 @@ data:
         {
           "id": 5,
           "type": "table",
-          "title": "Pods not running",
+          "title": "Pods Not Running",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
diff --git a/services/monitoring/grafana-dashboard-storage.yaml b/services/monitoring/grafana-dashboard-storage.yaml
index 1bbf1ea..0a534f2 100644
--- a/services/monitoring/grafana-dashboard-storage.yaml
+++ b/services/monitoring/grafana-dashboard-storage.yaml
@@ -16,7 +16,7 @@ data:
         {
           "id": 1,
           "type": "stat",
-          "title": "Astreae usage",
+          "title": "Astreae Usage",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -80,7 +80,7 @@ data:
         {
           "id": 2,
           "type": "stat",
-          "title": "Asteria usage",
+          "title": "Asteria Usage",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -144,7 +144,7 @@ data:
         {
           "id": 3,
           "type": "stat",
-          "title": "Astreae free",
+          "title": "Astreae Free",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -204,7 +204,7 @@ data:
         {
           "id": 4,
           "type": "stat",
-          "title": "Asteria free",
+          "title": "Asteria Free",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -264,7 +264,7 @@ data:
         {
           "id": 5,
           "type": "timeseries",
-          "title": "Astreae per-node usage",
+          "title": "Astreae Per-Node Usage",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -302,7 +302,7 @@ data:
         {
           "id": 6,
           "type": "timeseries",
-          "title": "Asteria per-node usage",
+          "title": "Asteria Per-Node Usage",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -340,7 +340,7 @@ data:
         {
           "id": 7,
           "type": "timeseries",
-          "title": "Astreae usage history",
+          "title": "Astreae Usage History",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -377,7 +377,7 @@ data:
         {
           "id": 8,
           "type": "timeseries",
-          "title": "Asteria usage history",
+          "title": "Asteria Usage History",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
diff --git a/services/monitoring/grafana-folders.yaml b/services/monitoring/grafana-folders.yaml
index c52b4e1..54b278f 100644
--- a/services/monitoring/grafana-folders.yaml
+++ b/services/monitoring/grafana-folders.yaml
@@ -10,8 +10,8 @@ data:
   folders.yaml: |
     apiVersion: 1
     folders:
-      - uid: atlas-overview
-        title: Atlas Overview
+      - uid: overview
+        title: Overview
         permissions:
           - role: Viewer
             permission: View
@@ -26,3 +26,10 @@ data:
             permission: View
           - role: Admin
             permission: Admin
+      - uid: oceanus-internal
+        title: Oceanus Internal
+        permissions:
+          - role: Editor
+            permission: View
+          - role: Admin
+            permission: Admin
diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index cf56b27..2546dc1 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -256,6 +256,8 @@ spec:
       server:
         domain: metrics.bstein.dev
         root_url: https://metrics.bstein.dev/
+      dashboards:
+        default_home_dashboard_path: /var/lib/grafana/dashboards/overview/atlas-overview.json
       auth.anonymous:
         hide_version: true
       users:
@@ -290,7 +292,7 @@ spec:
         providers:
           - name: overview
             orgId: 1
-            folder: Atlas Overview
+            folder: Overview
             type: file
             disableDeletion: false
             editable: false

From 5df94a793736fb474a2026a78d3b6182e9eed1c3 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 2 Dec 2025 14:56:36 -0300
Subject: [PATCH 65/71] monitoring: fix gpu share query and root bar labels

---
 scripts/dashboards_render_atlas.py                  | 4 ++--
 services/monitoring/dashboards/atlas-gpu.json       | 2 +-
 services/monitoring/dashboards/atlas-overview.json  | 4 ++--
 services/monitoring/grafana-dashboard-gpu.yaml      | 2 +-
 services/monitoring/grafana-dashboard-overview.yaml | 4 ++--
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index 11bd2c8..78e759f 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -232,7 +232,7 @@ NAMESPACE_GPU_ALLOC = (
     ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
 )
 NAMESPACE_GPU_USAGE_SHARE = (
-    'avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)'
+    'sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))'
 )
 NAMESPACE_GPU_USAGE_INSTANT = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)'
 NAMESPACE_GPU_RAW = (
@@ -522,7 +522,7 @@ def bargauge_panel(panel_id, title, expr, grid, *, unit="none", links=None):
             "orientation": "horizontal",
             "reduceOptions": {
                 "calcs": ["lastNotNull"],
-                "fields": "Value",
+                "fields": "",
                 "values": False,
             },
         },
diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json
index 8c1367b..e67b3d2 100644
--- a/services/monitoring/dashboards/atlas-gpu.json
+++ b/services/monitoring/dashboards/atlas-gpu.json
@@ -20,7 +20,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+          "expr": "100 * ( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index b556594..8439407 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -975,7 +975,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+          "expr": "100 * ( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -1470,7 +1470,7 @@
           "calcs": [
             "lastNotNull"
           ],
-          "fields": "Value",
+          "fields": "",
           "values": false
         }
       },
diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml
index 1a86c73..3af8717 100644
--- a/services/monitoring/grafana-dashboard-gpu.yaml
+++ b/services/monitoring/grafana-dashboard-gpu.yaml
@@ -29,7 +29,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+              "expr": "100 * ( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 6fbf7c9..4fcab70 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -984,7 +984,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * ( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+              "expr": "100 * ( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -1479,7 +1479,7 @@ data:
               "calcs": [
                 "lastNotNull"
               ],
-              "fields": "Value",
+              "fields": "",
               "values": false
             }
           },

From b93636ecb9b7bed891b6c47fa6b9029b1ab509db Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 2 Dec 2025 15:12:16 -0300
Subject: [PATCH 66/71] monitoring: shrink hottest node row height

---
 scripts/dashboards_render_atlas.py                  | 2 +-
 services/monitoring/dashboards/atlas-overview.json  | 8 ++++----
 services/monitoring/grafana-dashboard-overview.yaml | 8 ++++----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index 78e759f..dd96c35 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -686,7 +686,7 @@ def build_overview():
                 panel_id,
                 title,
                 f"{expr}",
-                {"h": 5, "w": 6, "x": 6 * idx, "y": 5},
+                {"h": 3, "w": 6, "x": 6 * idx, "y": 5},
                 unit=unit,
                 thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
                 text_mode="name_and_value",
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 8439407..93a7745 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -355,7 +355,7 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 5,
+        "h": 3,
         "w": 6,
         "x": 0,
         "y": 5
@@ -428,7 +428,7 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 5,
+        "h": 3,
         "w": 6,
         "x": 6,
         "y": 5
@@ -501,7 +501,7 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 5,
+        "h": 3,
         "w": 6,
         "x": 12,
         "y": 5
@@ -570,7 +570,7 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 5,
+        "h": 3,
         "w": 6,
         "x": 18,
         "y": 5
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 4fcab70..363e481 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -364,7 +364,7 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 5,
+            "h": 3,
             "w": 6,
             "x": 0,
             "y": 5
@@ -437,7 +437,7 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 5,
+            "h": 3,
             "w": 6,
             "x": 6,
             "y": 5
@@ -510,7 +510,7 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 5,
+            "h": 3,
             "w": 6,
             "x": 12,
             "y": 5
@@ -579,7 +579,7 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 5,
+            "h": 3,
             "w": 6,
             "x": 18,
             "y": 5

From ace383bedd1732e9353cfaf0dca5f70cb28d6b5c Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 2 Dec 2025 15:15:21 -0300
Subject: [PATCH 67/71] monitoring: expand worker/control/root rows

---
 scripts/dashboards_render_atlas.py            | 12 +++++------
 .../monitoring/dashboards/atlas-overview.json | 20 +++++++++----------
 .../grafana-dashboard-overview.yaml           | 20 +++++++++----------
 3 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index dd96c35..8829ca1 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -746,7 +746,7 @@ def build_overview():
             14,
             "Worker Node CPU",
             node_cpu_expr(worker_filter),
-            {"h": 8, "w": 12, "x": 0, "y": 32},
+            {"h": 12, "w": 12, "x": 0, "y": 32},
             unit="percent",
             legend="{{node}}",
             legend_calcs=["last"],
@@ -760,7 +760,7 @@ def build_overview():
             15,
             "Worker Node RAM",
             node_mem_expr(worker_filter),
-            {"h": 8, "w": 12, "x": 12, "y": 32},
+            {"h": 12, "w": 12, "x": 12, "y": 32},
             unit="percent",
             legend="{{node}}",
             legend_calcs=["last"],
@@ -775,7 +775,7 @@ def build_overview():
             16,
             "Control plane CPU",
             node_cpu_expr(CONTROL_REGEX),
-            {"h": 7, "w": 12, "x": 0, "y": 40},
+            {"h": 10, "w": 12, "x": 0, "y": 44},
             unit="percent",
             legend="{{node}}",
             legend_display="table",
@@ -787,7 +787,7 @@ def build_overview():
             17,
             "Control plane RAM",
             node_mem_expr(CONTROL_REGEX),
-            {"h": 7, "w": 12, "x": 12, "y": 40},
+            {"h": 10, "w": 12, "x": 12, "y": 44},
             unit="percent",
             legend="{{node}}",
             legend_display="table",
@@ -840,7 +840,7 @@ def build_overview():
             21,
             "Root Filesystem Usage",
             root_usage_expr(),
-            {"h": 8, "w": 12, "x": 0, "y": 47},
+            {"h": 16, "w": 12, "x": 0, "y": 54},
             unit="percent",
             legend="{{node}}",
             legend_calcs=["last"],
@@ -855,7 +855,7 @@ def build_overview():
             22,
             "Nodes Closest to Full Root Disks",
             f"topk(8, {root_usage_expr()})",
-            {"h": 8, "w": 12, "x": 12, "y": 47},
+            {"h": 16, "w": 12, "x": 12, "y": 54},
             unit="percent",
             links=link_to("atlas-storage"),
         )
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 93a7745..9800f1c 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -1074,7 +1074,7 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 8,
+        "h": 12,
         "w": 12,
         "x": 0,
         "y": 32
@@ -1121,7 +1121,7 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 8,
+        "h": 12,
         "w": 12,
         "x": 12,
         "y": 32
@@ -1168,10 +1168,10 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 7,
+        "h": 10,
         "w": 12,
         "x": 0,
-        "y": 40
+        "y": 44
       },
       "targets": [
         {
@@ -1205,10 +1205,10 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 7,
+        "h": 10,
         "w": 12,
         "x": 12,
-        "y": 40
+        "y": 44
       },
       "targets": [
         {
@@ -1374,10 +1374,10 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 8,
+        "h": 16,
         "w": 12,
         "x": 0,
-        "y": 47
+        "y": 54
       },
       "targets": [
         {
@@ -1422,10 +1422,10 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 8,
+        "h": 16,
         "w": 12,
         "x": 12,
-        "y": 47
+        "y": 54
       },
       "targets": [
         {
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 363e481..7b91758 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -1083,7 +1083,7 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 8,
+            "h": 12,
             "w": 12,
             "x": 0,
             "y": 32
@@ -1130,7 +1130,7 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 8,
+            "h": 12,
             "w": 12,
             "x": 12,
             "y": 32
@@ -1177,10 +1177,10 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 7,
+            "h": 10,
             "w": 12,
             "x": 0,
-            "y": 40
+            "y": 44
           },
           "targets": [
             {
@@ -1214,10 +1214,10 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 7,
+            "h": 10,
             "w": 12,
             "x": 12,
-            "y": 40
+            "y": 44
           },
           "targets": [
             {
@@ -1383,10 +1383,10 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 8,
+            "h": 16,
             "w": 12,
             "x": 0,
-            "y": 47
+            "y": 54
           },
           "targets": [
             {
@@ -1431,10 +1431,10 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 8,
+            "h": 16,
             "w": 12,
             "x": 12,
-            "y": 47
+            "y": 54
           },
           "targets": [
             {

From 6eba26b359f3989c1fdfef1c02cacc9969cd634d Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 2 Dec 2025 15:21:02 -0300
Subject: [PATCH 68/71] monitoring: show top12 root disks

---
 scripts/dashboards_render_atlas.py                  | 2 +-
 services/monitoring/dashboards/atlas-overview.json  | 2 +-
 services/monitoring/grafana-dashboard-overview.yaml | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index 8829ca1..93de006 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -854,7 +854,7 @@ def build_overview():
         bargauge_panel(
             22,
             "Nodes Closest to Full Root Disks",
-            f"topk(8, {root_usage_expr()})",
+            f"topk(12, {root_usage_expr()})",
             {"h": 16, "w": 12, "x": 12, "y": 54},
             unit="percent",
             links=link_to("atlas-storage"),
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 9800f1c..9eda81d 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -1429,7 +1429,7 @@
       },
       "targets": [
         {
-          "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+          "expr": "topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
           "refId": "A",
           "legendFormat": "{{node}}"
         }
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 7b91758..928098e 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -1438,7 +1438,7 @@ data:
           },
           "targets": [
             {
-              "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+              "expr": "topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
               "refId": "A",
               "legendFormat": "{{node}}"
             }

From 839fb94836dec85164c3cf680e00732707ea86bf Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 2 Dec 2025 17:01:32 -0300
Subject: [PATCH 69/71] notes: update monitoring and next steps

---
 AGENTS.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/AGENTS.md b/AGENTS.md
index 05838aa..d660e75 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -40,3 +40,14 @@ Repository Guidelines
 - Atlas Overview panels are paired with internal dashboards (pods, nodes, storage, network, GPU). A new `atlas-gpu` internal dashboard holds the detailed GPU metrics that feed the overview share pie.
 - Old Grafana folders (`Atlas Storage`, `Atlas SRE`, `Atlas Public`, `Atlas Nodes`) should be removed in Grafana UI when convenient; only `Atlas Overview` and `Atlas Internal` should remain provisioned.
 - Future work: add a separate generator (e.g., `dashboards_render_oceanus.py`) for SUI/oceanus validation dashboards, mirroring the atlas pattern of internal dashboards feeding a public overview.
+
+## Monitoring state (2025-12-03)
+- dcgm-exporter DaemonSet pulls `registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04` with nvidia runtime/imagePullSecret; titan-24 exports metrics, titan-22 remains NotReady.
+- Atlas Overview is the Grafana home (1h range, 1m refresh), Overview folder UID `overview`, internal folder `atlas-internal` (oceanus-internal stub).
+- Panels standardized via generator; hottest row compressed, worker/control rows taller, root disk row taller and top12 bar gauge with labels. GPU share pie uses 1h avg_over_time to persist idle activity.
+- Internal dashboards are provisioned without the Viewer role; if anonymous users can still see them, restart Grafana and tighten auth if needed.
+
+## Upcoming priorities (SSO/storage/mail)
+- Establish SSO (Keycloak or similar) and federate Grafana, Gitea, Zot, Nextcloud, Pegasus/Jellyfin; keep Vaultwarden separate until safe.
+- Add Nextcloud (limit to rpi5 workers) with office suite; integrate with SSO; plan storage class and ingress.
+- Plan mail: mostly self-hosted, relay through trusted provider for outbound; integrate with services (Nextcloud, Vaultwarden, etc.) for notifications and account flows.

From 762aa7bb0f57cabebff4b61f8a1cd09fa495cd9e Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 2 Dec 2025 17:14:45 -0300
Subject: [PATCH 70/71] notes: add sso plan sketch

---
 AGENTS.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/AGENTS.md b/AGENTS.md
index d660e75..664fc6c 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -51,3 +51,14 @@ Repository Guidelines
 - Establish SSO (Keycloak or similar) and federate Grafana, Gitea, Zot, Nextcloud, Pegasus/Jellyfin; keep Vaultwarden separate until safe.
 - Add Nextcloud (limit to rpi5 workers) with office suite; integrate with SSO; plan storage class and ingress.
 - Plan mail: mostly self-hosted, relay through trusted provider for outbound; integrate with services (Nextcloud, Vaultwarden, etc.) for notifications and account flows.
+
+## SSO plan sketch (2025-12-03)
+- IdP: use Keycloak (preferred) in a new `sso` namespace, Bitnami or codecentric chart with Postgres backing store (single PVC), ingress `sso.bstein.dev`, admin user bound to brad@bstein.dev; stick with local DB initially (no external IdP).
+- Auth flow goals: Grafana (OIDC), Gitea (OAuth2/Keycloak), Zot (via Traefik forward-auth/oauth2-proxy), Jellyfin/Pegasus via Jellyfin OAuth/OpenID plugin (map existing usernames; run migration to pre-create users in Keycloak with same usernames/emails and temporary passwords), Pegasus keeps using Jellyfin tokens.
+- Steps to implement:
+  1) Add service folder `services/keycloak/` (namespace, PVC, HelmRelease, ingress, secret for admin creds). Verify with kustomize + Flux reconcile.
+  2) Seed realm `atlas` with users (import CSV/realm). Create clients for Grafana (public/implicit), Gitea (confidential), and a “jellyfin” client for the OAuth plugin; set the admin user's email to brad@bstein.dev.
+  3) Reconfigure Grafana to OIDC (disable anonymous to internal folders, leave Overview public via folder permissions). Reconfigure Gitea to OIDC (app.ini).
+  4) Add Traefik forward-auth (oauth2-proxy) in front of Zot and any other services needing headers-based auth.
+  5) Deploy Jellyfin OpenID plugin; map Keycloak users to existing Jellyfin usernames; communicate password reset path.
+- Migration caution: do not delete existing local creds until SSO validated; keep Pegasus working via Jellyfin tokens during transition.

From e80505a7730ea1847011a9e9130445b53721f01d Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 2 Dec 2025 17:36:37 -0300
Subject: [PATCH 71/71] notes: add postgres centralization guidance

---
 AGENTS.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/AGENTS.md b/AGENTS.md
index 664fc6c..a8d49c8 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -62,3 +62,7 @@ Repository Guidelines
   4) Add Traefik forward-auth (oauth2-proxy) in front of Zot and any other services needing headers-based auth.
   5) Deploy Jellyfin OpenID plugin; map Keycloak users to existing Jellyfin usernames; communicate password reset path.
 - Migration caution: do not delete existing local creds until SSO validated; keep Pegasus working via Jellyfin tokens during transition.
+
+## Postgres centralization (2025-12-03)
+- Prefer a shared in-cluster Postgres deployment with per-service databases to reduce resource sprawl on Pi nodes. Use it for services that can easily point at an external DB.
+- Candidates to migrate to shared Postgres: Keycloak (realm DB), Gitea (git DB), Nextcloud (app DB), possibly Grafana (if persistence needed beyond current provisioner), Jitsi prosody/JVB state (if external DB supported). Keep tightly-coupled or lightweight embedded DBs as-is when migration is painful or not supported.