diff --git a/.gitignore b/.gitignore index 8d0ab1e..7543bbf 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ __pycache__/ *.py[cod] .pytest_cache .venv +.venv-ci tmp/ diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 0000000..4d6b23e --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,77 @@ +// Mirror of ci/Jenkinsfile.titan-iac for multibranch discovery. +pipeline { + agent { + kubernetes { + defaultContainer 'python' + yaml """ +apiVersion: v1 +kind: Pod +spec: + nodeSelector: + hardware: rpi5 + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" + containers: + - name: python + image: python:3.12-slim + command: + - cat + tty: true +""" + } + } + environment { + PIP_DISABLE_PIP_VERSION_CHECK = '1' + PYTHONUNBUFFERED = '1' + } + stages { + stage('Checkout') { + steps { + checkout scm + } + } + stage('Install deps') { + steps { + sh 'pip install --no-cache-dir -r ci/requirements.txt' + } + } + stage('Glue tests') { + steps { + sh 'pytest -q ci/tests/glue' + } + } + stage('Resolve Flux branch') { + steps { + script { + env.FLUX_BRANCH = sh( + returnStdout: true, + script: "awk '/branch:/{print \$2; exit}' clusters/atlas/flux-system/gotk-sync.yaml" + ).trim() + if (!env.FLUX_BRANCH) { + error('Flux branch not found in gotk-sync.yaml') + } + echo "Flux branch: ${env.FLUX_BRANCH}" + } + } + } + stage('Promote') { + when { + expression { + def branch = env.BRANCH_NAME ?: (env.GIT_BRANCH ?: '').replaceFirst('origin/', '') + return env.FLUX_BRANCH && branch == env.FLUX_BRANCH + } + } + steps { + withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) { + sh ''' + set +x + git config user.email "jenkins@bstein.dev" + git config user.name "jenkins" + git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git + git push origin HEAD:${FLUX_BRANCH} + ''' + } + } + } + } +} diff --git a/ci/Jenkinsfile.titan-iac b/ci/Jenkinsfile.titan-iac index 3b13eb0..77990d7 100644 --- a/ci/Jenkinsfile.titan-iac +++ b/ci/Jenkinsfile.titan-iac @@ -6,6 +6,10 @@ pipeline { apiVersion: v1 kind: Pod spec: + nodeSelector: + hardware: rpi5 + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" containers: - name: python image: python:3.12-slim @@ -18,7 +22,6 @@ spec: environment { PIP_DISABLE_PIP_VERSION_CHECK = '1' PYTHONUNBUFFERED = '1' - DEPLOY_BRANCH = 'deploy' } stages { stage('Checkout') { @@ -36,7 +39,27 @@ spec: sh 'pytest -q ci/tests/glue' } } + stage('Resolve Flux branch') { + steps { + script { + env.FLUX_BRANCH = sh( + returnStdout: true, + script: "awk '/branch:/{print \$2; exit}' clusters/atlas/flux-system/gotk-sync.yaml" + ).trim() + if (!env.FLUX_BRANCH) { + error('Flux branch not found in gotk-sync.yaml') + } + echo "Flux branch: ${env.FLUX_BRANCH}" + } + } + } stage('Promote') { + when { + expression { + def branch = env.BRANCH_NAME ?: (env.GIT_BRANCH ?: '').replaceFirst('origin/', '') + return env.FLUX_BRANCH && branch == env.FLUX_BRANCH + } + } steps { withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) { sh ''' @@ -44,7 +67,7 @@ spec: git config user.email "jenkins@bstein.dev" git config user.name "jenkins" git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git - git push origin HEAD:${DEPLOY_BRANCH} + git push origin HEAD:${FLUX_BRANCH} ''' } } diff --git a/ci/tests/glue/config.yaml b/ci/tests/glue/config.yaml index 8adf4ca..16b656c 100644 ---
a/ci/tests/glue/config.yaml +++ b/ci/tests/glue/config.yaml @@ -1,7 +1,16 @@ max_success_age_hours: 48 allow_suspended: + - bstein-dev-home/vaultwarden-cred-sync - comms/othrys-room-reset - comms/pin-othrys-invite - comms/seed-othrys-room - finance/firefly-user-sync + - health/wger-admin-ensure - health/wger-user-sync + - mailu-mailserver/mailu-sync-nightly + - nextcloud/nextcloud-mail-sync +ariadne_schedule_tasks: + - schedule.mailu_sync + - schedule.nextcloud_sync + - schedule.vaultwarden_sync + - schedule.wger_admin diff --git a/ci/tests/glue/test_glue_metrics.py b/ci/tests/glue/test_glue_metrics.py index 16b01c7..52ec0be 100644 --- a/ci/tests/glue/test_glue_metrics.py +++ b/ci/tests/glue/test_glue_metrics.py @@ -1,11 +1,19 @@ from __future__ import annotations import os +from pathlib import Path import requests +import yaml VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server:8428").rstrip("/") +CONFIG_PATH = Path(__file__).with_name("config.yaml") + + +def _load_config() -> dict: + with CONFIG_PATH.open("r", encoding="utf-8") as handle: + return yaml.safe_load(handle) or {} def _query(promql: str) -> list[dict]: @@ -27,3 +35,14 @@ def test_glue_metrics_success_join(): ) series = _query(query) assert series, "No glue cronjob last success series found" + + +def test_ariadne_schedule_metrics_present(): + cfg = _load_config() + expected = cfg.get("ariadne_schedule_tasks", []) + if not expected: + return + series = _query("ariadne_schedule_next_run_timestamp_seconds") + tasks = {item.get("metric", {}).get("task") for item in series} + missing = [task for task in expected if task not in tasks] + assert not missing, f"Missing Ariadne schedule metrics for: {', '.join(missing)}" diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml new file mode 100644 index 0000000..ff97f73 --- /dev/null +++ b/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml @@ -0,0 +1,17 @@ +# clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: bstein-dev-home-migrations + namespace: flux-system +spec: + interval: 10m + path: ./services/bstein-dev-home/oneoffs/migrations + prune: true + force: true + sourceRef: + kind: GitRepository + name: flux-system + targetNamespace: bstein-dev-home + wait: false + suspend: true diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml index 88dda40..f1d41be 100644 --- a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml +++ b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml @@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1 kind: ImageUpdateAutomation metadata: name: bstein-dev-home - namespace: flux-system + namespace: bstein-dev-home spec: interval: 1m0s sourceRef: @@ -13,14 +13,14 @@ spec: git: checkout: ref: - branch: feature/vault-consumption + branch: feature/ariadne commit: author: email: ops@bstein.dev name: flux-bot - messageTemplate: "chore(bstein-dev-home): update images to {{range .Updated.Images}}{{.}}{{end}}" + messageTemplate: "chore(bstein-dev-home): automated image update" push: - branch: feature/vault-consumption + branch: feature/ariadne update: strategy: Setters path: services/bstein-dev-home 
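Note on the Setters strategy used by the ImageUpdateAutomation above: Flux only rewrites image fields that carry an image-policy marker comment in the manifests under the configured path. A minimal sketch of such a setter, using a hypothetical ImagePolicy name that is not taken from this repo:

  image: registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-157 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"}

Because the automation now runs in the bstein-dev-home namespace instead of flux-system, markers are expected to reference ImagePolicy objects living in that same namespace.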
diff --git a/clusters/atlas/flux-system/applications/harbor/kustomization.yaml b/clusters/atlas/flux-system/applications/harbor/kustomization.yaml index 06baf26..5eec32f 100644 --- a/clusters/atlas/flux-system/applications/harbor/kustomization.yaml +++ b/clusters/atlas/flux-system/applications/harbor/kustomization.yaml @@ -13,11 +13,6 @@ spec: kind: GitRepository name: flux-system namespace: flux-system - healthChecks: - - apiVersion: helm.toolkit.fluxcd.io/v2 - kind: HelmRelease - name: harbor - namespace: harbor wait: false dependsOn: - name: core diff --git a/clusters/atlas/flux-system/applications/kustomization.yaml b/clusters/atlas/flux-system/applications/kustomization.yaml index 417a3ec..10c203d 100644 --- a/clusters/atlas/flux-system/applications/kustomization.yaml +++ b/clusters/atlas/flux-system/applications/kustomization.yaml @@ -12,6 +12,7 @@ resources: - pegasus/image-automation.yaml - bstein-dev-home/kustomization.yaml - bstein-dev-home/image-automation.yaml + - bstein-dev-home-migrations/kustomization.yaml - harbor/kustomization.yaml - harbor/image-automation.yaml - jellyfin/kustomization.yaml diff --git a/clusters/atlas/flux-system/applications/pegasus/image-automation.yaml b/clusters/atlas/flux-system/applications/pegasus/image-automation.yaml index ec0494e..d11422a 100644 --- a/clusters/atlas/flux-system/applications/pegasus/image-automation.yaml +++ b/clusters/atlas/flux-system/applications/pegasus/image-automation.yaml @@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1 kind: ImageUpdateAutomation metadata: name: pegasus - namespace: flux-system + namespace: jellyfin spec: interval: 1m0s sourceRef: diff --git a/clusters/atlas/flux-system/platform/kustomization.yaml b/clusters/atlas/flux-system/platform/kustomization.yaml index b689cc0..6e75b04 100644 --- a/clusters/atlas/flux-system/platform/kustomization.yaml +++ b/clusters/atlas/flux-system/platform/kustomization.yaml @@ -11,6 +11,7 @@ resources: - monitoring/kustomization.yaml - logging/kustomization.yaml - maintenance/kustomization.yaml + - maintenance/image-automation.yaml - longhorn-adopt/kustomization.yaml - longhorn/kustomization.yaml - longhorn-ui/kustomization.yaml diff --git a/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml new file mode 100644 index 0000000..6e8f612 --- /dev/null +++ b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml @@ -0,0 +1,26 @@ +# clusters/atlas/flux-system/platform/maintenance/image-automation.yaml +apiVersion: image.toolkit.fluxcd.io/v1 +kind: ImageUpdateAutomation +metadata: + name: maintenance + namespace: maintenance +spec: + interval: 1m0s + sourceRef: + kind: GitRepository + name: flux-system + namespace: flux-system + git: + checkout: + ref: + branch: feature/ariadne + commit: + author: + email: ops@bstein.dev + name: flux-bot + messageTemplate: "chore(maintenance): automated image update" + push: + branch: feature/ariadne + update: + strategy: Setters + path: services/maintenance diff --git a/clusters/atlas/flux-system/platform/maintenance/kustomization.yaml b/clusters/atlas/flux-system/platform/maintenance/kustomization.yaml index fc655a4..8477ec9 100644 --- a/clusters/atlas/flux-system/platform/maintenance/kustomization.yaml +++ b/clusters/atlas/flux-system/platform/maintenance/kustomization.yaml @@ -8,6 +8,7 @@ spec: interval: 10m path: ./services/maintenance prune: true + force: true sourceRef: kind: GitRepository name: flux-system diff --git 
a/infrastructure/core/coredns-custom.yaml b/infrastructure/core/coredns-custom.yaml index 8aeff14..6266a22 100644 --- a/infrastructure/core/coredns-custom.yaml +++ b/infrastructure/core/coredns-custom.yaml @@ -32,6 +32,9 @@ data: 192.168.22.9 notes.bstein.dev 192.168.22.9 office.bstein.dev 192.168.22.9 pegasus.bstein.dev + 3.136.224.193 pm-bounces.bstein.dev + 3.150.68.49 pm-bounces.bstein.dev + 18.189.137.81 pm-bounces.bstein.dev 192.168.22.9 registry.bstein.dev 192.168.22.9 scm.bstein.dev 192.168.22.9 secret.bstein.dev diff --git a/infrastructure/core/kustomization.yaml b/infrastructure/core/kustomization.yaml index 6286186..257e1f0 100644 --- a/infrastructure/core/kustomization.yaml +++ b/infrastructure/core/kustomization.yaml @@ -6,5 +6,6 @@ resources: - ../modules/profiles/atlas-ha - coredns-custom.yaml - coredns-deployment.yaml + - ntp-sync-daemonset.yaml - ../sources/cert-manager/letsencrypt.yaml - ../sources/cert-manager/letsencrypt-prod.yaml diff --git a/infrastructure/core/ntp-sync-daemonset.yaml b/infrastructure/core/ntp-sync-daemonset.yaml new file mode 100644 index 0000000..ba97294 --- /dev/null +++ b/infrastructure/core/ntp-sync-daemonset.yaml @@ -0,0 +1,50 @@ +# infrastructure/core/ntp-sync-daemonset.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: ntp-sync + namespace: kube-system + labels: + app: ntp-sync +spec: + selector: + matchLabels: + app: ntp-sync + template: + metadata: + labels: + app: ntp-sync + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: DoesNotExist + - key: node-role.kubernetes.io/master + operator: DoesNotExist + containers: + - name: ntp-sync + image: public.ecr.aws/docker/library/busybox:1.36.1 + imagePullPolicy: IfNotPresent + command: ["/bin/sh", "-c"] + args: + - | + set -eu + while true; do + ntpd -q -p pool.ntp.org || true + sleep 300 + done + securityContext: + capabilities: + add: ["SYS_TIME"] + runAsUser: 0 + runAsGroup: 0 + resources: + requests: + cpu: 10m + memory: 16Mi + limits: + cpu: 50m + memory: 64Mi diff --git a/infrastructure/longhorn/core/secretproviderclass.yaml b/infrastructure/longhorn/core/secretproviderclass.yaml index 031d1d8..e292b86 100644 --- a/infrastructure/longhorn/core/secretproviderclass.yaml +++ b/infrastructure/longhorn/core/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "longhorn" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/longhorn" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: longhorn-registry diff --git a/infrastructure/postgres/service.yaml b/infrastructure/postgres/service.yaml index 3dcab3c..b695045 100644 --- a/infrastructure/postgres/service.yaml +++ b/infrastructure/postgres/service.yaml @@ -4,6 +4,10 @@ kind: Service metadata: name: postgres-service namespace: postgres + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9187" + prometheus.io/path: "/metrics" spec: clusterIP: None ports: @@ -11,5 +15,9 @@ spec: port: 5432 protocol: TCP targetPort: 5432 + - name: metrics + port: 9187 + protocol: TCP + targetPort: 9187 selector: app: postgres diff --git a/infrastructure/postgres/statefulset.yaml b/infrastructure/postgres/statefulset.yaml index e1a1921..2c79248 100644 --- a/infrastructure/postgres/statefulset.yaml +++ b/infrastructure/postgres/statefulset.yaml @@ -58,6 +58,23 @@ spec: - name: vault-secrets mountPath: 
/mnt/vault readOnly: true + - name: postgres-exporter + image: quay.io/prometheuscommunity/postgres-exporter:v0.15.0 + ports: + - name: metrics + containerPort: 9187 + protocol: TCP + env: + - name: DATA_SOURCE_URI + value: "localhost:5432/postgres?sslmode=disable" + - name: DATA_SOURCE_USER + value: postgres + - name: DATA_SOURCE_PASS_FILE + value: /mnt/vault/postgres_password + volumeMounts: + - name: vault-secrets + mountPath: /mnt/vault + readOnly: true volumes: - name: vault-secrets csi: diff --git a/infrastructure/sources/cert-manager/letsencrypt-prod.yaml b/infrastructure/sources/cert-manager/letsencrypt-prod.yaml index 7f90f01..5795b09 100644 --- a/infrastructure/sources/cert-manager/letsencrypt-prod.yaml +++ b/infrastructure/sources/cert-manager/letsencrypt-prod.yaml @@ -5,7 +5,7 @@ metadata: name: letsencrypt-prod spec: acme: - email: brad.stein@gmail.com + email: brad@bstein.dev server: https://acme-v02.api.letsencrypt.org/directory privateKeySecretRef: name: letsencrypt-prod-account-key diff --git a/infrastructure/sources/cert-manager/letsencrypt.yaml b/infrastructure/sources/cert-manager/letsencrypt.yaml index a988312..5fbe4e3 100644 --- a/infrastructure/sources/cert-manager/letsencrypt.yaml +++ b/infrastructure/sources/cert-manager/letsencrypt.yaml @@ -5,7 +5,7 @@ metadata: name: letsencrypt spec: acme: - email: brad.stein@gmail.com + email: brad@bstein.dev server: https://acme-v02.api.letsencrypt.org/directory privateKeySecretRef: name: letsencrypt-account-key diff --git a/infrastructure/vault-csi/secrets-store-csi-driver.yaml b/infrastructure/vault-csi/secrets-store-csi-driver.yaml index 0b249fc..0004c0d 100644 --- a/infrastructure/vault-csi/secrets-store-csi-driver.yaml +++ b/infrastructure/vault-csi/secrets-store-csi-driver.yaml @@ -17,4 +17,5 @@ spec: values: syncSecret: enabled: true - enableSecretRotation: false + enableSecretRotation: true + rotationPollInterval: 2m diff --git a/knowledge/catalog/atlas-summary.json b/knowledge/catalog/atlas-summary.json index fa35051..ea825ce 100644 --- a/knowledge/catalog/atlas-summary.json +++ b/knowledge/catalog/atlas-summary.json @@ -1,8 +1,8 @@ { "counts": { - "helmrelease_host_hints": 17, - "http_endpoints": 37, - "services": 43, - "workloads": 54 + "helmrelease_host_hints": 19, + "http_endpoints": 45, + "services": 47, + "workloads": 74 } } diff --git a/knowledge/catalog/atlas.json b/knowledge/catalog/atlas.json index 0d97bcd..951c807 100644 --- a/knowledge/catalog/atlas.json +++ b/knowledge/catalog/atlas.json @@ -11,6 +11,21 @@ "path": "services/bstein-dev-home", "targetNamespace": "bstein-dev-home" }, + { + "name": "bstein-dev-home-migrations", + "path": "services/bstein-dev-home/migrations", + "targetNamespace": "bstein-dev-home" + }, + { + "name": "cert-manager", + "path": "infrastructure/cert-manager", + "targetNamespace": "cert-manager" + }, + { + "name": "cert-manager-cleanup", + "path": "infrastructure/cert-manager/cleanup", + "targetNamespace": "cert-manager" + }, { "name": "comms", "path": "services/comms", @@ -26,6 +41,11 @@ "path": "services/crypto", "targetNamespace": "crypto" }, + { + "name": "finance", + "path": "services/finance", + "targetNamespace": "finance" + }, { "name": "flux-system", "path": "clusters/atlas/flux-system", @@ -46,6 +66,11 @@ "path": "services/harbor", "targetNamespace": "harbor" }, + { + "name": "health", + "path": "services/health", + "targetNamespace": "health" + }, { "name": "helm", "path": "infrastructure/sources/helm", @@ -71,6 +96,16 @@ "path": "services/logging", "targetNamespace": 
null }, + { + "name": "longhorn", + "path": "infrastructure/longhorn/core", + "targetNamespace": "longhorn-system" + }, + { + "name": "longhorn-adopt", + "path": "infrastructure/longhorn/adopt", + "targetNamespace": "longhorn-system" + }, { "name": "longhorn-ui", "path": "infrastructure/longhorn/ui-ingress", @@ -161,11 +196,21 @@ "path": "infrastructure/vault-csi", "targetNamespace": "kube-system" }, + { + "name": "vault-injector", + "path": "infrastructure/vault-injector", + "targetNamespace": "vault" + }, { "name": "vaultwarden", "path": "services/vaultwarden", "targetNamespace": "vaultwarden" }, + { + "name": "wallet-monero-temp", + "path": "services/crypto/wallet-monero-temp", + "targetNamespace": "crypto" + }, { "name": "xmr-miner", "path": "services/crypto/xmr-miner", @@ -199,7 +244,7 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92" + "registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-157" ] }, { @@ -215,7 +260,20 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92" + "registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-157" + ] + }, + { + "kind": "Deployment", + "namespace": "bstein-dev-home", + "name": "bstein-dev-home-vault-sync", + "labels": { + "app": "bstein-dev-home-vault-sync" + }, + "serviceAccountName": "bstein-dev-home-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" ] }, { @@ -225,7 +283,7 @@ "labels": { "app": "chat-ai-gateway" }, - "serviceAccountName": null, + "serviceAccountName": "bstein-dev-home", "nodeSelector": { "kubernetes.io/arch": "arm64", "node-role.kubernetes.io/worker": "true" @@ -249,6 +307,19 @@ "python:3.11-slim" ] }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "comms-vault-sync", + "labels": { + "app": "comms-vault-sync" + }, + "serviceAccountName": "comms-vault", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "comms", @@ -256,7 +327,7 @@ "labels": { "app": "coturn" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -286,7 +357,7 @@ "labels": { "app": "livekit" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -301,12 +372,12 @@ "labels": { "app": "livekit-token-service" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, "images": [ - "ghcr.io/element-hq/lk-jwt-service:0.3.0" + "registry.bstein.dev/tools/lk-jwt-service-vault:0.3.0" ] }, { @@ -316,7 +387,7 @@ "labels": { "app": "matrix-authentication-service" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -331,7 +402,7 @@ "labels": { "app.kubernetes.io/name": "matrix-guest-register" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": {}, "images": [ "python:3.11-slim" @@ -365,6 +436,19 @@ "ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9" ] }, + { + "kind": "Deployment", + "namespace": "crypto", + "name": "crypto-vault-sync", + "labels": { + "app": "crypto-vault-sync" + }, + "serviceAccountName": "crypto-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "crypto", @@ -372,7 +456,7 @@ "labels": { "app": "monero-p2pool" }, - "serviceAccountName": null, + 
"serviceAccountName": "crypto-vault-sync", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -395,6 +479,53 @@ "registry.bstein.dev/crypto/monerod:0.18.4.1" ] }, + { + "kind": "Deployment", + "namespace": "crypto", + "name": "wallet-monero-temp", + "labels": { + "app": "wallet-monero-temp" + }, + "serviceAccountName": "crypto-vault-sync", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1" + ] + }, + { + "kind": "Deployment", + "namespace": "finance", + "name": "actual-budget", + "labels": { + "app": "actual-budget" + }, + "serviceAccountName": "finance-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "actualbudget/actual-server:26.1.0-alpine@sha256:34aae5813fdfee12af2a50c4d0667df68029f1d61b90f45f282473273eb70d0d" + ] + }, + { + "kind": "Deployment", + "namespace": "finance", + "name": "firefly", + "labels": { + "app": "firefly" + }, + "serviceAccountName": "finance-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "fireflyiii/core:version-6.4.15" + ] + }, { "kind": "Deployment", "namespace": "flux-system", @@ -516,7 +647,7 @@ "labels": { "app": "gitea" }, - "serviceAccountName": null, + "serviceAccountName": "gitea-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -524,6 +655,36 @@ "gitea/gitea:1.23" ] }, + { + "kind": "Deployment", + "namespace": "harbor", + "name": "harbor-vault-sync", + "labels": { + "app": "harbor-vault-sync" + }, + "serviceAccountName": "harbor-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "Deployment", + "namespace": "health", + "name": "wger", + "labels": { + "app": "wger" + }, + "serviceAccountName": "health-vault-sync", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "nginx:1.27.5-alpine@sha256:65645c7bb6a0661892a8b03b89d0743208a18dd2f3f17a54ef4b76fb8e2f2a10", + "wger/server@sha256:710588b78af4e0aa0b4d8a8061e4563e16eae80eeaccfe7f9e0d9cbdd7f0cbc5" + ] + }, { "kind": "Deployment", "namespace": "jellyfin", @@ -531,7 +692,7 @@ "labels": { "app": "jellyfin" }, - "serviceAccountName": null, + "serviceAccountName": "pegasus-vault-sync", "nodeSelector": {}, "images": [ "docker.io/jellyfin/jellyfin:10.11.5" @@ -544,14 +705,27 @@ "labels": { "app": "pegasus" }, - "serviceAccountName": null, + "serviceAccountName": "pegasus-vault-sync", "nodeSelector": { "kubernetes.io/arch": "arm64", "node-role.kubernetes.io/worker": "true" }, "images": [ "alpine:3.20", - "registry.bstein.dev/streaming/pegasus:1.2.32" + "registry.bstein.dev/streaming/pegasus-vault:1.2.32" + ] + }, + { + "kind": "Deployment", + "namespace": "jellyfin", + "name": "pegasus-vault-sync", + "labels": { + "app": "pegasus-vault-sync" + }, + "serviceAccountName": "pegasus-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" ] }, { @@ -570,6 +744,35 @@ "jenkins/jenkins:2.528.3-jdk21" ] }, + { + "kind": "Deployment", + "namespace": "jenkins", + "name": "jenkins-vault-sync", + "labels": { + "app": "jenkins-vault-sync" + }, + "serviceAccountName": "jenkins-vault-sync", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "DaemonSet", + "namespace": "kube-system", + "name": "ntp-sync", + "labels": { + "app": "ntp-sync" + }, + 
"serviceAccountName": null, + "nodeSelector": {}, + "images": [ + "public.ecr.aws/docker/library/busybox:1.36.1" + ] + }, { "kind": "DaemonSet", "namespace": "kube-system", @@ -636,6 +839,21 @@ "hashicorp/vault-csi-provider:1.7.0" ] }, + { + "kind": "Deployment", + "namespace": "kube-system", + "name": "coredns", + "labels": { + "k8s-app": "kube-dns" + }, + "serviceAccountName": "coredns", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "registry.bstein.dev/infra/coredns:1.12.1" + ] + }, { "kind": "DaemonSet", "namespace": "logging", @@ -681,6 +899,19 @@ "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" ] }, + { + "kind": "Deployment", + "namespace": "logging", + "name": "logging-vault-sync", + "labels": { + "app": "logging-vault-sync" + }, + "serviceAccountName": "logging-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "logging", @@ -688,12 +919,27 @@ "labels": { "app": "oauth2-proxy-logs" }, - "serviceAccountName": null, + "serviceAccountName": "logging-vault-sync", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, "images": [ - "quay.io/oauth2-proxy/oauth2-proxy:v7.6.0" + "registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0" + ] + }, + { + "kind": "Deployment", + "namespace": "longhorn-system", + "name": "longhorn-vault-sync", + "labels": { + "app": "longhorn-vault-sync" + }, + "serviceAccountName": "longhorn-vault-sync", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "alpine:3.20" ] }, { @@ -703,7 +949,7 @@ "labels": { "app": "oauth2-proxy-longhorn" }, - "serviceAccountName": null, + "serviceAccountName": "longhorn-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -729,14 +975,45 @@ { "kind": "Deployment", "namespace": "mailu-mailserver", - "name": "mailu-sync-listener", + "name": "mailu-vault-sync", "labels": { - "app": "mailu-sync-listener" + "app": "mailu-vault-sync" }, - "serviceAccountName": null, + "serviceAccountName": "mailu-vault-sync", "nodeSelector": {}, "images": [ - "python:3.11-alpine" + "alpine:3.20" + ] + }, + { + "kind": "DaemonSet", + "namespace": "maintenance", + "name": "disable-k3s-traefik", + "labels": { + "app": "disable-k3s-traefik" + }, + "serviceAccountName": "disable-k3s-traefik", + "nodeSelector": { + "node-role.kubernetes.io/control-plane": "true" + }, + "images": [ + "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" + ] + }, + { + "kind": "DaemonSet", + "namespace": "maintenance", + "name": "k3s-agent-restart", + "labels": { + "app": "k3s-agent-restart" + }, + "serviceAccountName": "node-nofile", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" ] }, { @@ -767,6 +1044,35 @@ "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" ] }, + { + "kind": "Deployment", + "namespace": "maintenance", + "name": "ariadne", + "labels": { + "app": "ariadne" + }, + "serviceAccountName": "ariadne", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/bstein/ariadne:0.1.0-49" + ] + }, + { + "kind": "Deployment", + "namespace": "maintenance", + "name": "maintenance-vault-sync", + "labels": { + "app": "maintenance-vault-sync" + }, + "serviceAccountName": 
"maintenance-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "DaemonSet", "namespace": "monitoring", @@ -795,6 +1101,19 @@ "python:3.10-slim" ] }, + { + "kind": "Deployment", + "namespace": "monitoring", + "name": "monitoring-vault-sync", + "labels": { + "app": "monitoring-vault-sync" + }, + "serviceAccountName": "monitoring-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "monitoring", @@ -802,7 +1121,7 @@ "labels": { "app": "postmark-exporter" }, - "serviceAccountName": null, + "serviceAccountName": "monitoring-vault-sync", "nodeSelector": {}, "images": [ "python:3.12-alpine" @@ -830,7 +1149,7 @@ "labels": { "app": "nextcloud" }, - "serviceAccountName": null, + "serviceAccountName": "nextcloud-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -845,7 +1164,7 @@ "labels": { "app": "outline" }, - "serviceAccountName": null, + "serviceAccountName": "outline-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -875,7 +1194,7 @@ "labels": { "app": "planka" }, - "serviceAccountName": null, + "serviceAccountName": "planka-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -895,7 +1214,8 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "postgres:15" + "postgres:15", + "quay.io/prometheuscommunity/postgres-exporter:v0.15.0" ] }, { @@ -905,8 +1225,11 @@ "labels": { "app": "keycloak" }, - "serviceAccountName": null, - "nodeSelector": {}, + "serviceAccountName": "sso-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, "images": [ "quay.io/keycloak/keycloak:26.0.7" ] @@ -918,12 +1241,25 @@ "labels": { "app": "oauth2-proxy" }, - "serviceAccountName": null, + "serviceAccountName": "sso-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, "images": [ - "quay.io/oauth2-proxy/oauth2-proxy:v7.6.0" + "registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0" + ] + }, + { + "kind": "Deployment", + "namespace": "sso", + "name": "sso-vault-sync", + "labels": { + "app": "sso-vault-sync" + }, + "serviceAccountName": "sso-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" ] }, { @@ -933,7 +1269,7 @@ "labels": { "app": "openldap" }, - "serviceAccountName": null, + "serviceAccountName": "sso-vault", "nodeSelector": { "kubernetes.io/arch": "arm64", "node-role.kubernetes.io/worker": "true" @@ -951,7 +1287,7 @@ }, "serviceAccountName": "sui-metrics", "nodeSelector": { - "kubernetes.io/hostname": "titan-24" + "hardware": "rpi5" }, "images": [ "victoriametrics/vmagent:v1.103.0" @@ -962,7 +1298,9 @@ "namespace": "traefik", "name": "traefik", "labels": { - "app": "traefik" + "app": "traefik", + "app.kubernetes.io/instance": "traefik-kube-system", + "app.kubernetes.io/name": "traefik" }, "serviceAccountName": "traefik-ingress-controller", "nodeSelector": { @@ -995,10 +1333,13 @@ "labels": { "app": "vaultwarden" }, - "serviceAccountName": null, - "nodeSelector": {}, + "serviceAccountName": "vaultwarden-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, "images": [ - "vaultwarden/server:1.33.2" + "vaultwarden/server:1.35.2" ] } ], @@ -1565,6 +1906,54 @@ } ] }, + { + "namespace": "crypto", + "name": "wallet-monero-temp", + "type": "ClusterIP", + "selector": { + "app": "wallet-monero-temp" + }, + "ports": [ + { + "name": "rpc", + "port": 18083, + "targetPort": 18083, + "protocol": "TCP" + } + ] + }, + { + "namespace": "finance", + "name": 
"actual-budget", + "type": "ClusterIP", + "selector": { + "app": "actual-budget" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 5006, + "protocol": "TCP" + } + ] + }, + { + "namespace": "finance", + "name": "firefly", + "type": "ClusterIP", + "selector": { + "app": "firefly" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 8080, + "protocol": "TCP" + } + ] + }, { "namespace": "flux-system", "name": "notification-controller", @@ -1632,7 +2021,7 @@ { "namespace": "gitea", "name": "gitea-ssh", - "type": "NodePort", + "type": "LoadBalancer", "selector": { "app": "gitea" }, @@ -1645,6 +2034,22 @@ } ] }, + { + "namespace": "health", + "name": "wger", + "type": "ClusterIP", + "selector": { + "app": "wger" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, { "namespace": "jellyfin", "name": "jellyfin", @@ -1699,29 +2104,6 @@ } ] }, - { - "namespace": "kube-system", - "name": "traefik", - "type": "LoadBalancer", - "selector": { - "app.kubernetes.io/instance": "traefik-kube-system", - "app.kubernetes.io/name": "traefik" - }, - "ports": [ - { - "name": "web", - "port": 80, - "targetPort": "web", - "protocol": "TCP" - }, - { - "name": "websecure", - "port": 443, - "targetPort": "websecure", - "protocol": "TCP" - } - ] - }, { "namespace": "logging", "name": "oauth2-proxy-logs", @@ -1803,17 +2185,17 @@ ] }, { - "namespace": "mailu-mailserver", - "name": "mailu-sync-listener", + "namespace": "maintenance", + "name": "ariadne", "type": "ClusterIP", "selector": { - "app": "mailu-sync-listener" + "app": "ariadne" }, "ports": [ { "name": "http", - "port": 8080, - "targetPort": 8080, + "port": 80, + "targetPort": "http", "protocol": "TCP" } ] @@ -1959,6 +2341,12 @@ "port": 5432, "targetPort": 5432, "protocol": "TCP" + }, + { + "name": "metrics", + "port": 9187, + "targetPort": 9187, + "protocol": "TCP" } ] }, @@ -2032,6 +2420,28 @@ } ] }, + { + "namespace": "traefik", + "name": "traefik", + "type": "LoadBalancer", + "selector": { + "app": "traefik" + }, + "ports": [ + { + "name": "web", + "port": 80, + "targetPort": "web", + "protocol": "TCP" + }, + { + "name": "websecure", + "port": 443, + "targetPort": "websecure", + "protocol": "TCP" + } + ] + }, { "namespace": "traefik", "name": "traefik-metrics", @@ -2210,6 +2620,26 @@ "source": "bstein-dev-home" } }, + { + "host": "budget.bstein.dev", + "path": "/", + "backend": { + "namespace": "finance", + "service": "actual-budget", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "actual-budget" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "actual-budget", + "source": "finance" + } + }, { "host": "call.live.bstein.dev", "path": "/", @@ -2290,6 +2720,26 @@ "source": "nextcloud" } }, + { + "host": "health.bstein.dev", + "path": "/", + "backend": { + "namespace": "health", + "service": "wger", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "wger" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "wger", + "source": "health" + } + }, { "host": "kit.live.bstein.dev", "path": "/livekit/jwt", @@ -2385,6 +2835,106 @@ "source": "comms" } }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/r0/register", + "backend": { + "namespace": "comms", + "service": "matrix-guest-register", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-guest-register" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + 
"path": "/_matrix/client/v3/login", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/logout", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/refresh", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/register", + "backend": { + "namespace": "comms", + "service": "matrix-guest-register", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-guest-register" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, { "host": "logs.bstein.dev", "path": "/", @@ -2650,6 +3200,26 @@ "source": "monerod" } }, + { + "host": "money.bstein.dev", + "path": "/", + "backend": { + "namespace": "finance", + "service": "firefly", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "firefly" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "firefly", + "source": "finance" + } + }, { "host": "notes.bstein.dev", "path": "/", @@ -2838,7 +3408,6 @@ "matrix.live.bstein.dev" ], "comms:comms/othrys-synapse": [ - "bstein.dev", "kit.live.bstein.dev", "live.bstein.dev", "matrix.live.bstein.dev", @@ -2853,6 +3422,9 @@ "logging:logging/data-prepper": [ "registry.bstein.dev" ], + "longhorn:longhorn-system/longhorn": [ + "registry.bstein.dev" + ], "mailu:mailu-mailserver/mailu": [ "bstein.dev", "mail.bstein.dev" @@ -2862,8 +3434,12 @@ ], "monitoring:monitoring/grafana": [ "bstein.dev", + "mail.bstein.dev", "metrics.bstein.dev", "sso.bstein.dev" + ], + "monitoring:monitoring/kube-state-metrics": [ + "atlas.bstein.dev" ] } } diff --git a/knowledge/catalog/atlas.yaml b/knowledge/catalog/atlas.yaml index f3e04a8..637b5f9 100644 --- a/knowledge/catalog/atlas.yaml +++ b/knowledge/catalog/atlas.yaml @@ -8,6 +8,15 @@ sources: - name: bstein-dev-home path: services/bstein-dev-home targetNamespace: bstein-dev-home +- name: bstein-dev-home-migrations + path: services/bstein-dev-home/migrations + targetNamespace: bstein-dev-home +- name: cert-manager + path: infrastructure/cert-manager + targetNamespace: cert-manager +- name: cert-manager-cleanup + path: infrastructure/cert-manager/cleanup + targetNamespace: cert-manager - name: comms path: services/comms targetNamespace: comms @@ -17,6 +26,9 @@ sources: - name: crypto path: services/crypto targetNamespace: crypto +- name: finance + path: services/finance + targetNamespace: finance - name: flux-system path: clusters/atlas/flux-system targetNamespace: null @@ -29,6 +41,9 @@ sources: - name: harbor path: services/harbor targetNamespace: harbor +- name: health + path: services/health + targetNamespace: health - name: helm path: infrastructure/sources/helm targetNamespace: flux-system @@ -44,6 +59,12 
@@ sources: - name: logging path: services/logging targetNamespace: null +- name: longhorn + path: infrastructure/longhorn/core + targetNamespace: longhorn-system +- name: longhorn-adopt + path: infrastructure/longhorn/adopt + targetNamespace: longhorn-system - name: longhorn-ui path: infrastructure/longhorn/ui-ingress targetNamespace: longhorn-system @@ -98,9 +119,15 @@ sources: - name: vault-csi path: infrastructure/vault-csi targetNamespace: kube-system +- name: vault-injector + path: infrastructure/vault-injector + targetNamespace: vault - name: vaultwarden path: services/vaultwarden targetNamespace: vaultwarden +- name: wallet-monero-temp + path: services/crypto/wallet-monero-temp + targetNamespace: crypto - name: xmr-miner path: services/crypto/xmr-miner targetNamespace: crypto @@ -124,7 +151,7 @@ workloads: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - - registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92 + - registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-157 - kind: Deployment namespace: bstein-dev-home name: bstein-dev-home-frontend @@ -135,13 +162,22 @@ workloads: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - - registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92 + - registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-157 +- kind: Deployment + namespace: bstein-dev-home + name: bstein-dev-home-vault-sync + labels: + app: bstein-dev-home-vault-sync + serviceAccountName: bstein-dev-home-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: bstein-dev-home name: chat-ai-gateway labels: app: chat-ai-gateway - serviceAccountName: null + serviceAccountName: bstein-dev-home nodeSelector: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' @@ -157,12 +193,21 @@ workloads: hardware: rpi5 images: - python:3.11-slim +- kind: Deployment + namespace: comms + name: comms-vault-sync + labels: + app: comms-vault-sync + serviceAccountName: comms-vault + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: comms name: coturn labels: app: coturn - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: @@ -182,7 +227,7 @@ workloads: name: livekit labels: app: livekit - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: @@ -192,17 +237,17 @@ workloads: name: livekit-token-service labels: app: livekit-token-service - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: - - ghcr.io/element-hq/lk-jwt-service:0.3.0 + - registry.bstein.dev/tools/lk-jwt-service-vault:0.3.0 - kind: Deployment namespace: comms name: matrix-authentication-service labels: app: matrix-authentication-service - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: @@ -212,7 +257,7 @@ workloads: name: matrix-guest-register labels: app.kubernetes.io/name: matrix-guest-register - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: {} images: - python:3.11-slim @@ -235,12 +280,21 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9 +- kind: Deployment + namespace: crypto + name: crypto-vault-sync + labels: + app: crypto-vault-sync + serviceAccountName: crypto-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: crypto name: monero-p2pool labels: app: 
monero-p2pool - serviceAccountName: null + serviceAccountName: crypto-vault-sync nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -255,6 +309,38 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - registry.bstein.dev/crypto/monerod:0.18.4.1 +- kind: Deployment + namespace: crypto + name: wallet-monero-temp + labels: + app: wallet-monero-temp + serviceAccountName: crypto-vault-sync + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1 +- kind: Deployment + namespace: finance + name: actual-budget + labels: + app: actual-budget + serviceAccountName: finance-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - actualbudget/actual-server:26.1.0-alpine@sha256:34aae5813fdfee12af2a50c4d0667df68029f1d61b90f45f282473273eb70d0d +- kind: Deployment + namespace: finance + name: firefly + labels: + app: firefly + serviceAccountName: finance-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - fireflyiii/core:version-6.4.15 - kind: Deployment namespace: flux-system name: helm-controller @@ -344,17 +430,38 @@ workloads: name: gitea labels: app: gitea - serviceAccountName: null + serviceAccountName: gitea-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: - gitea/gitea:1.23 +- kind: Deployment + namespace: harbor + name: harbor-vault-sync + labels: + app: harbor-vault-sync + serviceAccountName: harbor-vault-sync + nodeSelector: {} + images: + - alpine:3.20 +- kind: Deployment + namespace: health + name: wger + labels: + app: wger + serviceAccountName: health-vault-sync + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - nginx:1.27.5-alpine@sha256:65645c7bb6a0661892a8b03b89d0743208a18dd2f3f17a54ef4b76fb8e2f2a10 + - wger/server@sha256:710588b78af4e0aa0b4d8a8061e4563e16eae80eeaccfe7f9e0d9cbdd7f0cbc5 - kind: Deployment namespace: jellyfin name: jellyfin labels: app: jellyfin - serviceAccountName: null + serviceAccountName: pegasus-vault-sync nodeSelector: {} images: - docker.io/jellyfin/jellyfin:10.11.5 @@ -363,13 +470,22 @@ workloads: name: pegasus labels: app: pegasus - serviceAccountName: null + serviceAccountName: pegasus-vault-sync nodeSelector: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - alpine:3.20 - - registry.bstein.dev/streaming/pegasus:1.2.32 + - registry.bstein.dev/streaming/pegasus-vault:1.2.32 +- kind: Deployment + namespace: jellyfin + name: pegasus-vault-sync + labels: + app: pegasus-vault-sync + serviceAccountName: pegasus-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: jenkins name: jenkins @@ -381,6 +497,26 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - jenkins/jenkins:2.528.3-jdk21 +- kind: Deployment + namespace: jenkins + name: jenkins-vault-sync + labels: + app: jenkins-vault-sync + serviceAccountName: jenkins-vault-sync + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - alpine:3.20 +- kind: DaemonSet + namespace: kube-system + name: ntp-sync + labels: + app: ntp-sync + serviceAccountName: null + nodeSelector: {} + images: + - public.ecr.aws/docker/library/busybox:1.36.1 - kind: DaemonSet namespace: kube-system name: nvidia-device-plugin-jetson @@ -427,6 +563,16 @@ workloads: kubernetes.io/os: linux images: - hashicorp/vault-csi-provider:1.7.0 +- kind: Deployment + namespace: kube-system + name: 
coredns + labels: + k8s-app: kube-dns + serviceAccountName: coredns + nodeSelector: + kubernetes.io/os: linux + images: + - registry.bstein.dev/infra/coredns:1.12.1 - kind: DaemonSet namespace: logging name: node-image-gc-rpi4 @@ -457,22 +603,41 @@ workloads: hardware: rpi5 images: - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 +- kind: Deployment + namespace: logging + name: logging-vault-sync + labels: + app: logging-vault-sync + serviceAccountName: logging-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: logging name: oauth2-proxy-logs labels: app: oauth2-proxy-logs - serviceAccountName: null + serviceAccountName: logging-vault-sync nodeSelector: node-role.kubernetes.io/worker: 'true' images: - - quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 + - registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0 +- kind: Deployment + namespace: longhorn-system + name: longhorn-vault-sync + labels: + app: longhorn-vault-sync + serviceAccountName: longhorn-vault-sync + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - alpine:3.20 - kind: Deployment namespace: longhorn-system name: oauth2-proxy-longhorn labels: app: oauth2-proxy-longhorn - serviceAccountName: null + serviceAccountName: longhorn-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -489,13 +654,34 @@ workloads: - registry.bstein.dev/bstein/kubectl:1.35.0 - kind: Deployment namespace: mailu-mailserver - name: mailu-sync-listener + name: mailu-vault-sync labels: - app: mailu-sync-listener - serviceAccountName: null + app: mailu-vault-sync + serviceAccountName: mailu-vault-sync nodeSelector: {} images: - - python:3.11-alpine + - alpine:3.20 +- kind: DaemonSet + namespace: maintenance + name: disable-k3s-traefik + labels: + app: disable-k3s-traefik + serviceAccountName: disable-k3s-traefik + nodeSelector: + node-role.kubernetes.io/control-plane: 'true' + images: + - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 +- kind: DaemonSet + namespace: maintenance + name: k3s-agent-restart + labels: + app: k3s-agent-restart + serviceAccountName: node-nofile + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 - kind: DaemonSet namespace: maintenance name: node-image-sweeper @@ -515,6 +701,26 @@ workloads: nodeSelector: {} images: - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 +- kind: Deployment + namespace: maintenance + name: ariadne + labels: + app: ariadne + serviceAccountName: ariadne + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - registry.bstein.dev/bstein/ariadne:0.1.0-49 +- kind: Deployment + namespace: maintenance + name: maintenance-vault-sync + labels: + app: maintenance-vault-sync + serviceAccountName: maintenance-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: DaemonSet namespace: monitoring name: dcgm-exporter @@ -534,12 +740,21 @@ workloads: jetson: 'true' images: - python:3.10-slim +- kind: Deployment + namespace: monitoring + name: monitoring-vault-sync + labels: + app: monitoring-vault-sync + serviceAccountName: monitoring-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: monitoring name: postmark-exporter labels: app: postmark-exporter - serviceAccountName: null + serviceAccountName: monitoring-vault-sync 
nodeSelector: {} images: - python:3.12-alpine @@ -558,7 +773,7 @@ workloads: name: nextcloud labels: app: nextcloud - serviceAccountName: null + serviceAccountName: nextcloud-vault nodeSelector: hardware: rpi5 images: @@ -568,7 +783,7 @@ workloads: name: outline labels: app: outline - serviceAccountName: null + serviceAccountName: outline-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -588,7 +803,7 @@ workloads: name: planka labels: app: planka - serviceAccountName: null + serviceAccountName: planka-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -603,13 +818,16 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - postgres:15 + - quay.io/prometheuscommunity/postgres-exporter:v0.15.0 - kind: Deployment namespace: sso name: keycloak labels: app: keycloak - serviceAccountName: null - nodeSelector: {} + serviceAccountName: sso-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' images: - quay.io/keycloak/keycloak:26.0.7 - kind: Deployment @@ -617,17 +835,26 @@ workloads: name: oauth2-proxy labels: app: oauth2-proxy - serviceAccountName: null + serviceAccountName: sso-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: - - quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 + - registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0 +- kind: Deployment + namespace: sso + name: sso-vault-sync + labels: + app: sso-vault-sync + serviceAccountName: sso-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: StatefulSet namespace: sso name: openldap labels: app: openldap - serviceAccountName: null + serviceAccountName: sso-vault nodeSelector: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' @@ -640,7 +867,7 @@ workloads: app: sui-metrics serviceAccountName: sui-metrics nodeSelector: - kubernetes.io/hostname: titan-24 + hardware: rpi5 images: - victoriametrics/vmagent:v1.103.0 - kind: Deployment @@ -648,6 +875,8 @@ workloads: name: traefik labels: app: traefik + app.kubernetes.io/instance: traefik-kube-system + app.kubernetes.io/name: traefik serviceAccountName: traefik-ingress-controller nodeSelector: node-role.kubernetes.io/worker: 'true' @@ -669,10 +898,12 @@ workloads: name: vaultwarden labels: app: vaultwarden - serviceAccountName: null - nodeSelector: {} + serviceAccountName: vaultwarden-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' images: - - vaultwarden/server:1.33.2 + - vaultwarden/server:1.35.2 services: - namespace: ai name: ollama @@ -1040,6 +1271,36 @@ services: port: 3333 targetPort: 3333 protocol: TCP +- namespace: crypto + name: wallet-monero-temp + type: ClusterIP + selector: + app: wallet-monero-temp + ports: + - name: rpc + port: 18083 + targetPort: 18083 + protocol: TCP +- namespace: finance + name: actual-budget + type: ClusterIP + selector: + app: actual-budget + ports: + - name: http + port: 80 + targetPort: 5006 + protocol: TCP +- namespace: finance + name: firefly + type: ClusterIP + selector: + app: firefly + ports: + - name: http + port: 80 + targetPort: 8080 + protocol: TCP - namespace: flux-system name: notification-controller type: ClusterIP @@ -1082,7 +1343,7 @@ services: protocol: TCP - namespace: gitea name: gitea-ssh - type: NodePort + type: LoadBalancer selector: app: gitea ports: @@ -1090,6 +1351,16 @@ services: port: 2242 targetPort: 2242 protocol: TCP +- namespace: health + name: wger + type: ClusterIP + selector: + app: wger + ports: + - name: http + port: 80 + targetPort: http + protocol: TCP - 
namespace: jellyfin name: jellyfin type: ClusterIP @@ -1124,21 +1395,6 @@ services: port: 50000 targetPort: 50000 protocol: TCP -- namespace: kube-system - name: traefik - type: LoadBalancer - selector: - app.kubernetes.io/instance: traefik-kube-system - app.kubernetes.io/name: traefik - ports: - - name: web - port: 80 - targetPort: web - protocol: TCP - - name: websecure - port: 443 - targetPort: websecure - protocol: TCP - namespace: logging name: oauth2-proxy-logs type: ClusterIP @@ -1191,15 +1447,15 @@ services: port: 4190 targetPort: 4190 protocol: TCP -- namespace: mailu-mailserver - name: mailu-sync-listener +- namespace: maintenance + name: ariadne type: ClusterIP selector: - app: mailu-sync-listener + app: ariadne ports: - name: http - port: 8080 - targetPort: 8080 + port: 80 + targetPort: http protocol: TCP - namespace: monitoring name: dcgm-exporter @@ -1291,6 +1547,10 @@ services: port: 5432 targetPort: 5432 protocol: TCP + - name: metrics + port: 9187 + targetPort: 9187 + protocol: TCP - namespace: sso name: keycloak type: ClusterIP @@ -1335,6 +1595,20 @@ services: port: 8429 targetPort: 8429 protocol: TCP +- namespace: traefik + name: traefik + type: LoadBalancer + selector: + app: traefik + ports: + - name: web + port: 80 + targetPort: web + protocol: TCP + - name: websecure + port: 443 + targetPort: websecure + protocol: TCP - namespace: traefik name: traefik-metrics type: ClusterIP @@ -1447,6 +1721,19 @@ http_endpoints: kind: Ingress name: bstein-dev-home source: bstein-dev-home +- host: budget.bstein.dev + path: / + backend: + namespace: finance + service: actual-budget + port: 80 + workloads: + - kind: Deployment + name: actual-budget + via: + kind: Ingress + name: actual-budget + source: finance - host: call.live.bstein.dev path: / backend: @@ -1499,6 +1786,19 @@ http_endpoints: kind: Ingress name: nextcloud source: nextcloud +- host: health.bstein.dev + path: / + backend: + namespace: health + service: wger + port: 80 + workloads: + - kind: Deployment + name: wger + via: + kind: Ingress + name: wger + source: health - host: kit.live.bstein.dev path: /livekit/jwt backend: @@ -1558,6 +1858,65 @@ http_endpoints: kind: Ingress name: matrix-routing source: comms +- host: live.bstein.dev + path: /_matrix/client/r0/register + backend: + namespace: comms + service: matrix-guest-register + port: 8080 + workloads: &id003 + - kind: Deployment + name: matrix-guest-register + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/login + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: &id002 + - kind: Deployment + name: matrix-authentication-service + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/logout + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: *id002 + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/refresh + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: *id002 + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/register + backend: + namespace: comms + service: matrix-guest-register + port: 8080 + workloads: *id003 + via: + kind: Ingress + name: matrix-routing + source: comms - host: logs.bstein.dev path: / backend: @@ -1601,9 +1960,7 @@ http_endpoints: namespace: comms service: 
matrix-authentication-service port: 8080 - workloads: &id002 - - kind: Deployment - name: matrix-authentication-service + workloads: *id002 via: kind: Ingress name: matrix-routing @@ -1647,9 +2004,7 @@ http_endpoints: namespace: comms service: matrix-guest-register port: 8080 - workloads: &id003 - - kind: Deployment - name: matrix-guest-register + workloads: *id003 via: kind: Ingress name: matrix-routing @@ -1722,6 +2077,19 @@ http_endpoints: kind: Ingress name: monerod source: monerod +- host: money.bstein.dev + path: / + backend: + namespace: finance + service: firefly + port: 80 + workloads: + - kind: Deployment + name: firefly + via: + kind: Ingress + name: firefly + source: finance - host: notes.bstein.dev path: / backend: @@ -1845,7 +2213,6 @@ helmrelease_host_hints: - live.bstein.dev - matrix.live.bstein.dev comms:comms/othrys-synapse: - - bstein.dev - kit.live.bstein.dev - live.bstein.dev - matrix.live.bstein.dev @@ -1856,6 +2223,8 @@ helmrelease_host_hints: - registry.bstein.dev logging:logging/data-prepper: - registry.bstein.dev + longhorn:longhorn-system/longhorn: + - registry.bstein.dev mailu:mailu-mailserver/mailu: - bstein.dev - mail.bstein.dev @@ -1863,5 +2232,8 @@ helmrelease_host_hints: - alerts.bstein.dev monitoring:monitoring/grafana: - bstein.dev + - mail.bstein.dev - metrics.bstein.dev - sso.bstein.dev + monitoring:monitoring/kube-state-metrics: + - atlas.bstein.dev diff --git a/knowledge/catalog/metrics.json b/knowledge/catalog/metrics.json new file mode 100644 index 0000000..e929db5 --- /dev/null +++ b/knowledge/catalog/metrics.json @@ -0,0 +1,1880 @@ +[ + { + "dashboard": "Atlas GPU", + "panel_title": "Namespace GPU Share", + "panel_id": 1, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. 
Switching scope changes the denominator.", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "GPU Util by Namespace", + "panel_id": 2, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "GPU Util by Node", + "panel_id": 3, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "Top Pods by GPU Util", + "panel_id": 4, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (range)", + "panel_id": 1, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Attempts / Failures", + "panel_id": 2, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[$__interval]))", + "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "One-off Job Pods (age hours)", + "panel_id": 3, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Stale (>36h)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) 
(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Missing Success", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Suspended", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (1h)", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (24h)", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Runs (1h)", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[1h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Schedule Last Error (hours ago)", + "panel_id": 10, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Schedule Last Success (hours ago)", + "panel_id": 11, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Last Success (hours ago)", + "panel_id": 12, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - 
max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Last Schedule (hours ago)", + "panel_id": 13, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (1h)", + "panel_id": 14, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (30d)", + "panel_id": 15, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Access Requests", + "panel_id": 16, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(ariadne_access_requests_total)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne CI Coverage (%)", + "panel_id": 17, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "ariadne_ci_coverage_percent{repo=\"ariadne\"}" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne CI Tests (latest)", + "panel_id": 18, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "ariadne_ci_tests_total{repo=\"ariadne\"}" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (1d)", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (7d)", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"7d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Mail Bounces (1d)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "max(postmark_outbound_bounced{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Success Rate (1d)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": 
"prometheus", + "exprs": [ + "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Limit Used (30d)", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit_used_percent)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Send Limit (30d)", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Last Success", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_last_success_timestamp_seconds)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Exporter Errors", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(postmark_request_errors_total)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Bounce Rate (1d vs 7d)", + "panel_id": 13, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_bounce_rate)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Bounced (1d vs 7d)", + "panel_id": 14, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_bounced)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (1d vs 7d)", + "panel_id": 15, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_sent)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Exporter Errors", + "panel_id": 16, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(postmark_request_errors_total)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Ingress Success Rate (5m)", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Error Budget Burn (1h)", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Error Budget Burn (6h)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + 
"atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Edge P99 Latency (ms)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Ingress Traffic", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Egress Traffic", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Intra-Cluster Traffic", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Per-Node Throughput", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0) + sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Top Namespaces", + "panel_id": 9, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]) + rate(container_network_receive_bytes_total{namespace!=\"\"}[5m])) by (namespace))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Top Pods", + "panel_id": 10, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(rate(container_network_transmit_bytes_total{pod!=\"\"}[5m]) + rate(container_network_receive_bytes_total{pod!=\"\"}[5m])) by (namespace,pod))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Traefik Routers (req/s)", + 
"panel_id": 11, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum by (router) (rate(traefik_router_requests_total[5m])))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Traefik Entrypoints (req/s)", + "panel_id": 12, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Worker Nodes Ready", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane Ready", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane Workloads", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "API Server 5xx rate", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "API Server P99 latency", + "panel_id": 10, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "etcd P99 latency", + "panel_id": 11, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Node CPU", + "panel_id": 4, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Node RAM", + "panel_id": 5, 
+ "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane (incl. titan-db) CPU", + "panel_id": 6, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane (incl. titan-db) RAM", + "panel_id": 7, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Root Filesystem Usage", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control Plane Ready", + "panel_id": 2, + "panel_type": "gauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control Plane Workloads", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"}) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Stuck Terminating", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (((time() - 
kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Atlas Availability", + "panel_id": 27, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Problem Pods", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Workers Ready", + "panel_id": 1, + "panel_type": "gauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: CPU", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: RAM", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: NET (rx+tx)", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) 
((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: I/O (r+w)", + "panel_id": 10, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Sent (1d)", + "panel_id": 30, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Bounces (1d)", + "panel_id": 31, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "max(postmark_outbound_bounced{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Success Rate (1d)", + "panel_id": 32, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Limit Used (30d)", + "panel_id": 33, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit_used_percent)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Postgres Connections Used", + "panel_id": 34, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Postgres Hottest Connections", + "panel_id": 35, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(1, sum by (datname) (pg_stat_activity_count))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Astreae Usage", + "panel_id": 23, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": 
"Asteria Usage", + "panel_id": 24, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Astreae Free", + "panel_id": 25, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Asteria Free", + "panel_id": 26, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "One-off Job Pods (age hours)", + "panel_id": 40, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Ariadne Attempts / Failures", + "panel_id": 41, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[$__interval]))", + "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Ariadne Test Success Rate", + "panel_id": 42, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Tests with Failures (24h)", + "panel_id": 43, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace CPU Share", + "panel_id": 11, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. 
Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace GPU Share", + "panel_id": 12, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace RAM Share", + "panel_id": 13, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Worker Node CPU", + "panel_id": 14, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Worker Node RAM", + "panel_id": 15, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", 
\"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control plane CPU", + "panel_id": 16, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control plane RAM", + "panel_id": 17, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Node Pod Share", + "panel_id": 28, + "panel_type": "piechart", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Top Nodes by Pod Count", + "panel_id": 29, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Cluster Ingress Throughput", + "panel_id": 18, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Cluster Egress Throughput", + "panel_id": 19, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Intra-Cluster Throughput", + "panel_id": 20, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Root Filesystem Usage", + "panel_id": 21, 
+ "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Nodes Closest to Full Root Disks", + "panel_id": 22, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Problem Pods", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Stuck Terminating (>10m)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Control Plane Workloads", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Pods Not Running", + "panel_id": 5, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 6, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(time() - kube_pod_created{pod!=\"\"}) * 
on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) max by (namespace,pod,container,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Terminating >10m", + "panel_id": 7, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(((time() - kube_pod_deletion_timestamp{pod!=\"\"}) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) * on(namespace,pod) group_left(node) kube_pod_info)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Node Pod Share", + "panel_id": 8, + "panel_type": "piechart", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Top Nodes by Pod Count", + "panel_id": 9, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Namespace Plurality by Node v27", + "panel_id": 10, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) 
(kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Usage", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Usage", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Free", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Free", 
+ "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Per-Node Usage", + "panel_id": 5, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Per-Node Usage", + "panel_id": 6, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Usage History", + "panel_id": 7, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Usage History", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Maintenance Sweepers Ready", + "panel_id": 30, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Maintenance Cron Freshness (s)", + "panel_id": 31, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "time() - max by (cronjob) 
(kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=\"image-sweeper\"})" + ] + } +] diff --git a/knowledge/catalog/runbooks.json b/knowledge/catalog/runbooks.json index 0718562..960510d 100644 --- a/knowledge/catalog/runbooks.json +++ b/knowledge/catalog/runbooks.json @@ -85,5 +85,13 @@ "clusters/atlas/<...>" ], "body": "# \n\n## What this is\n\n## For users (how to)\n\n## For operators (where configured)\n\n## Troubleshooting (symptoms \u2192 checks)" + }, + { + "path": "software/metis.md", + "title": "metis", + "tags": [], + "entrypoints": [], + "source_paths": [], + "body": "# Metis (node recovery)\n\n## Node classes (current map)\n- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)\n- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)\n- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks)\n- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent)\n- amd64 agents: titan-22/24 (Debian 13, k3s agent)\n- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, plus any newcomers.\n\n### Jetson nodes (titan-20/21)\n- Ubuntu 20.04.6 (Focal), kernel 5.10.104-tegra, CRI containerd 2.0.5-k3s2, arch arm64.\n- Storage: NVMe 232G at / (ext4); onboard mmc partitions present but root on NVMe; 1.9T sda present (unused).\n- k3s agent with drop-in 99-nofile.conf.\n\n## Longhorn disk UUIDs (critical nodes)\n- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4)\n- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4)\n- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4)\n- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4)\n\n## Metis repo (~/Development/metis)\n- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`).\n- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints).\n- `AGENTS.md` in repo is untracked and holds raw notes.\n\n## Next implementation steps\n- Add per-class golden image refs and checksums (Harbor or file://) when ready.\n- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. 
Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths.\n- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection.\n- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited.\n\n## Node OS/Kernel/CRI snapshot (Jan 2026)\n- titan-04: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-05: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-06: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-07: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-08: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-09: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0a: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0b: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0c: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-10: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-11: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-12: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-13: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-14: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-15: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-17: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-18: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-19: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-20: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-21: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-22: Debian 13 (trixie), kernel 6.12.41+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64\n- titan-24: Debian 13 (trixie), kernel 6.12.57+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64\n\n\n### External hosts\n- titan-db: Ubuntu 24.10, kernel 6.11.0-1015-raspi, root on /dev/sda2 ext4 (465G), boot vfat /dev/sda1; PostgreSQL service enabled.\n- titan-jh: Arch Linux ARM (rolling), kernel 6.18.4-2-rpi, NVMe root ext4 238G (/), boot vfat 512M; ~495 packages installed (pacman -Q).\n- titan-23/oceanus: TODO audit (future).\n\n\n### Control plane Pis (titan-0a/0b/0c)\n- Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, containerd 2.0.5-k3s2.\n- Storage: 477G SSD root (/dev/sda2 ext4), /boot/firmware vfat (/dev/sda1). 
fstab uses LABEL=writable and LABEL=system-boot.\n- k3s server (control-plane taint expected); etcd snapshots not yet cataloged (TODO).\n\n\n## k3s versions\n- rpi5 workers/control-plane: k3s v1.33.3+k3s1 (crictl v1.31.0-k3s2)\n- rpi4 nodes: k3s v1.31.5+k3s1 (crictl v1.31.0-k3s2)\n- Jetson titan-20/21: k3s v1.33.3+k3s1 (per node info), crictl v1.31.0-k3s2" } ] diff --git a/knowledge/diagrams/atlas-http.mmd b/knowledge/diagrams/atlas-http.mmd index ab7c362..1aa7ac8 100644 --- a/knowledge/diagrams/atlas-http.mmd +++ b/knowledge/diagrams/atlas-http.mmd @@ -17,6 +17,11 @@ flowchart LR host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"] svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend + host_budget_bstein_dev["budget.bstein.dev"] + svc_finance_actual_budget["finance/actual-budget (Service)"] + host_budget_bstein_dev --> svc_finance_actual_budget + wl_finance_actual_budget["finance/actual-budget (Deployment)"] + svc_finance_actual_budget --> wl_finance_actual_budget host_call_live_bstein_dev["call.live.bstein.dev"] svc_comms_element_call["comms/element-call (Service)"] host_call_live_bstein_dev --> svc_comms_element_call @@ -37,6 +42,11 @@ flowchart LR host_cloud_bstein_dev --> svc_nextcloud_nextcloud wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"] svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud + host_health_bstein_dev["health.bstein.dev"] + svc_health_wger["health/wger (Service)"] + host_health_bstein_dev --> svc_health_wger + wl_health_wger["health/wger (Deployment)"] + svc_health_wger --> wl_health_wger host_kit_live_bstein_dev["kit.live.bstein.dev"] svc_comms_livekit_token_service["comms/livekit-token-service (Service)"] host_kit_live_bstein_dev --> svc_comms_livekit_token_service @@ -50,6 +60,14 @@ flowchart LR host_live_bstein_dev --> svc_comms_matrix_wellknown svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"] host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse + svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"] + host_live_bstein_dev --> svc_comms_matrix_guest_register + wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"] + svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register + svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"] + host_live_bstein_dev --> svc_comms_matrix_authentication_service + wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"] + svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service host_logs_bstein_dev["logs.bstein.dev"] svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"] host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs @@ -64,21 +82,20 @@ flowchart LR svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"] host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front host_matrix_live_bstein_dev["matrix.live.bstein.dev"] - svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"] host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service - wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"] - svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown host_matrix_live_bstein_dev --> 
svc_comms_othrys_synapse_matrix_synapse - svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"] host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register - wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"] - svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register host_monero_bstein_dev["monero.bstein.dev"] svc_crypto_monerod["crypto/monerod (Service)"] host_monero_bstein_dev --> svc_crypto_monerod wl_crypto_monerod["crypto/monerod (Deployment)"] svc_crypto_monerod --> wl_crypto_monerod + host_money_bstein_dev["money.bstein.dev"] + svc_finance_firefly["finance/firefly (Service)"] + host_money_bstein_dev --> svc_finance_firefly + wl_finance_firefly["finance/firefly (Deployment)"] + svc_finance_firefly --> wl_finance_firefly host_notes_bstein_dev["notes.bstein.dev"] svc_outline_outline["outline/outline (Service)"] host_notes_bstein_dev --> svc_outline_outline @@ -143,19 +160,29 @@ flowchart LR svc_comms_livekit wl_comms_livekit svc_comms_othrys_synapse_matrix_synapse - svc_comms_matrix_authentication_service - wl_comms_matrix_authentication_service svc_comms_matrix_guest_register wl_comms_matrix_guest_register + svc_comms_matrix_authentication_service + wl_comms_matrix_authentication_service end subgraph crypto[crypto] svc_crypto_monerod wl_crypto_monerod end + subgraph finance[finance] + svc_finance_actual_budget + wl_finance_actual_budget + svc_finance_firefly + wl_finance_firefly + end subgraph gitea[gitea] svc_gitea_gitea wl_gitea_gitea end + subgraph health[health] + svc_health_wger + wl_health_wger + end subgraph jellyfin[jellyfin] svc_jellyfin_pegasus wl_jellyfin_pegasus diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 0931b48..5db798d 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -70,6 +70,7 @@ WORKER_NODES = [ "titan-13", "titan-14", "titan-15", + "titan-16", "titan-17", "titan-18", "titan-19", @@ -207,7 +208,66 @@ def namespace_ram_raw(scope_var): def namespace_gpu_usage_instant(scope_var): - return f"sum(DCGM_FI_DEV_GPU_UTIL{{{namespace_gpu_selector(scope_var)}}}) by (namespace)" + return gpu_usage_by_namespace(scope_var) + + +def jetson_gpu_util_by_node(): + return 'max by (node) (jetson_gr3d_freq_percent{node!=""})' + + +def dcgm_gpu_util_by_node(): + dcgm_pod = 'label_replace(DCGM_FI_DEV_GPU_UTIL, "pod", "$1", "Hostname", "(.*)")' + dcgm_ns = 'label_replace(' + dcgm_pod + ', "namespace", "monitoring", "", "")' + return ( + "avg by (node) (" + f"{dcgm_ns} * on(namespace,pod) group_left(node) " + 'kube_pod_info{namespace="monitoring"}' + ")" + ) + + +def gpu_util_by_node(): + return f"{dcgm_gpu_util_by_node()} or {jetson_gpu_util_by_node()}" + + +def gpu_util_by_hostname(): + return 'label_replace(' + gpu_util_by_node() + ', "Hostname", "$1", "node", "(.*)")' + + +def gpu_node_labels(): + return 'kube_node_labels{label_accelerator=~".+"} or kube_node_labels{label_jetson="true"}' + + +def gpu_requests_by_namespace_node(scope_var): + return ( + "sum by (namespace,node) (" + f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} ' + "* on(namespace,pod) group_left(node) kube_pod_info " + f"* on(node) group_left() ({gpu_node_labels()})" + ")" + ) + + +def gpu_usage_by_namespace(scope_var): + requests_by_ns = gpu_requests_by_namespace_node(scope_var) + total_by_node = f"sum by (node) ({requests_by_ns})" + return ( + "sum by (namespace) (" + f"({requests_by_ns}) / clamp_min({total_by_node}, 1) " + f"* on(node) 
group_left() ({gpu_util_by_node()})" + ")" + ) + + +def jetson_gpu_usage_by_namespace(scope_var): + requests_by_ns = jetson_gpu_requests(scope_var) + total_by_node = f"sum by (node) ({requests_by_ns})" + return ( + "sum by (namespace) (" + f"({requests_by_ns}) / clamp_min({total_by_node}, 1) " + f"* on(node) group_left() {jetson_gpu_util_by_node()}" + ")" + ) def namespace_share_expr(resource_expr): @@ -227,7 +287,7 @@ def namespace_gpu_share_expr(scope_var): usage = namespace_gpu_usage_instant(scope_var) total = f"(sum({usage}) or on() vector(0))" share = f"100 * ({usage}) / clamp_min({total}, 1)" - idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)" + idle = 'label_replace(vector(100), "namespace", "idle", "", "") * scalar(' + total + " == bool 0)" return f"({share}) or ({idle})" @@ -333,9 +393,60 @@ GLUE_STALE = f"({GLUE_LAST_SUCCESS_AGE} > bool {GLUE_STALE_WINDOW_SEC})" GLUE_MISSING = f"({GLUE_JOBS} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time)" GLUE_STALE_ACTIVE = f"({GLUE_STALE} unless on(namespace,cronjob) {GLUE_SUSPENDED})" GLUE_MISSING_ACTIVE = f"({GLUE_MISSING} unless on(namespace,cronjob) {GLUE_SUSPENDED})" -GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE}))" -GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE})" -GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})" +GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE})) or on() vector(0)" +GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE}) or on() vector(0)" +GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED}) or on() vector(0)" +ARIADNE_TASK_ERRORS_RANGE = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[$__range]))' +ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))' +ARIADNE_TASK_ERRORS_1H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[1h]))' +ARIADNE_TASK_ERRORS_30D = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[30d]))' +ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))' +ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))' +ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))' +ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))' +ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))' +ARIADNE_TASK_ATTEMPTS_SERIES = 'sum(increase(ariadne_task_runs_total[$__interval]))' +ARIADNE_TASK_FAILURES_SERIES = 'sum(increase(ariadne_task_runs_total{status="error"}[$__interval]))' +ARIADNE_TASK_WARNINGS_SERIES = ( + 'sum(increase(ariadne_task_runs_total{status!~"ok|error"}[$__interval])) or on() vector(0)' +) +ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600" +ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600" +ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS = ( + "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600" +) +ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = ( + "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600" +) +ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" +ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}' +ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}' +ARIADNE_TEST_SUCCESS_RATE = ( + "100 * " + 
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[30d])) ' + "/ clamp_min(" + 'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[30d])), 1)' +) +ARIADNE_TEST_FAILURES_24H = ( + 'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))' +) +POSTGRES_CONN_USED = ( + 'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") ' + 'or label_replace(max(pg_settings_max_connections), "conn", "max", "__name__", ".*")' +) +POSTGRES_CONN_HOTTEST = 'topk(1, sum by (datname) (pg_stat_activity_count))' +ONEOFF_JOB_OWNER = ( + 'label_replace(kube_job_owner{owner_kind="CronJob"}, "owner_name", "$1", "job_name", "(.*)")' +) +ONEOFF_JOB_PODS = f'(kube_pod_owner{{owner_kind="Job"}} unless on(namespace, owner_name) {ONEOFF_JOB_OWNER})' +ONEOFF_JOB_POD_AGE_HOURS = ( + '((time() - kube_pod_start_time{pod!=""}) / 3600) ' + f'* on(namespace,pod) group_left(owner_name) {ONEOFF_JOB_PODS} ' + '* on(namespace,pod) group_left(phase) ' + 'max by (namespace,pod,phase) (kube_pod_status_phase{phase=~"Running|Succeeded"})' +) +GLUE_LAST_SUCCESS_RANGE_HOURS = f"(time() - max_over_time({GLUE_LAST_SUCCESS}[$__range])) / 3600" +GLUE_LAST_SCHEDULE_RANGE_HOURS = f"(time() - max_over_time({GLUE_LAST_SCHEDULE}[$__range])) / 3600" GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"] GPU_NODE_REGEX = "|".join(GPU_NODES) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" @@ -513,6 +624,7 @@ def timeseries_panel( grid, *, unit="none", + max_value=None, legend=None, legend_display="table", legend_placement="bottom", @@ -537,6 +649,8 @@ def timeseries_panel( "tooltip": {"mode": "multi"}, }, } + if max_value is not None: + panel["fieldConfig"]["defaults"]["max"] = max_value if legend: panel["targets"][0]["legendFormat"] = legend if legend_calcs: @@ -688,13 +802,22 @@ def bargauge_panel( grid, *, unit="none", + legend=None, links=None, limit=None, + sort_order="desc", thresholds=None, decimals=None, instant=False, + overrides=None, ): """Return a bar gauge panel with label-aware reduction.""" + cleaned_expr = expr.strip() + if not cleaned_expr.startswith(("sort(", "sort_desc(")): + if sort_order == "desc": + expr = f"sort_desc({expr})" + elif sort_order == "asc": + expr = f"sort({expr})" panel = { "id": panel_id, "type": "bargauge", @@ -702,7 +825,12 @@ def bargauge_panel( "datasource": PROM_DS, "gridPos": grid, "targets": [ - {"expr": expr, "refId": "A", "legendFormat": "{{node}}", **({"instant": True} if instant else {})} + { + "expr": expr, + "refId": "A", + "legendFormat": legend or "{{node}}", + **({"instant": True} if instant else {}), + } ], "fieldConfig": { "defaults": { @@ -732,6 +860,8 @@ def bargauge_panel( }, }, } + if overrides: + panel["fieldConfig"]["overrides"].extend(overrides) if decimals is not None: panel["fieldConfig"]["defaults"]["decimals"] = decimals if links: @@ -740,7 +870,7 @@ def bargauge_panel( panel["transformations"] = [ { "id": "sortBy", - "options": {"fields": ["Value"], "order": "desc"}, + "options": {"fields": ["Value"], "order": sort_order}, } ] if limit: @@ -780,6 +910,15 @@ def build_overview(): {"color": "red", "value": 3}, ], } + age_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 6}, + {"color": "orange", "value": 24}, + {"color": "red", "value": 48}, + ], + } row1_stats = [ { @@ -982,7 +1121,7 @@ def build_overview(): 30, "Mail Sent (1d)", 
'max(postmark_outbound_sent{window="1d"})', - {"h": 2, "w": 5, "x": 0, "y": 8}, + {"h": 3, "w": 4, "x": 0, "y": 8}, unit="none", links=link_to("atlas-mail"), ) @@ -993,7 +1132,7 @@ def build_overview(): "type": "stat", "title": "Mail Bounces (1d)", "datasource": PROM_DS, - "gridPos": {"h": 2, "w": 5, "x": 10, "y": 8}, + "gridPos": {"h": 3, "w": 4, "x": 8, "y": 8}, "targets": [ { "expr": 'max(postmark_outbound_bounce_rate{window="1d"})', @@ -1039,7 +1178,7 @@ def build_overview(): 32, "Mail Success Rate (1d)", 'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)', - {"h": 2, "w": 5, "x": 5, "y": 8}, + {"h": 3, "w": 4, "x": 4, "y": 8}, unit="percent", thresholds=mail_success_thresholds, decimals=1, @@ -1051,13 +1190,38 @@ def build_overview(): 33, "Mail Limit Used (30d)", "max(postmark_sending_limit_used_percent)", - {"h": 2, "w": 5, "x": 15, "y": 8}, + {"h": 3, "w": 4, "x": 12, "y": 8}, unit="percent", thresholds=mail_limit_thresholds, decimals=1, links=link_to("atlas-mail"), ) ) + panels.append( + stat_panel( + 34, + "Postgres Connections Used", + POSTGRES_CONN_USED, + {"h": 3, "w": 4, "x": 16, "y": 8}, + decimals=0, + text_mode="name_and_value", + legend="{{conn}}", + instant=True, + ) + ) + panels.append( + stat_panel( + 35, + "Postgres Hottest Connections", + POSTGRES_CONN_HOTTEST, + {"h": 3, "w": 4, "x": 20, "y": 8}, + unit="none", + decimals=0, + text_mode="name_and_value", + legend="{{datname}}", + instant=True, + ) + ) storage_panels = [ (23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"), @@ -1071,13 +1235,104 @@ def build_overview(): panel_id, title, expr, - {"h": 6, "w": 6, "x": 6 * idx, "y": 10}, + {"h": 3, "w": 6, "x": 6 * idx, "y": 11}, unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, links=link_to("atlas-storage"), ) ) + panels.append( + bargauge_panel( + 40, + "One-off Job Pods (age hours)", + ONEOFF_JOB_POD_AGE_HOURS, + {"h": 6, "w": 6, "x": 0, "y": 14}, + unit="h", + instant=True, + legend="{{namespace}}/{{pod}}", + thresholds=age_thresholds, + limit=8, + decimals=2, + ) + ) + panels.append( + { + "id": 41, + "type": "timeseries", + "title": "Ariadne Attempts / Failures", + "datasource": PROM_DS, + "gridPos": {"h": 6, "w": 6, "x": 6, "y": 14}, + "targets": [ + {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"}, + {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"}, + ], + "fieldConfig": { + "defaults": {"unit": "none"}, + "overrides": [ + { + "matcher": {"id": "byName", "options": "Attempts"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}} + ], + }, + { + "matcher": {"id": "byName", "options": "Failures"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}} + ], + }, + ], + }, + "options": { + "legend": {"displayMode": "table", "placement": "right"}, + "tooltip": {"mode": "multi"}, + }, + } + ) + panels.append( + timeseries_panel( + 42, + "Ariadne Test Success Rate", + ARIADNE_TEST_SUCCESS_RATE, + {"h": 6, "w": 6, "x": 12, "y": 14}, + unit="percent", + max_value=100, + legend=None, + legend_display="list", + ) + ) + panels.append( + bargauge_panel( + 43, + "Tests with Failures (24h)", + ARIADNE_TEST_FAILURES_24H, + {"h": 6, "w": 6, "x": 18, "y": 14}, + unit="none", + instant=True, + legend="{{result}}", + overrides=[ + { + "matcher": {"id": "byName", "options": "error"}, + "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}}], + }, + { + "matcher": {"id": 
"byName", "options": "failed"}, + "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}], + }, + ], + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 1}, + {"color": "orange", "value": 5}, + {"color": "red", "value": 10}, + ], + }, + ) + ) + cpu_scope = "$namespace_scope_cpu" gpu_scope = "$namespace_scope_gpu" ram_scope = "$namespace_scope_ram" @@ -1087,7 +1342,7 @@ def build_overview(): 11, "Namespace CPU Share", namespace_cpu_share_expr(cpu_scope), - {"h": 9, "w": 8, "x": 0, "y": 16}, + {"h": 9, "w": 8, "x": 0, "y": 20}, links=namespace_scope_links("namespace_scope_cpu"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1097,7 +1352,7 @@ def build_overview(): 12, "Namespace GPU Share", namespace_gpu_share_expr(gpu_scope), - {"h": 9, "w": 8, "x": 8, "y": 16}, + {"h": 9, "w": 8, "x": 8, "y": 20}, links=namespace_scope_links("namespace_scope_gpu"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1107,7 +1362,7 @@ def build_overview(): 13, "Namespace RAM Share", namespace_ram_share_expr(ram_scope), - {"h": 9, "w": 8, "x": 16, "y": 16}, + {"h": 9, "w": 8, "x": 16, "y": 20}, links=namespace_scope_links("namespace_scope_ram"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1119,7 +1374,7 @@ def build_overview(): 14, "Worker Node CPU", node_cpu_expr(worker_filter), - {"h": 12, "w": 12, "x": 0, "y": 32}, + {"h": 12, "w": 12, "x": 0, "y": 36}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1133,7 +1388,7 @@ def build_overview(): 15, "Worker Node RAM", node_mem_expr(worker_filter), - {"h": 12, "w": 12, "x": 12, "y": 32}, + {"h": 12, "w": 12, "x": 12, "y": 36}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1148,7 +1403,7 @@ def build_overview(): 16, "Control plane CPU", node_cpu_expr(CONTROL_ALL_REGEX), - {"h": 10, "w": 12, "x": 0, "y": 44}, + {"h": 10, "w": 12, "x": 0, "y": 48}, unit="percent", legend="{{node}}", legend_display="table", @@ -1160,7 +1415,7 @@ def build_overview(): 17, "Control plane RAM", node_mem_expr(CONTROL_ALL_REGEX), - {"h": 10, "w": 12, "x": 12, "y": 44}, + {"h": 10, "w": 12, "x": 12, "y": 48}, unit="percent", legend="{{node}}", legend_display="table", @@ -1173,7 +1428,7 @@ def build_overview(): 28, "Node Pod Share", '(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100', - {"h": 10, "w": 12, "x": 0, "y": 54}, + {"h": 10, "w": 12, "x": 0, "y": 58}, ) ) panels.append( @@ -1181,7 +1436,7 @@ def build_overview(): 29, "Top Nodes by Pod Count", 'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))', - {"h": 10, "w": 12, "x": 12, "y": 54}, + {"h": 10, "w": 12, "x": 12, "y": 58}, unit="none", limit=12, decimals=0, @@ -1203,7 +1458,7 @@ def build_overview(): 18, "Cluster Ingress Throughput", NET_INGRESS_EXPR, - {"h": 7, "w": 8, "x": 0, "y": 25}, + {"h": 7, "w": 8, "x": 0, "y": 29}, unit="Bps", legend="Ingress (Traefik)", legend_display="list", @@ -1216,7 +1471,7 @@ def build_overview(): 19, "Cluster Egress Throughput", NET_EGRESS_EXPR, - {"h": 7, "w": 8, "x": 8, "y": 25}, + {"h": 7, "w": 8, "x": 8, "y": 29}, unit="Bps", legend="Egress (Traefik)", legend_display="list", @@ -1229,7 +1484,7 @@ def build_overview(): 20, "Intra-Cluster Throughput", NET_INTERNAL_EXPR, - {"h": 7, "w": 8, "x": 16, "y": 25}, + {"h": 7, 
"w": 8, "x": 16, "y": 29}, unit="Bps", legend="Internal traffic", legend_display="list", @@ -1243,7 +1498,7 @@ def build_overview(): 21, "Root Filesystem Usage", root_usage_expr(), - {"h": 16, "w": 12, "x": 0, "y": 64}, + {"h": 16, "w": 12, "x": 0, "y": 68}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1258,7 +1513,7 @@ def build_overview(): 22, "Nodes Closest to Full Root Disks", f"topk(12, {root_usage_expr()})", - {"h": 16, "w": 12, "x": 12, "y": 64}, + {"h": 16, "w": 12, "x": 12, "y": 68}, unit="percent", thresholds=PERCENT_THRESHOLDS, links=link_to("atlas-storage"), @@ -2153,16 +2408,103 @@ def build_mail_dashboard(): } -def build_testing_dashboard(): +def build_jobs_dashboard(): panels = [] - sort_desc = [{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}] + age_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 6}, + {"color": "orange", "value": 24}, + {"color": "red", "value": 48}, + ], + } + recent_error_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "red", "value": None}, + {"color": "orange", "value": 1}, + {"color": "yellow", "value": 6}, + {"color": "green", "value": 24}, + ], + } + + task_error_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 1}, + {"color": "orange", "value": 3}, + {"color": "red", "value": 5}, + ], + } panels.append( - stat_panel( + bargauge_panel( 1, + "Ariadne Task Errors (range)", + ARIADNE_TASK_ERRORS_RANGE, + {"h": 7, "w": 8, "x": 0, "y": 0}, + unit="none", + instant=True, + legend="{{task}}", + thresholds=task_error_thresholds, + ) + ) + panels.append( + { + "id": 2, + "type": "timeseries", + "title": "Ariadne Attempts / Failures", + "datasource": PROM_DS, + "gridPos": {"h": 7, "w": 8, "x": 8, "y": 0}, + "targets": [ + {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"}, + {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"}, + ], + "fieldConfig": { + "defaults": {"unit": "none"}, + "overrides": [ + { + "matcher": {"id": "byName", "options": "Attempts"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}} + ], + }, + { + "matcher": {"id": "byName", "options": "Failures"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}} + ], + }, + ], + }, + "options": { + "legend": {"displayMode": "table", "placement": "right"}, + "tooltip": {"mode": "multi"}, + }, + } + ) + panels.append( + bargauge_panel( + 3, + "One-off Job Pods (age hours)", + ONEOFF_JOB_POD_AGE_HOURS, + {"h": 7, "w": 8, "x": 16, "y": 0}, + unit="h", + instant=True, + legend="{{namespace}}/{{pod}}", + thresholds=age_thresholds, + limit=12, + decimals=2, + ) + ) + panels.append( + stat_panel( + 4, "Glue Jobs Stale (>36h)", GLUE_STALE_COUNT, - {"h": 4, "w": 6, "x": 0, "y": 0}, + {"h": 4, "w": 4, "x": 0, "y": 7}, unit="none", thresholds={ "mode": "absolute", @@ -2176,64 +2518,164 @@ def build_testing_dashboard(): ) ) panels.append( - table_panel( - 2, - "Glue Jobs Missing Success", - GLUE_MISSING_ACTIVE, - {"h": 4, "w": 6, "x": 6, "y": 0}, - unit="none", - transformations=sort_desc, - instant=True, - ) - ) - panels.append( - table_panel( - 3, - "Glue Jobs Suspended", - GLUE_SUSPENDED, - {"h": 4, "w": 6, "x": 12, "y": 0}, - unit="none", - transformations=sort_desc, - instant=True, - ) - ) - panels.append( - table_panel( - 4, - "Glue Jobs Active Runs", - 
GLUE_ACTIVE, - {"h": 4, "w": 6, "x": 18, "y": 0}, - unit="none", - transformations=sort_desc, - instant=True, - ) - ) - panels.append( - table_panel( + stat_panel( 5, - "Glue Jobs Last Success (hours ago)", - GLUE_LAST_SUCCESS_AGE_HOURS, - {"h": 8, "w": 12, "x": 0, "y": 4}, + "Glue Jobs Missing Success", + GLUE_MISSING_COUNT, + {"h": 4, "w": 4, "x": 4, "y": 7}, + unit="none", + ) + ) + panels.append( + stat_panel( + 6, + "Glue Jobs Suspended", + GLUE_SUSPENDED_COUNT, + {"h": 4, "w": 4, "x": 8, "y": 7}, + unit="none", + ) + ) + panels.append( + stat_panel( + 7, + "Ariadne Task Errors (1h)", + ARIADNE_TASK_ERRORS_1H_TOTAL, + {"h": 4, "w": 4, "x": 12, "y": 7}, + unit="none", + ) + ) + panels.append( + stat_panel( + 8, + "Ariadne Task Errors (24h)", + ARIADNE_TASK_ERRORS_24H_TOTAL, + {"h": 4, "w": 4, "x": 16, "y": 7}, + unit="none", + ) + ) + panels.append( + stat_panel( + 9, + "Ariadne Task Runs (1h)", + ARIADNE_TASK_RUNS_1H_TOTAL, + {"h": 4, "w": 4, "x": 20, "y": 7}, + unit="none", + ) + ) + panels.append( + bargauge_panel( + 10, + "Ariadne Schedule Last Error (hours ago)", + ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS, + {"h": 6, "w": 12, "x": 0, "y": 17}, unit="h", - transformations=sort_desc, instant=True, + legend="{{task}}", + thresholds=recent_error_thresholds, + decimals=2, + ) + ) + panels.append( + bargauge_panel( + 11, + "Ariadne Schedule Last Success (hours ago)", + ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS, + {"h": 6, "w": 12, "x": 12, "y": 17}, + unit="h", + instant=True, + legend="{{task}}", + thresholds=age_thresholds, + decimals=2, + ) + ) + panels.append( + bargauge_panel( + 12, + "Glue Jobs Last Success (hours ago)", + GLUE_LAST_SUCCESS_RANGE_HOURS, + {"h": 6, "w": 12, "x": 0, "y": 23}, + unit="h", + instant=True, + legend="{{namespace}}/{{cronjob}}", + thresholds=age_thresholds, + decimals=2, + ) + ) + panels.append( + bargauge_panel( + 13, + "Glue Jobs Last Schedule (hours ago)", + GLUE_LAST_SCHEDULE_RANGE_HOURS, + {"h": 6, "w": 12, "x": 12, "y": 23}, + unit="h", + instant=True, + legend="{{namespace}}/{{cronjob}}", + thresholds=age_thresholds, + decimals=2, + ) + ) + panels.append( + bargauge_panel( + 14, + "Ariadne Task Errors (1h)", + ARIADNE_TASK_ERRORS_1H, + {"h": 6, "w": 12, "x": 0, "y": 29}, + unit="none", + instant=True, + legend="{{task}}", + thresholds=task_error_thresholds, + ) + ) + panels.append( + bargauge_panel( + 15, + "Ariadne Task Errors (30d)", + ARIADNE_TASK_ERRORS_30D, + {"h": 6, "w": 12, "x": 12, "y": 29}, + unit="none", + instant=True, + legend="{{task}}", + thresholds=task_error_thresholds, + ) + ) + panels.append( + bargauge_panel( + 16, + "Ariadne Access Requests", + ARIADNE_ACCESS_REQUESTS, + {"h": 6, "w": 8, "x": 0, "y": 11}, + unit="none", + instant=True, + legend="{{status}}", + ) + ) + panels.append( + stat_panel( + 17, + "Ariadne CI Coverage (%)", + ARIADNE_CI_COVERAGE, + {"h": 6, "w": 4, "x": 8, "y": 11}, + unit="percent", + decimals=1, + instant=True, + legend="{{branch}}", ) ) panels.append( table_panel( - 6, - "Glue Jobs Last Schedule (hours ago)", - GLUE_LAST_SCHEDULE_AGE_HOURS, - {"h": 8, "w": 12, "x": 12, "y": 4}, - unit="h", - transformations=sort_desc, + 18, + "Ariadne CI Tests (latest)", + ARIADNE_CI_TESTS, + {"h": 6, "w": 12, "x": 12, "y": 11}, + unit="none", + transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}], instant=True, ) ) return { - "uid": "atlas-testing", - "title": "Atlas Testing", + "uid": "atlas-jobs", + "title": "Atlas Jobs", "folderUid": 
PRIVATE_FOLDER, "editable": True, "panels": panels, @@ -2241,7 +2683,7 @@ def build_testing_dashboard(): "annotations": {"list": []}, "schemaVersion": 39, "style": "dark", - "tags": ["atlas", "testing"], + "tags": ["atlas", "jobs", "glue"], } @@ -2274,7 +2716,7 @@ def build_gpu_dashboard(): timeseries_panel( 3, "GPU Util by Node", - 'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})', + gpu_util_by_hostname(), {"h": 8, "w": 12, "x": 0, "y": 8}, unit="percent", legend="{{Hostname}}", @@ -2338,9 +2780,9 @@ DASHBOARDS = { "builder": build_mail_dashboard, "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-mail.yaml", }, - "atlas-testing": { - "builder": build_testing_dashboard, - "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-testing.yaml", + "atlas-jobs": { + "builder": build_jobs_dashboard, + "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-jobs.yaml", }, "atlas-gpu": { "builder": build_gpu_dashboard, diff --git a/scripts/knowledge_render_atlas.py b/scripts/knowledge_render_atlas.py index c7f9f26..1e305cb 100644 --- a/scripts/knowledge_render_atlas.py +++ b/scripts/knowledge_render_atlas.py @@ -20,11 +20,13 @@ import subprocess import sys from dataclasses import dataclass from pathlib import Path +import shutil from typing import Any, Iterable import yaml REPO_ROOT = Path(__file__).resolve().parents[1] +DASHBOARD_DIR = REPO_ROOT / "services" / "monitoring" / "dashboards" CLUSTER_SCOPED_KINDS = { "Namespace", @@ -60,6 +62,70 @@ def _run(cmd: list[str], *, cwd: Path) -> str: return res.stdout +def _sync_tree(source: Path, dest: Path) -> None: + if dest.exists(): + shutil.rmtree(dest) + shutil.copytree(source, dest) + + +def _iter_dashboard_panels(dashboard: dict[str, Any]) -> Iterable[dict[str, Any]]: + panels = dashboard.get("panels") if isinstance(dashboard.get("panels"), list) else [] + for panel in panels: + if not isinstance(panel, dict): + continue + if panel.get("type") == "row" and isinstance(panel.get("panels"), list): + yield from _iter_dashboard_panels({"panels": panel.get("panels")}) + continue + yield panel + + +def _extract_metrics_index(dashboard_dir: Path) -> list[dict[str, Any]]: + index: list[dict[str, Any]] = [] + for path in sorted(dashboard_dir.glob("*.json")): + try: + data = json.loads(path.read_text(encoding="utf-8")) + except json.JSONDecodeError: + continue + if not isinstance(data, dict): + continue + dash_title = data.get("title") or path.stem + dash_tags = data.get("tags") or [] + for panel in _iter_dashboard_panels(data): + targets = panel.get("targets") + if not isinstance(targets, list): + continue + exprs: list[str] = [] + for target in targets: + if not isinstance(target, dict): + continue + expr = target.get("expr") + if isinstance(expr, str) and expr.strip(): + exprs.append(expr.strip()) + if not exprs: + continue + datasource = panel.get("datasource") or {} + if isinstance(datasource, dict): + ds_uid = datasource.get("uid") + ds_type = datasource.get("type") + else: + ds_uid = None + ds_type = None + index.append( + { + "dashboard": dash_title, + "panel_title": panel.get("title") or "", + "panel_id": panel.get("id"), + "panel_type": panel.get("type"), + "description": panel.get("description") or "", + "tags": dash_tags, + "datasource_uid": ds_uid, + "datasource_type": ds_type, + "exprs": exprs, + } + ) + return index + + def kustomize_build(path: Path) -> str: rel = path.relative_to(REPO_ROOT) try: @@ -472,6 +538,11 @@ def main() -> int: action="store_true", help="Write generated files (otherwise just 
print a summary).", ) + ap.add_argument( + "--sync-comms", + action="store_true", + help="Mirror rendered knowledge into services/comms/knowledge for atlasbot.", + ) args = ap.parse_args() out_dir = REPO_ROOT / args.out @@ -504,6 +575,7 @@ def main() -> int: summary_path = out_dir / "catalog" / "atlas-summary.json" diagram_path = out_dir / "diagrams" / "atlas-http.mmd" runbooks_json_path = out_dir / "catalog" / "runbooks.json" + metrics_json_path = out_dir / "catalog" / "metrics.json" catalog_rel = catalog_path.relative_to(REPO_ROOT).as_posix() catalog_path.write_text( @@ -517,9 +589,14 @@ def main() -> int: diagram_path.write_text(diagram, encoding="utf-8") # Render runbooks into JSON for lightweight, dependency-free consumption in-cluster. - runbooks_dir = out_dir / "runbooks" + runbook_dirs = [ + out_dir / "runbooks", + out_dir / "software", + ] runbooks: list[dict[str, Any]] = [] - if runbooks_dir.exists(): + for runbooks_dir in runbook_dirs: + if not runbooks_dir.exists(): + continue for md_file in sorted(runbooks_dir.glob("*.md")): raw = md_file.read_text(encoding="utf-8") fm: dict[str, Any] = {} @@ -543,12 +620,22 @@ def main() -> int: } ) runbooks_json_path.write_text(json.dumps(runbooks, indent=2, sort_keys=False) + "\n", encoding="utf-8") + metrics_index = _extract_metrics_index(DASHBOARD_DIR) + metrics_json_path.write_text( + json.dumps(metrics_index, indent=2, sort_keys=False) + "\n", encoding="utf-8" + ) print(f"Wrote {catalog_path.relative_to(REPO_ROOT)}") print(f"Wrote {catalog_json_path.relative_to(REPO_ROOT)}") print(f"Wrote {summary_path.relative_to(REPO_ROOT)}") print(f"Wrote {diagram_path.relative_to(REPO_ROOT)}") print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}") + print(f"Wrote {metrics_json_path.relative_to(REPO_ROOT)}") + + if args.sync_comms: + comms_dir = REPO_ROOT / "services" / "comms" / "knowledge" + _sync_tree(out_dir, comms_dir) + print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {comms_dir.relative_to(REPO_ROOT)}") return 0 diff --git a/services/ai-llm/deployment.yaml b/services/ai-llm/deployment.yaml index fa35440..bf012c0 100644 --- a/services/ai-llm/deployment.yaml +++ b/services/ai-llm/deployment.yaml @@ -20,8 +20,9 @@ spec: labels: app: ollama annotations: - ai.bstein.dev/model: qwen2.5-coder:7b-instruct-q4_0 - ai.bstein.dev/gpu: GPU pool (titan-20/21/22/24) + ai.bstein.dev/model: qwen2.5:14b-instruct-q4_0 + ai.bstein.dev/gpu: GPU pool (titan-22/24) + ai.bstein.dev/restartedAt: "2026-01-26T12:00:00Z" spec: affinity: nodeAffinity: @@ -31,8 +32,6 @@ spec: - key: kubernetes.io/hostname operator: In values: - - titan-20 - - titan-21 - titan-22 - titan-24 runtimeClassName: nvidia @@ -53,7 +52,7 @@ spec: - name: OLLAMA_MODELS value: /root/.ollama - name: OLLAMA_MODEL - value: qwen2.5-coder:7b-instruct-q4_0 + value: qwen2.5:14b-instruct-q4_0 command: - /bin/sh - -c @@ -68,8 +67,8 @@ spec: mountPath: /root/.ollama resources: requests: - cpu: 250m - memory: 1Gi + cpu: 500m + memory: 2Gi nvidia.com/gpu.shared: 1 limits: nvidia.com/gpu.shared: 1 @@ -96,10 +95,10 @@ spec: mountPath: /root/.ollama resources: requests: - cpu: "2" - memory: 8Gi + cpu: "4" + memory: 16Gi nvidia.com/gpu.shared: 1 limits: - cpu: "4" - memory: 12Gi + cpu: "8" + memory: 24Gi nvidia.com/gpu.shared: 1 diff --git a/services/bstein-dev-home/backend-deployment.yaml b/services/bstein-dev-home/backend-deployment.yaml index 376622c..ba7d6f8 100644 --- a/services/bstein-dev-home/backend-deployment.yaml +++ b/services/bstein-dev-home/backend-deployment.yaml @@ -28,6 +28,7 @@ spec: {{ with 
secret "kv/data/atlas/shared/chat-ai-keys-runtime" }} export CHAT_KEY_MATRIX="{{ .Data.data.matrix }}" export CHAT_KEY_HOMEPAGE="{{ .Data.data.homepage }}" + export AI_ATLASBOT_TOKEN="{{ .Data.data.homepage }}" {{ end }} {{ with secret "kv/data/atlas/shared/portal-e2e-client" }} export PORTAL_E2E_CLIENT_ID="{{ .Data.data.client_id }}" @@ -58,14 +59,18 @@ spec: args: - >- . /vault/secrets/portal-env.sh - && exec gunicorn -b 0.0.0.0:8080 --workers 2 --timeout 180 app:app + && exec gunicorn -b 0.0.0.0:8080 --workers 2 --timeout 600 app:app env: - name: AI_CHAT_API value: http://ollama.ai.svc.cluster.local:11434 - name: AI_CHAT_MODEL value: qwen2.5-coder:7b-instruct-q4_0 - name: AI_CHAT_TIMEOUT_SEC - value: "60" + value: "480" + - name: AI_ATLASBOT_ENDPOINT + value: http://atlasbot.comms.svc.cluster.local:8090/v1/answer + - name: AI_ATLASBOT_TIMEOUT_SEC + value: "30" - name: AI_NODE_NAME valueFrom: fieldRef: @@ -91,10 +96,28 @@ spec: value: atlas - name: KEYCLOAK_ADMIN_CLIENT_ID value: bstein-dev-home-admin + - name: ARIADNE_URL + value: http://ariadne.maintenance.svc.cluster.local + - name: ARIADNE_TIMEOUT_SEC + value: "10" - name: ACCOUNT_ALLOWED_GROUPS value: "" - name: HTTP_CHECK_TIMEOUT_SEC value: "2" + - name: PORTAL_DB_POOL_MIN + value: "0" + - name: PORTAL_DB_POOL_MAX + value: "5" + - name: PORTAL_DB_CONNECT_TIMEOUT_SEC + value: "5" + - name: PORTAL_DB_LOCK_TIMEOUT_SEC + value: "5" + - name: PORTAL_DB_STATEMENT_TIMEOUT_SEC + value: "30" + - name: PORTAL_DB_IDLE_IN_TX_TIMEOUT_SEC + value: "10" + - name: PORTAL_RUN_MIGRATIONS + value: "false" - name: ACCESS_REQUEST_SUBMIT_RATE_LIMIT value: "30" - name: ACCESS_REQUEST_SUBMIT_RATE_WINDOW_SEC diff --git a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml index 40d74fe..e572406 100644 --- a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml +++ b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml @@ -47,6 +47,8 @@ spec: env: - name: UPSTREAM_URL value: http://bstein-dev-home-backend/api/chat + - name: UPSTREAM_TIMEOUT_SEC + value: "600" ports: - name: http containerPort: 8080 @@ -65,10 +67,10 @@ spec: resources: requests: cpu: 20m - memory: 64Mi + memory: 128Mi limits: cpu: 200m - memory: 256Mi + memory: 512Mi volumeMounts: - name: code mountPath: /app/gateway.py diff --git a/services/bstein-dev-home/image.yaml b/services/bstein-dev-home/image.yaml index 3b6c757..eed2736 100644 --- a/services/bstein-dev-home/image.yaml +++ b/services/bstein-dev-home/image.yaml @@ -7,6 +7,8 @@ metadata: spec: image: registry.bstein.dev/bstein/bstein-dev-home-frontend interval: 1m0s + secretRef: + name: harbor-regcred --- apiVersion: image.toolkit.fluxcd.io/v1beta2 kind: ImagePolicy @@ -28,6 +30,8 @@ metadata: spec: image: registry.bstein.dev/bstein/bstein-dev-home-backend interval: 1m0s + secretRef: + name: harbor-regcred --- apiVersion: image.toolkit.fluxcd.io/v1beta2 kind: ImagePolicy diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml index f9d3c87..f62fb17 100644 --- a/services/bstein-dev-home/kustomization.yaml +++ b/services/bstein-dev-home/kustomization.yaml @@ -16,13 +16,13 @@ resources: - backend-deployment.yaml - backend-service.yaml - vaultwarden-cred-sync-cronjob.yaml - - portal-onboarding-e2e-test-job.yaml + - oneoffs/portal-onboarding-e2e-test-job.yaml - ingress.yaml images: - name: registry.bstein.dev/bstein/bstein-dev-home-frontend - newTag: 0.1.1-102 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"} + 
newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} - name: registry.bstein.dev/bstein/bstein-dev-home-backend - newTag: 0.1.1-103 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"} + newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} configMapGenerator: - name: chat-ai-gateway namespace: bstein-dev-home diff --git a/services/bstein-dev-home/oneoffs/migrations/kustomization.yaml b/services/bstein-dev-home/oneoffs/migrations/kustomization.yaml new file mode 100644 index 0000000..1d1dfc8 --- /dev/null +++ b/services/bstein-dev-home/oneoffs/migrations/kustomization.yaml @@ -0,0 +1,6 @@ +# services/bstein-dev-home/oneoffs/migrations/kustomization.yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: bstein-dev-home +resources: + - portal-migrate-job.yaml diff --git a/services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml b/services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml new file mode 100644 index 0000000..1f7e092 --- /dev/null +++ b/services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml @@ -0,0 +1,48 @@ +# services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml +# One-off job for bstein-dev-home/bstein-dev-home-portal-migrate-36. +# Purpose: bstein dev home portal migrate 36 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. +apiVersion: batch/v1 +kind: Job +metadata: + name: bstein-dev-home-portal-migrate-36 + namespace: bstein-dev-home + annotations: + kustomize.toolkit.fluxcd.io/force: "true" +spec: + suspend: true + backoffLimit: 1 + ttlSecondsAfterFinished: 3600 + template: + metadata: + labels: + app: bstein-dev-home-portal-migrate + annotations: + vault.hashicorp.com/agent-inject: "true" + vault.hashicorp.com/role: "bstein-dev-home" + vault.hashicorp.com/agent-inject-secret-portal-env.sh: "kv/data/atlas/portal/atlas-portal-db" + vault.hashicorp.com/agent-inject-template-portal-env.sh: | + {{ with secret "kv/data/atlas/portal/atlas-portal-db" }} + export PORTAL_DATABASE_URL="{{ .Data.data.PORTAL_DATABASE_URL }}" + {{ end }} + spec: + serviceAccountName: bstein-dev-home + restartPolicy: Never + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" + imagePullSecrets: + - name: harbor-regcred + containers: + - name: migrate + image: registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-95 + imagePullPolicy: Always + command: ["/bin/sh", "-c"] + args: + - >- + . /vault/secrets/portal-env.sh + && exec python -m atlas_portal.migrate + env: + - name: PORTAL_RUN_MIGRATIONS + value: "true" diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml similarity index 88% rename from services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml rename to services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml index f22272e..9923499 100644 --- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml +++ b/services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml @@ -1,10 +1,15 @@ -# services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml +# services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml +# One-off job for bstein-dev-home/portal-onboarding-e2e-test-27. +# Purpose: portal onboarding e2e test 27 (see container args/env in this file). 
+# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: - name: portal-onboarding-e2e-test-19 + name: portal-onboarding-e2e-test-27 namespace: bstein-dev-home spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/bstein-dev-home/scripts/gateway.py b/services/bstein-dev-home/scripts/gateway.py index 3ca2fa1..19d3606 100644 --- a/services/bstein-dev-home/scripts/gateway.py +++ b/services/bstein-dev-home/scripts/gateway.py @@ -6,6 +6,7 @@ from urllib import request, error UPSTREAM = os.environ.get("UPSTREAM_URL", "http://bstein-dev-home-backend/api/chat") KEY_MATRIX = os.environ.get("CHAT_KEY_MATRIX", "") KEY_HOMEPAGE = os.environ.get("CHAT_KEY_HOMEPAGE", "") +UPSTREAM_TIMEOUT_SEC = float(os.environ.get("UPSTREAM_TIMEOUT_SEC", "90")) ALLOWED = {k for k in (KEY_MATRIX, KEY_HOMEPAGE) if k} @@ -41,7 +42,7 @@ class Handler(BaseHTTPRequestHandler): headers={"Content-Type": "application/json"}, method="POST", ) - with request.urlopen(upstream_req, timeout=90) as resp: + with request.urlopen(upstream_req, timeout=UPSTREAM_TIMEOUT_SEC) as resp: data = resp.read() self.send_response(resp.status) for k, v in resp.headers.items(): diff --git a/services/bstein-dev-home/secretproviderclass.yaml b/services/bstein-dev-home/secretproviderclass.yaml index f330fe6..2fa714a 100644 --- a/services/bstein-dev-home/secretproviderclass.yaml +++ b/services/bstein-dev-home/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "bstein-dev-home" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/bstein-dev-home" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml b/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml index 29141fe..acd851b 100644 --- a/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml +++ b/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml @@ -8,6 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "*/15 * * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 4618053..b65aef0 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-4 + checksum/atlasbot-configmap: manual-atlasbot-101 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" @@ -73,12 +73,33 @@ spec: value: /kb - name: VM_URL value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428 + - name: ARIADNE_STATE_URL + value: http://ariadne.maintenance.svc.cluster.local/api/internal/cluster/state - name: BOT_USER value: atlasbot + - name: BOT_MENTIONS + value: atlasbot,aatlasbot,atlas_quick,atlas_smart - name: OLLAMA_URL - value: https://chat.ai.bstein.dev/ + value: http://ollama.ai.svc.cluster.local:11434 - name: OLLAMA_MODEL - value: qwen2.5-coder:7b-instruct-q4_0 + value: qwen2.5:14b-instruct + - name: ATLASBOT_MODEL_FAST + value: qwen2.5:14b-instruct-q4_0 + - name: ATLASBOT_MODEL_DEEP + value: qwen2.5:14b-instruct + - name: 
OLLAMA_FALLBACK_MODEL + value: qwen2.5:14b-instruct-q4_0 + - name: OLLAMA_TIMEOUT_SEC + value: "600" + - name: ATLASBOT_THINKING_INTERVAL_SEC + value: "120" + - name: ATLASBOT_SNAPSHOT_TTL_SEC + value: "30" + - name: ATLASBOT_HTTP_PORT + value: "8090" + ports: + - name: http + containerPort: 8090 resources: requests: cpu: 100m @@ -110,6 +131,8 @@ spec: path: catalog/atlas.json - key: atlas-summary.json path: catalog/atlas-summary.json + - key: metrics.json + path: catalog/metrics.json - key: runbooks.json path: catalog/runbooks.json - key: atlas-http.mmd diff --git a/services/comms/atlasbot-service.yaml b/services/comms/atlasbot-service.yaml new file mode 100644 index 0000000..c8b3570 --- /dev/null +++ b/services/comms/atlasbot-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: atlasbot + namespace: comms + labels: + app: atlasbot +spec: + selector: + app: atlasbot + ports: + - name: http + port: 8090 + targetPort: 8090 + type: ClusterIP diff --git a/services/comms/guest-name-job.yaml b/services/comms/guest-name-job.yaml index 21a8af5..3eae2dd 100644 --- a/services/comms/guest-name-job.yaml +++ b/services/comms/guest-name-job.yaml @@ -8,7 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "*/1 * * * *" - suspend: false + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 1 diff --git a/services/comms/helmrelease.yaml b/services/comms/helmrelease.yaml index 4456348..eeac49e 100644 --- a/services/comms/helmrelease.yaml +++ b/services/comms/helmrelease.yaml @@ -140,6 +140,7 @@ spec: autocreate_auto_join_rooms: true default_room_version: "11" experimental_features: + msc4108_enabled: true msc3266_enabled: true msc4143_enabled: true msc4222_enabled: true diff --git a/services/comms/knowledge/catalog/atlas-summary.json b/services/comms/knowledge/catalog/atlas-summary.json index fa35051..ea825ce 100644 --- a/services/comms/knowledge/catalog/atlas-summary.json +++ b/services/comms/knowledge/catalog/atlas-summary.json @@ -1,8 +1,8 @@ { "counts": { - "helmrelease_host_hints": 17, - "http_endpoints": 37, - "services": 43, - "workloads": 54 + "helmrelease_host_hints": 19, + "http_endpoints": 45, + "services": 47, + "workloads": 74 } } diff --git a/services/comms/knowledge/catalog/atlas.json b/services/comms/knowledge/catalog/atlas.json index 0d97bcd..951c807 100644 --- a/services/comms/knowledge/catalog/atlas.json +++ b/services/comms/knowledge/catalog/atlas.json @@ -11,6 +11,21 @@ "path": "services/bstein-dev-home", "targetNamespace": "bstein-dev-home" }, + { + "name": "bstein-dev-home-migrations", + "path": "services/bstein-dev-home/migrations", + "targetNamespace": "bstein-dev-home" + }, + { + "name": "cert-manager", + "path": "infrastructure/cert-manager", + "targetNamespace": "cert-manager" + }, + { + "name": "cert-manager-cleanup", + "path": "infrastructure/cert-manager/cleanup", + "targetNamespace": "cert-manager" + }, { "name": "comms", "path": "services/comms", @@ -26,6 +41,11 @@ "path": "services/crypto", "targetNamespace": "crypto" }, + { + "name": "finance", + "path": "services/finance", + "targetNamespace": "finance" + }, { "name": "flux-system", "path": "clusters/atlas/flux-system", @@ -46,6 +66,11 @@ "path": "services/harbor", "targetNamespace": "harbor" }, + { + "name": "health", + "path": "services/health", + "targetNamespace": "health" + }, { "name": "helm", "path": "infrastructure/sources/helm", @@ -71,6 +96,16 @@ "path": "services/logging", "targetNamespace": null }, + { + "name": "longhorn", + 
"path": "infrastructure/longhorn/core", + "targetNamespace": "longhorn-system" + }, + { + "name": "longhorn-adopt", + "path": "infrastructure/longhorn/adopt", + "targetNamespace": "longhorn-system" + }, { "name": "longhorn-ui", "path": "infrastructure/longhorn/ui-ingress", @@ -161,11 +196,21 @@ "path": "infrastructure/vault-csi", "targetNamespace": "kube-system" }, + { + "name": "vault-injector", + "path": "infrastructure/vault-injector", + "targetNamespace": "vault" + }, { "name": "vaultwarden", "path": "services/vaultwarden", "targetNamespace": "vaultwarden" }, + { + "name": "wallet-monero-temp", + "path": "services/crypto/wallet-monero-temp", + "targetNamespace": "crypto" + }, { "name": "xmr-miner", "path": "services/crypto/xmr-miner", @@ -199,7 +244,7 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92" + "registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-157" ] }, { @@ -215,7 +260,20 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92" + "registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-157" + ] + }, + { + "kind": "Deployment", + "namespace": "bstein-dev-home", + "name": "bstein-dev-home-vault-sync", + "labels": { + "app": "bstein-dev-home-vault-sync" + }, + "serviceAccountName": "bstein-dev-home-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" ] }, { @@ -225,7 +283,7 @@ "labels": { "app": "chat-ai-gateway" }, - "serviceAccountName": null, + "serviceAccountName": "bstein-dev-home", "nodeSelector": { "kubernetes.io/arch": "arm64", "node-role.kubernetes.io/worker": "true" @@ -249,6 +307,19 @@ "python:3.11-slim" ] }, + { + "kind": "Deployment", + "namespace": "comms", + "name": "comms-vault-sync", + "labels": { + "app": "comms-vault-sync" + }, + "serviceAccountName": "comms-vault", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "comms", @@ -256,7 +327,7 @@ "labels": { "app": "coturn" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -286,7 +357,7 @@ "labels": { "app": "livekit" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -301,12 +372,12 @@ "labels": { "app": "livekit-token-service" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, "images": [ - "ghcr.io/element-hq/lk-jwt-service:0.3.0" + "registry.bstein.dev/tools/lk-jwt-service-vault:0.3.0" ] }, { @@ -316,7 +387,7 @@ "labels": { "app": "matrix-authentication-service" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -331,7 +402,7 @@ "labels": { "app.kubernetes.io/name": "matrix-guest-register" }, - "serviceAccountName": null, + "serviceAccountName": "comms-vault", "nodeSelector": {}, "images": [ "python:3.11-slim" @@ -365,6 +436,19 @@ "ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9" ] }, + { + "kind": "Deployment", + "namespace": "crypto", + "name": "crypto-vault-sync", + "labels": { + "app": "crypto-vault-sync" + }, + "serviceAccountName": "crypto-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "crypto", @@ -372,7 +456,7 @@ "labels": { "app": "monero-p2pool" }, - "serviceAccountName": null, + "serviceAccountName": "crypto-vault-sync", 
"nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -395,6 +479,53 @@ "registry.bstein.dev/crypto/monerod:0.18.4.1" ] }, + { + "kind": "Deployment", + "namespace": "crypto", + "name": "wallet-monero-temp", + "labels": { + "app": "wallet-monero-temp" + }, + "serviceAccountName": "crypto-vault-sync", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1" + ] + }, + { + "kind": "Deployment", + "namespace": "finance", + "name": "actual-budget", + "labels": { + "app": "actual-budget" + }, + "serviceAccountName": "finance-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "actualbudget/actual-server:26.1.0-alpine@sha256:34aae5813fdfee12af2a50c4d0667df68029f1d61b90f45f282473273eb70d0d" + ] + }, + { + "kind": "Deployment", + "namespace": "finance", + "name": "firefly", + "labels": { + "app": "firefly" + }, + "serviceAccountName": "finance-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "fireflyiii/core:version-6.4.15" + ] + }, { "kind": "Deployment", "namespace": "flux-system", @@ -516,7 +647,7 @@ "labels": { "app": "gitea" }, - "serviceAccountName": null, + "serviceAccountName": "gitea-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -524,6 +655,36 @@ "gitea/gitea:1.23" ] }, + { + "kind": "Deployment", + "namespace": "harbor", + "name": "harbor-vault-sync", + "labels": { + "app": "harbor-vault-sync" + }, + "serviceAccountName": "harbor-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "Deployment", + "namespace": "health", + "name": "wger", + "labels": { + "app": "wger" + }, + "serviceAccountName": "health-vault-sync", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "nginx:1.27.5-alpine@sha256:65645c7bb6a0661892a8b03b89d0743208a18dd2f3f17a54ef4b76fb8e2f2a10", + "wger/server@sha256:710588b78af4e0aa0b4d8a8061e4563e16eae80eeaccfe7f9e0d9cbdd7f0cbc5" + ] + }, { "kind": "Deployment", "namespace": "jellyfin", @@ -531,7 +692,7 @@ "labels": { "app": "jellyfin" }, - "serviceAccountName": null, + "serviceAccountName": "pegasus-vault-sync", "nodeSelector": {}, "images": [ "docker.io/jellyfin/jellyfin:10.11.5" @@ -544,14 +705,27 @@ "labels": { "app": "pegasus" }, - "serviceAccountName": null, + "serviceAccountName": "pegasus-vault-sync", "nodeSelector": { "kubernetes.io/arch": "arm64", "node-role.kubernetes.io/worker": "true" }, "images": [ "alpine:3.20", - "registry.bstein.dev/streaming/pegasus:1.2.32" + "registry.bstein.dev/streaming/pegasus-vault:1.2.32" + ] + }, + { + "kind": "Deployment", + "namespace": "jellyfin", + "name": "pegasus-vault-sync", + "labels": { + "app": "pegasus-vault-sync" + }, + "serviceAccountName": "pegasus-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" ] }, { @@ -570,6 +744,35 @@ "jenkins/jenkins:2.528.3-jdk21" ] }, + { + "kind": "Deployment", + "namespace": "jenkins", + "name": "jenkins-vault-sync", + "labels": { + "app": "jenkins-vault-sync" + }, + "serviceAccountName": "jenkins-vault-sync", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "alpine:3.20" + ] + }, + { + "kind": "DaemonSet", + "namespace": "kube-system", + "name": "ntp-sync", + "labels": { + "app": "ntp-sync" + }, + "serviceAccountName": null, + "nodeSelector": 
{}, + "images": [ + "public.ecr.aws/docker/library/busybox:1.36.1" + ] + }, { "kind": "DaemonSet", "namespace": "kube-system", @@ -636,6 +839,21 @@ "hashicorp/vault-csi-provider:1.7.0" ] }, + { + "kind": "Deployment", + "namespace": "kube-system", + "name": "coredns", + "labels": { + "k8s-app": "kube-dns" + }, + "serviceAccountName": "coredns", + "nodeSelector": { + "kubernetes.io/os": "linux" + }, + "images": [ + "registry.bstein.dev/infra/coredns:1.12.1" + ] + }, { "kind": "DaemonSet", "namespace": "logging", @@ -681,6 +899,19 @@ "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" ] }, + { + "kind": "Deployment", + "namespace": "logging", + "name": "logging-vault-sync", + "labels": { + "app": "logging-vault-sync" + }, + "serviceAccountName": "logging-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "logging", @@ -688,12 +919,27 @@ "labels": { "app": "oauth2-proxy-logs" }, - "serviceAccountName": null, + "serviceAccountName": "logging-vault-sync", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, "images": [ - "quay.io/oauth2-proxy/oauth2-proxy:v7.6.0" + "registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0" + ] + }, + { + "kind": "Deployment", + "namespace": "longhorn-system", + "name": "longhorn-vault-sync", + "labels": { + "app": "longhorn-vault-sync" + }, + "serviceAccountName": "longhorn-vault-sync", + "nodeSelector": { + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "alpine:3.20" ] }, { @@ -703,7 +949,7 @@ "labels": { "app": "oauth2-proxy-longhorn" }, - "serviceAccountName": null, + "serviceAccountName": "longhorn-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -729,14 +975,45 @@ { "kind": "Deployment", "namespace": "mailu-mailserver", - "name": "mailu-sync-listener", + "name": "mailu-vault-sync", "labels": { - "app": "mailu-sync-listener" + "app": "mailu-vault-sync" }, - "serviceAccountName": null, + "serviceAccountName": "mailu-vault-sync", "nodeSelector": {}, "images": [ - "python:3.11-alpine" + "alpine:3.20" + ] + }, + { + "kind": "DaemonSet", + "namespace": "maintenance", + "name": "disable-k3s-traefik", + "labels": { + "app": "disable-k3s-traefik" + }, + "serviceAccountName": "disable-k3s-traefik", + "nodeSelector": { + "node-role.kubernetes.io/control-plane": "true" + }, + "images": [ + "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" + ] + }, + { + "kind": "DaemonSet", + "namespace": "maintenance", + "name": "k3s-agent-restart", + "labels": { + "app": "k3s-agent-restart" + }, + "serviceAccountName": "node-nofile", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" ] }, { @@ -767,6 +1044,35 @@ "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131" ] }, + { + "kind": "Deployment", + "namespace": "maintenance", + "name": "ariadne", + "labels": { + "app": "ariadne" + }, + "serviceAccountName": "ariadne", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, + "images": [ + "registry.bstein.dev/bstein/ariadne:0.1.0-49" + ] + }, + { + "kind": "Deployment", + "namespace": "maintenance", + "name": "maintenance-vault-sync", + "labels": { + "app": "maintenance-vault-sync" + }, + "serviceAccountName": "maintenance-vault-sync", + "nodeSelector": {}, + "images": [ 
+ "alpine:3.20" + ] + }, { "kind": "DaemonSet", "namespace": "monitoring", @@ -795,6 +1101,19 @@ "python:3.10-slim" ] }, + { + "kind": "Deployment", + "namespace": "monitoring", + "name": "monitoring-vault-sync", + "labels": { + "app": "monitoring-vault-sync" + }, + "serviceAccountName": "monitoring-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" + ] + }, { "kind": "Deployment", "namespace": "monitoring", @@ -802,7 +1121,7 @@ "labels": { "app": "postmark-exporter" }, - "serviceAccountName": null, + "serviceAccountName": "monitoring-vault-sync", "nodeSelector": {}, "images": [ "python:3.12-alpine" @@ -830,7 +1149,7 @@ "labels": { "app": "nextcloud" }, - "serviceAccountName": null, + "serviceAccountName": "nextcloud-vault", "nodeSelector": { "hardware": "rpi5" }, @@ -845,7 +1164,7 @@ "labels": { "app": "outline" }, - "serviceAccountName": null, + "serviceAccountName": "outline-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -875,7 +1194,7 @@ "labels": { "app": "planka" }, - "serviceAccountName": null, + "serviceAccountName": "planka-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, @@ -895,7 +1214,8 @@ "node-role.kubernetes.io/worker": "true" }, "images": [ - "postgres:15" + "postgres:15", + "quay.io/prometheuscommunity/postgres-exporter:v0.15.0" ] }, { @@ -905,8 +1225,11 @@ "labels": { "app": "keycloak" }, - "serviceAccountName": null, - "nodeSelector": {}, + "serviceAccountName": "sso-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, "images": [ "quay.io/keycloak/keycloak:26.0.7" ] @@ -918,12 +1241,25 @@ "labels": { "app": "oauth2-proxy" }, - "serviceAccountName": null, + "serviceAccountName": "sso-vault", "nodeSelector": { "node-role.kubernetes.io/worker": "true" }, "images": [ - "quay.io/oauth2-proxy/oauth2-proxy:v7.6.0" + "registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0" + ] + }, + { + "kind": "Deployment", + "namespace": "sso", + "name": "sso-vault-sync", + "labels": { + "app": "sso-vault-sync" + }, + "serviceAccountName": "sso-vault-sync", + "nodeSelector": {}, + "images": [ + "alpine:3.20" ] }, { @@ -933,7 +1269,7 @@ "labels": { "app": "openldap" }, - "serviceAccountName": null, + "serviceAccountName": "sso-vault", "nodeSelector": { "kubernetes.io/arch": "arm64", "node-role.kubernetes.io/worker": "true" @@ -951,7 +1287,7 @@ }, "serviceAccountName": "sui-metrics", "nodeSelector": { - "kubernetes.io/hostname": "titan-24" + "hardware": "rpi5" }, "images": [ "victoriametrics/vmagent:v1.103.0" @@ -962,7 +1298,9 @@ "namespace": "traefik", "name": "traefik", "labels": { - "app": "traefik" + "app": "traefik", + "app.kubernetes.io/instance": "traefik-kube-system", + "app.kubernetes.io/name": "traefik" }, "serviceAccountName": "traefik-ingress-controller", "nodeSelector": { @@ -995,10 +1333,13 @@ "labels": { "app": "vaultwarden" }, - "serviceAccountName": null, - "nodeSelector": {}, + "serviceAccountName": "vaultwarden-vault", + "nodeSelector": { + "kubernetes.io/arch": "arm64", + "node-role.kubernetes.io/worker": "true" + }, "images": [ - "vaultwarden/server:1.33.2" + "vaultwarden/server:1.35.2" ] } ], @@ -1565,6 +1906,54 @@ } ] }, + { + "namespace": "crypto", + "name": "wallet-monero-temp", + "type": "ClusterIP", + "selector": { + "app": "wallet-monero-temp" + }, + "ports": [ + { + "name": "rpc", + "port": 18083, + "targetPort": 18083, + "protocol": "TCP" + } + ] + }, + { + "namespace": "finance", + "name": "actual-budget", + "type": "ClusterIP", + "selector": { + 
"app": "actual-budget" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 5006, + "protocol": "TCP" + } + ] + }, + { + "namespace": "finance", + "name": "firefly", + "type": "ClusterIP", + "selector": { + "app": "firefly" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": 8080, + "protocol": "TCP" + } + ] + }, { "namespace": "flux-system", "name": "notification-controller", @@ -1632,7 +2021,7 @@ { "namespace": "gitea", "name": "gitea-ssh", - "type": "NodePort", + "type": "LoadBalancer", "selector": { "app": "gitea" }, @@ -1645,6 +2034,22 @@ } ] }, + { + "namespace": "health", + "name": "wger", + "type": "ClusterIP", + "selector": { + "app": "wger" + }, + "ports": [ + { + "name": "http", + "port": 80, + "targetPort": "http", + "protocol": "TCP" + } + ] + }, { "namespace": "jellyfin", "name": "jellyfin", @@ -1699,29 +2104,6 @@ } ] }, - { - "namespace": "kube-system", - "name": "traefik", - "type": "LoadBalancer", - "selector": { - "app.kubernetes.io/instance": "traefik-kube-system", - "app.kubernetes.io/name": "traefik" - }, - "ports": [ - { - "name": "web", - "port": 80, - "targetPort": "web", - "protocol": "TCP" - }, - { - "name": "websecure", - "port": 443, - "targetPort": "websecure", - "protocol": "TCP" - } - ] - }, { "namespace": "logging", "name": "oauth2-proxy-logs", @@ -1803,17 +2185,17 @@ ] }, { - "namespace": "mailu-mailserver", - "name": "mailu-sync-listener", + "namespace": "maintenance", + "name": "ariadne", "type": "ClusterIP", "selector": { - "app": "mailu-sync-listener" + "app": "ariadne" }, "ports": [ { "name": "http", - "port": 8080, - "targetPort": 8080, + "port": 80, + "targetPort": "http", "protocol": "TCP" } ] @@ -1959,6 +2341,12 @@ "port": 5432, "targetPort": 5432, "protocol": "TCP" + }, + { + "name": "metrics", + "port": 9187, + "targetPort": 9187, + "protocol": "TCP" } ] }, @@ -2032,6 +2420,28 @@ } ] }, + { + "namespace": "traefik", + "name": "traefik", + "type": "LoadBalancer", + "selector": { + "app": "traefik" + }, + "ports": [ + { + "name": "web", + "port": 80, + "targetPort": "web", + "protocol": "TCP" + }, + { + "name": "websecure", + "port": 443, + "targetPort": "websecure", + "protocol": "TCP" + } + ] + }, { "namespace": "traefik", "name": "traefik-metrics", @@ -2210,6 +2620,26 @@ "source": "bstein-dev-home" } }, + { + "host": "budget.bstein.dev", + "path": "/", + "backend": { + "namespace": "finance", + "service": "actual-budget", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "actual-budget" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "actual-budget", + "source": "finance" + } + }, { "host": "call.live.bstein.dev", "path": "/", @@ -2290,6 +2720,26 @@ "source": "nextcloud" } }, + { + "host": "health.bstein.dev", + "path": "/", + "backend": { + "namespace": "health", + "service": "wger", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "wger" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "wger", + "source": "health" + } + }, { "host": "kit.live.bstein.dev", "path": "/livekit/jwt", @@ -2385,6 +2835,106 @@ "source": "comms" } }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/r0/register", + "backend": { + "namespace": "comms", + "service": "matrix-guest-register", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-guest-register" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/login", + "backend": { + 
"namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/logout", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/refresh", + "backend": { + "namespace": "comms", + "service": "matrix-authentication-service", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-authentication-service" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, + { + "host": "live.bstein.dev", + "path": "/_matrix/client/v3/register", + "backend": { + "namespace": "comms", + "service": "matrix-guest-register", + "port": 8080, + "workloads": [ + { + "kind": "Deployment", + "name": "matrix-guest-register" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "matrix-routing", + "source": "comms" + } + }, { "host": "logs.bstein.dev", "path": "/", @@ -2650,6 +3200,26 @@ "source": "monerod" } }, + { + "host": "money.bstein.dev", + "path": "/", + "backend": { + "namespace": "finance", + "service": "firefly", + "port": 80, + "workloads": [ + { + "kind": "Deployment", + "name": "firefly" + } + ] + }, + "via": { + "kind": "Ingress", + "name": "firefly", + "source": "finance" + } + }, { "host": "notes.bstein.dev", "path": "/", @@ -2838,7 +3408,6 @@ "matrix.live.bstein.dev" ], "comms:comms/othrys-synapse": [ - "bstein.dev", "kit.live.bstein.dev", "live.bstein.dev", "matrix.live.bstein.dev", @@ -2853,6 +3422,9 @@ "logging:logging/data-prepper": [ "registry.bstein.dev" ], + "longhorn:longhorn-system/longhorn": [ + "registry.bstein.dev" + ], "mailu:mailu-mailserver/mailu": [ "bstein.dev", "mail.bstein.dev" @@ -2862,8 +3434,12 @@ ], "monitoring:monitoring/grafana": [ "bstein.dev", + "mail.bstein.dev", "metrics.bstein.dev", "sso.bstein.dev" + ], + "monitoring:monitoring/kube-state-metrics": [ + "atlas.bstein.dev" ] } } diff --git a/services/comms/knowledge/catalog/atlas.yaml b/services/comms/knowledge/catalog/atlas.yaml index 6529e1a..637b5f9 100644 --- a/services/comms/knowledge/catalog/atlas.yaml +++ b/services/comms/knowledge/catalog/atlas.yaml @@ -1,4 +1,4 @@ -# services/comms/knowledge/catalog/atlas.yaml +# knowledge/catalog/atlas.yaml # Generated by scripts/knowledge_render_atlas.py (do not edit by hand) cluster: atlas sources: @@ -8,6 +8,15 @@ sources: - name: bstein-dev-home path: services/bstein-dev-home targetNamespace: bstein-dev-home +- name: bstein-dev-home-migrations + path: services/bstein-dev-home/migrations + targetNamespace: bstein-dev-home +- name: cert-manager + path: infrastructure/cert-manager + targetNamespace: cert-manager +- name: cert-manager-cleanup + path: infrastructure/cert-manager/cleanup + targetNamespace: cert-manager - name: comms path: services/comms targetNamespace: comms @@ -17,6 +26,9 @@ sources: - name: crypto path: services/crypto targetNamespace: crypto +- name: finance + path: services/finance + targetNamespace: finance - name: flux-system path: clusters/atlas/flux-system targetNamespace: null @@ -29,6 +41,9 @@ sources: - name: harbor path: 
services/harbor targetNamespace: harbor +- name: health + path: services/health + targetNamespace: health - name: helm path: infrastructure/sources/helm targetNamespace: flux-system @@ -44,6 +59,12 @@ sources: - name: logging path: services/logging targetNamespace: null +- name: longhorn + path: infrastructure/longhorn/core + targetNamespace: longhorn-system +- name: longhorn-adopt + path: infrastructure/longhorn/adopt + targetNamespace: longhorn-system - name: longhorn-ui path: infrastructure/longhorn/ui-ingress targetNamespace: longhorn-system @@ -98,9 +119,15 @@ sources: - name: vault-csi path: infrastructure/vault-csi targetNamespace: kube-system +- name: vault-injector + path: infrastructure/vault-injector + targetNamespace: vault - name: vaultwarden path: services/vaultwarden targetNamespace: vaultwarden +- name: wallet-monero-temp + path: services/crypto/wallet-monero-temp + targetNamespace: crypto - name: xmr-miner path: services/crypto/xmr-miner targetNamespace: crypto @@ -124,7 +151,7 @@ workloads: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - - registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92 + - registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-157 - kind: Deployment namespace: bstein-dev-home name: bstein-dev-home-frontend @@ -135,13 +162,22 @@ workloads: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - - registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92 + - registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-157 +- kind: Deployment + namespace: bstein-dev-home + name: bstein-dev-home-vault-sync + labels: + app: bstein-dev-home-vault-sync + serviceAccountName: bstein-dev-home-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: bstein-dev-home name: chat-ai-gateway labels: app: chat-ai-gateway - serviceAccountName: null + serviceAccountName: bstein-dev-home nodeSelector: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' @@ -157,12 +193,21 @@ workloads: hardware: rpi5 images: - python:3.11-slim +- kind: Deployment + namespace: comms + name: comms-vault-sync + labels: + app: comms-vault-sync + serviceAccountName: comms-vault + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: comms name: coturn labels: app: coturn - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: @@ -182,7 +227,7 @@ workloads: name: livekit labels: app: livekit - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: @@ -192,17 +237,17 @@ workloads: name: livekit-token-service labels: app: livekit-token-service - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: - - ghcr.io/element-hq/lk-jwt-service:0.3.0 + - registry.bstein.dev/tools/lk-jwt-service-vault:0.3.0 - kind: Deployment namespace: comms name: matrix-authentication-service labels: app: matrix-authentication-service - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: hardware: rpi5 images: @@ -212,7 +257,7 @@ workloads: name: matrix-guest-register labels: app.kubernetes.io/name: matrix-guest-register - serviceAccountName: null + serviceAccountName: comms-vault nodeSelector: {} images: - python:3.11-slim @@ -235,12 +280,21 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9 +- kind: Deployment + namespace: crypto + name: 
crypto-vault-sync + labels: + app: crypto-vault-sync + serviceAccountName: crypto-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: crypto name: monero-p2pool labels: app: monero-p2pool - serviceAccountName: null + serviceAccountName: crypto-vault-sync nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -255,6 +309,38 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - registry.bstein.dev/crypto/monerod:0.18.4.1 +- kind: Deployment + namespace: crypto + name: wallet-monero-temp + labels: + app: wallet-monero-temp + serviceAccountName: crypto-vault-sync + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1 +- kind: Deployment + namespace: finance + name: actual-budget + labels: + app: actual-budget + serviceAccountName: finance-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - actualbudget/actual-server:26.1.0-alpine@sha256:34aae5813fdfee12af2a50c4d0667df68029f1d61b90f45f282473273eb70d0d +- kind: Deployment + namespace: finance + name: firefly + labels: + app: firefly + serviceAccountName: finance-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - fireflyiii/core:version-6.4.15 - kind: Deployment namespace: flux-system name: helm-controller @@ -344,17 +430,38 @@ workloads: name: gitea labels: app: gitea - serviceAccountName: null + serviceAccountName: gitea-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: - gitea/gitea:1.23 +- kind: Deployment + namespace: harbor + name: harbor-vault-sync + labels: + app: harbor-vault-sync + serviceAccountName: harbor-vault-sync + nodeSelector: {} + images: + - alpine:3.20 +- kind: Deployment + namespace: health + name: wger + labels: + app: wger + serviceAccountName: health-vault-sync + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - nginx:1.27.5-alpine@sha256:65645c7bb6a0661892a8b03b89d0743208a18dd2f3f17a54ef4b76fb8e2f2a10 + - wger/server@sha256:710588b78af4e0aa0b4d8a8061e4563e16eae80eeaccfe7f9e0d9cbdd7f0cbc5 - kind: Deployment namespace: jellyfin name: jellyfin labels: app: jellyfin - serviceAccountName: null + serviceAccountName: pegasus-vault-sync nodeSelector: {} images: - docker.io/jellyfin/jellyfin:10.11.5 @@ -363,13 +470,22 @@ workloads: name: pegasus labels: app: pegasus - serviceAccountName: null + serviceAccountName: pegasus-vault-sync nodeSelector: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' images: - alpine:3.20 - - registry.bstein.dev/streaming/pegasus:1.2.32 + - registry.bstein.dev/streaming/pegasus-vault:1.2.32 +- kind: Deployment + namespace: jellyfin + name: pegasus-vault-sync + labels: + app: pegasus-vault-sync + serviceAccountName: pegasus-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: jenkins name: jenkins @@ -381,6 +497,26 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - jenkins/jenkins:2.528.3-jdk21 +- kind: Deployment + namespace: jenkins + name: jenkins-vault-sync + labels: + app: jenkins-vault-sync + serviceAccountName: jenkins-vault-sync + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - alpine:3.20 +- kind: DaemonSet + namespace: kube-system + name: ntp-sync + labels: + app: ntp-sync + serviceAccountName: null + nodeSelector: {} + images: + - public.ecr.aws/docker/library/busybox:1.36.1 - kind: DaemonSet 
namespace: kube-system name: nvidia-device-plugin-jetson @@ -427,6 +563,16 @@ workloads: kubernetes.io/os: linux images: - hashicorp/vault-csi-provider:1.7.0 +- kind: Deployment + namespace: kube-system + name: coredns + labels: + k8s-app: kube-dns + serviceAccountName: coredns + nodeSelector: + kubernetes.io/os: linux + images: + - registry.bstein.dev/infra/coredns:1.12.1 - kind: DaemonSet namespace: logging name: node-image-gc-rpi4 @@ -457,22 +603,41 @@ workloads: hardware: rpi5 images: - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 +- kind: Deployment + namespace: logging + name: logging-vault-sync + labels: + app: logging-vault-sync + serviceAccountName: logging-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: logging name: oauth2-proxy-logs labels: app: oauth2-proxy-logs - serviceAccountName: null + serviceAccountName: logging-vault-sync nodeSelector: node-role.kubernetes.io/worker: 'true' images: - - quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 + - registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0 +- kind: Deployment + namespace: longhorn-system + name: longhorn-vault-sync + labels: + app: longhorn-vault-sync + serviceAccountName: longhorn-vault-sync + nodeSelector: + node-role.kubernetes.io/worker: 'true' + images: + - alpine:3.20 - kind: Deployment namespace: longhorn-system name: oauth2-proxy-longhorn labels: app: oauth2-proxy-longhorn - serviceAccountName: null + serviceAccountName: longhorn-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -489,13 +654,34 @@ workloads: - registry.bstein.dev/bstein/kubectl:1.35.0 - kind: Deployment namespace: mailu-mailserver - name: mailu-sync-listener + name: mailu-vault-sync labels: - app: mailu-sync-listener - serviceAccountName: null + app: mailu-vault-sync + serviceAccountName: mailu-vault-sync nodeSelector: {} images: - - python:3.11-alpine + - alpine:3.20 +- kind: DaemonSet + namespace: maintenance + name: disable-k3s-traefik + labels: + app: disable-k3s-traefik + serviceAccountName: disable-k3s-traefik + nodeSelector: + node-role.kubernetes.io/control-plane: 'true' + images: + - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 +- kind: DaemonSet + namespace: maintenance + name: k3s-agent-restart + labels: + app: k3s-agent-restart + serviceAccountName: node-nofile + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 - kind: DaemonSet namespace: maintenance name: node-image-sweeper @@ -515,6 +701,26 @@ workloads: nodeSelector: {} images: - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 +- kind: Deployment + namespace: maintenance + name: ariadne + labels: + app: ariadne + serviceAccountName: ariadne + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' + images: + - registry.bstein.dev/bstein/ariadne:0.1.0-49 +- kind: Deployment + namespace: maintenance + name: maintenance-vault-sync + labels: + app: maintenance-vault-sync + serviceAccountName: maintenance-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: DaemonSet namespace: monitoring name: dcgm-exporter @@ -534,12 +740,21 @@ workloads: jetson: 'true' images: - python:3.10-slim +- kind: Deployment + namespace: monitoring + name: monitoring-vault-sync + labels: + app: monitoring-vault-sync + serviceAccountName: monitoring-vault-sync + 
nodeSelector: {} + images: + - alpine:3.20 - kind: Deployment namespace: monitoring name: postmark-exporter labels: app: postmark-exporter - serviceAccountName: null + serviceAccountName: monitoring-vault-sync nodeSelector: {} images: - python:3.12-alpine @@ -558,7 +773,7 @@ workloads: name: nextcloud labels: app: nextcloud - serviceAccountName: null + serviceAccountName: nextcloud-vault nodeSelector: hardware: rpi5 images: @@ -568,7 +783,7 @@ workloads: name: outline labels: app: outline - serviceAccountName: null + serviceAccountName: outline-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -588,7 +803,7 @@ workloads: name: planka labels: app: planka - serviceAccountName: null + serviceAccountName: planka-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: @@ -603,13 +818,16 @@ workloads: node-role.kubernetes.io/worker: 'true' images: - postgres:15 + - quay.io/prometheuscommunity/postgres-exporter:v0.15.0 - kind: Deployment namespace: sso name: keycloak labels: app: keycloak - serviceAccountName: null - nodeSelector: {} + serviceAccountName: sso-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' images: - quay.io/keycloak/keycloak:26.0.7 - kind: Deployment @@ -617,17 +835,26 @@ workloads: name: oauth2-proxy labels: app: oauth2-proxy - serviceAccountName: null + serviceAccountName: sso-vault nodeSelector: node-role.kubernetes.io/worker: 'true' images: - - quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 + - registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0 +- kind: Deployment + namespace: sso + name: sso-vault-sync + labels: + app: sso-vault-sync + serviceAccountName: sso-vault-sync + nodeSelector: {} + images: + - alpine:3.20 - kind: StatefulSet namespace: sso name: openldap labels: app: openldap - serviceAccountName: null + serviceAccountName: sso-vault nodeSelector: kubernetes.io/arch: arm64 node-role.kubernetes.io/worker: 'true' @@ -640,7 +867,7 @@ workloads: app: sui-metrics serviceAccountName: sui-metrics nodeSelector: - kubernetes.io/hostname: titan-24 + hardware: rpi5 images: - victoriametrics/vmagent:v1.103.0 - kind: Deployment @@ -648,6 +875,8 @@ workloads: name: traefik labels: app: traefik + app.kubernetes.io/instance: traefik-kube-system + app.kubernetes.io/name: traefik serviceAccountName: traefik-ingress-controller nodeSelector: node-role.kubernetes.io/worker: 'true' @@ -669,10 +898,12 @@ workloads: name: vaultwarden labels: app: vaultwarden - serviceAccountName: null - nodeSelector: {} + serviceAccountName: vaultwarden-vault + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: 'true' images: - - vaultwarden/server:1.33.2 + - vaultwarden/server:1.35.2 services: - namespace: ai name: ollama @@ -1040,6 +1271,36 @@ services: port: 3333 targetPort: 3333 protocol: TCP +- namespace: crypto + name: wallet-monero-temp + type: ClusterIP + selector: + app: wallet-monero-temp + ports: + - name: rpc + port: 18083 + targetPort: 18083 + protocol: TCP +- namespace: finance + name: actual-budget + type: ClusterIP + selector: + app: actual-budget + ports: + - name: http + port: 80 + targetPort: 5006 + protocol: TCP +- namespace: finance + name: firefly + type: ClusterIP + selector: + app: firefly + ports: + - name: http + port: 80 + targetPort: 8080 + protocol: TCP - namespace: flux-system name: notification-controller type: ClusterIP @@ -1082,7 +1343,7 @@ services: protocol: TCP - namespace: gitea name: gitea-ssh - type: NodePort + type: LoadBalancer selector: app: gitea ports: @@ -1090,6 
+1351,16 @@ services: port: 2242 targetPort: 2242 protocol: TCP +- namespace: health + name: wger + type: ClusterIP + selector: + app: wger + ports: + - name: http + port: 80 + targetPort: http + protocol: TCP - namespace: jellyfin name: jellyfin type: ClusterIP @@ -1124,21 +1395,6 @@ services: port: 50000 targetPort: 50000 protocol: TCP -- namespace: kube-system - name: traefik - type: LoadBalancer - selector: - app.kubernetes.io/instance: traefik-kube-system - app.kubernetes.io/name: traefik - ports: - - name: web - port: 80 - targetPort: web - protocol: TCP - - name: websecure - port: 443 - targetPort: websecure - protocol: TCP - namespace: logging name: oauth2-proxy-logs type: ClusterIP @@ -1191,15 +1447,15 @@ services: port: 4190 targetPort: 4190 protocol: TCP -- namespace: mailu-mailserver - name: mailu-sync-listener +- namespace: maintenance + name: ariadne type: ClusterIP selector: - app: mailu-sync-listener + app: ariadne ports: - name: http - port: 8080 - targetPort: 8080 + port: 80 + targetPort: http protocol: TCP - namespace: monitoring name: dcgm-exporter @@ -1291,6 +1547,10 @@ services: port: 5432 targetPort: 5432 protocol: TCP + - name: metrics + port: 9187 + targetPort: 9187 + protocol: TCP - namespace: sso name: keycloak type: ClusterIP @@ -1335,6 +1595,20 @@ services: port: 8429 targetPort: 8429 protocol: TCP +- namespace: traefik + name: traefik + type: LoadBalancer + selector: + app: traefik + ports: + - name: web + port: 80 + targetPort: web + protocol: TCP + - name: websecure + port: 443 + targetPort: websecure + protocol: TCP - namespace: traefik name: traefik-metrics type: ClusterIP @@ -1447,6 +1721,19 @@ http_endpoints: kind: Ingress name: bstein-dev-home source: bstein-dev-home +- host: budget.bstein.dev + path: / + backend: + namespace: finance + service: actual-budget + port: 80 + workloads: + - kind: Deployment + name: actual-budget + via: + kind: Ingress + name: actual-budget + source: finance - host: call.live.bstein.dev path: / backend: @@ -1499,6 +1786,19 @@ http_endpoints: kind: Ingress name: nextcloud source: nextcloud +- host: health.bstein.dev + path: / + backend: + namespace: health + service: wger + port: 80 + workloads: + - kind: Deployment + name: wger + via: + kind: Ingress + name: wger + source: health - host: kit.live.bstein.dev path: /livekit/jwt backend: @@ -1558,6 +1858,65 @@ http_endpoints: kind: Ingress name: matrix-routing source: comms +- host: live.bstein.dev + path: /_matrix/client/r0/register + backend: + namespace: comms + service: matrix-guest-register + port: 8080 + workloads: &id003 + - kind: Deployment + name: matrix-guest-register + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/login + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: &id002 + - kind: Deployment + name: matrix-authentication-service + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/logout + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: *id002 + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/refresh + backend: + namespace: comms + service: matrix-authentication-service + port: 8080 + workloads: *id002 + via: + kind: Ingress + name: matrix-routing + source: comms +- host: live.bstein.dev + path: /_matrix/client/v3/register + backend: + namespace: comms + service: 
matrix-guest-register + port: 8080 + workloads: *id003 + via: + kind: Ingress + name: matrix-routing + source: comms - host: logs.bstein.dev path: / backend: @@ -1601,9 +1960,7 @@ http_endpoints: namespace: comms service: matrix-authentication-service port: 8080 - workloads: &id002 - - kind: Deployment - name: matrix-authentication-service + workloads: *id002 via: kind: Ingress name: matrix-routing @@ -1647,9 +2004,7 @@ http_endpoints: namespace: comms service: matrix-guest-register port: 8080 - workloads: &id003 - - kind: Deployment - name: matrix-guest-register + workloads: *id003 via: kind: Ingress name: matrix-routing @@ -1722,6 +2077,19 @@ http_endpoints: kind: Ingress name: monerod source: monerod +- host: money.bstein.dev + path: / + backend: + namespace: finance + service: firefly + port: 80 + workloads: + - kind: Deployment + name: firefly + via: + kind: Ingress + name: firefly + source: finance - host: notes.bstein.dev path: / backend: @@ -1845,7 +2213,6 @@ helmrelease_host_hints: - live.bstein.dev - matrix.live.bstein.dev comms:comms/othrys-synapse: - - bstein.dev - kit.live.bstein.dev - live.bstein.dev - matrix.live.bstein.dev @@ -1856,6 +2223,8 @@ helmrelease_host_hints: - registry.bstein.dev logging:logging/data-prepper: - registry.bstein.dev + longhorn:longhorn-system/longhorn: + - registry.bstein.dev mailu:mailu-mailserver/mailu: - bstein.dev - mail.bstein.dev @@ -1863,5 +2232,8 @@ helmrelease_host_hints: - alerts.bstein.dev monitoring:monitoring/grafana: - bstein.dev + - mail.bstein.dev - metrics.bstein.dev - sso.bstein.dev + monitoring:monitoring/kube-state-metrics: + - atlas.bstein.dev diff --git a/services/comms/knowledge/catalog/metrics.json b/services/comms/knowledge/catalog/metrics.json new file mode 100644 index 0000000..e929db5 --- /dev/null +++ b/services/comms/knowledge/catalog/metrics.json @@ -0,0 +1,1880 @@ +[ + { + "dashboard": "Atlas GPU", + "panel_title": "Namespace GPU Share", + "panel_id": 1, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. 
Switching scope changes the denominator.", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "GPU Util by Namespace", + "panel_id": 2, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "GPU Util by Node", + "panel_id": 3, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "Top Pods by GPU Util", + "panel_id": 4, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (range)", + "panel_id": 1, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Attempts / Failures", + "panel_id": 2, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[$__interval]))", + "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "One-off Job Pods (age hours)", + "panel_id": 3, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Stale (>36h)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) 
(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Missing Success", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Suspended", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (1h)", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (24h)", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Runs (1h)", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[1h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Schedule Last Error (hours ago)", + "panel_id": 10, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Schedule Last Success (hours ago)", + "panel_id": 11, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Last Success (hours ago)", + "panel_id": 12, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - 
max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Last Schedule (hours ago)", + "panel_id": 13, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (1h)", + "panel_id": 14, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (30d)", + "panel_id": 15, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Access Requests", + "panel_id": 16, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(ariadne_access_requests_total)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne CI Coverage (%)", + "panel_id": 17, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "ariadne_ci_coverage_percent{repo=\"ariadne\"}" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne CI Tests (latest)", + "panel_id": 18, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "ariadne_ci_tests_total{repo=\"ariadne\"}" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (1d)", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (7d)", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"7d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Mail Bounces (1d)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "max(postmark_outbound_bounced{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Success Rate (1d)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": 
"prometheus", + "exprs": [ + "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Limit Used (30d)", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit_used_percent)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Send Limit (30d)", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Last Success", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_last_success_timestamp_seconds)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Exporter Errors", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(postmark_request_errors_total)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Bounce Rate (1d vs 7d)", + "panel_id": 13, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_bounce_rate)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Bounced (1d vs 7d)", + "panel_id": 14, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_bounced)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (1d vs 7d)", + "panel_id": 15, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_sent)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Exporter Errors", + "panel_id": 16, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(postmark_request_errors_total)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Ingress Success Rate (5m)", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Error Budget Burn (1h)", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Error Budget Burn (6h)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + 
"atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Edge P99 Latency (ms)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Ingress Traffic", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Egress Traffic", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Intra-Cluster Traffic", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Per-Node Throughput", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0) + sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Top Namespaces", + "panel_id": 9, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]) + rate(container_network_receive_bytes_total{namespace!=\"\"}[5m])) by (namespace))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Top Pods", + "panel_id": 10, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(rate(container_network_transmit_bytes_total{pod!=\"\"}[5m]) + rate(container_network_receive_bytes_total{pod!=\"\"}[5m])) by (namespace,pod))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Traefik Routers (req/s)", + 
"panel_id": 11, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum by (router) (rate(traefik_router_requests_total[5m])))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Traefik Entrypoints (req/s)", + "panel_id": 12, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Worker Nodes Ready", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane Ready", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane Workloads", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "API Server 5xx rate", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "API Server P99 latency", + "panel_id": 10, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "etcd P99 latency", + "panel_id": 11, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Node CPU", + "panel_id": 4, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Node RAM", + "panel_id": 5, 
+ "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane (incl. titan-db) CPU", + "panel_id": 6, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane (incl. titan-db) RAM", + "panel_id": 7, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Root Filesystem Usage", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control Plane Ready", + "panel_id": 2, + "panel_type": "gauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control Plane Workloads", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"}) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Stuck Terminating", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (((time() - 
kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Atlas Availability", + "panel_id": 27, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Problem Pods", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Workers Ready", + "panel_id": 1, + "panel_type": "gauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: CPU", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: RAM", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: NET (rx+tx)", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) 
((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: I/O (r+w)", + "panel_id": 10, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Sent (1d)", + "panel_id": 30, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Bounces (1d)", + "panel_id": 31, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "max(postmark_outbound_bounced{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Success Rate (1d)", + "panel_id": 32, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Limit Used (30d)", + "panel_id": 33, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit_used_percent)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Postgres Connections Used", + "panel_id": 34, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Postgres Hottest Connections", + "panel_id": 35, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(1, sum by (datname) (pg_stat_activity_count))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Astreae Usage", + "panel_id": 23, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": 
"Asteria Usage", + "panel_id": 24, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Astreae Free", + "panel_id": 25, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Asteria Free", + "panel_id": 26, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "One-off Job Pods (age hours)", + "panel_id": 40, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Ariadne Attempts / Failures", + "panel_id": 41, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[$__interval]))", + "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Ariadne Test Success Rate", + "panel_id": 42, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Tests with Failures (24h)", + "panel_id": 43, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace CPU Share", + "panel_id": 11, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. 
Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace GPU Share", + "panel_id": 12, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace RAM Share", + "panel_id": 13, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Worker Node CPU", + "panel_id": 14, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Worker Node RAM", + "panel_id": 15, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", 
\"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control plane CPU", + "panel_id": 16, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control plane RAM", + "panel_id": 17, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Node Pod Share", + "panel_id": 28, + "panel_type": "piechart", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Top Nodes by Pod Count", + "panel_id": 29, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Cluster Ingress Throughput", + "panel_id": 18, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Cluster Egress Throughput", + "panel_id": 19, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Intra-Cluster Throughput", + "panel_id": 20, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Root Filesystem Usage", + "panel_id": 21, 
+ "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Nodes Closest to Full Root Disks", + "panel_id": 22, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Problem Pods", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Stuck Terminating (>10m)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Control Plane Workloads", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Pods Not Running", + "panel_id": 5, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 6, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(time() - kube_pod_created{pod!=\"\"}) * 
on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) max by (namespace,pod,container,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Terminating >10m", + "panel_id": 7, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(((time() - kube_pod_deletion_timestamp{pod!=\"\"}) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) * on(namespace,pod) group_left(node) kube_pod_info)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Node Pod Share", + "panel_id": 8, + "panel_type": "piechart", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Top Nodes by Pod Count", + "panel_id": 9, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Namespace Plurality by Node v27", + "panel_id": 10, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) 
(kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Usage", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Usage", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Free", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Free", 
+ "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Per-Node Usage", + "panel_id": 5, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Per-Node Usage", + "panel_id": 6, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Usage History", + "panel_id": 7, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Usage History", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Maintenance Sweepers Ready", + "panel_id": 30, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Maintenance Cron Freshness (s)", + "panel_id": 31, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "time() - max by (cronjob) 
(kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=\"image-sweeper\"})" + ] + } +] diff --git a/services/comms/knowledge/catalog/runbooks.json b/services/comms/knowledge/catalog/runbooks.json index d7356ca..960510d 100644 --- a/services/comms/knowledge/catalog/runbooks.json +++ b/services/comms/knowledge/catalog/runbooks.json @@ -20,6 +20,22 @@ ], "body": "# CI: Gitea \u2192 Jenkins pipeline\n\n## What this is\nAtlas uses Gitea for source control and Jenkins for CI. Authentication is via Keycloak (SSO).\n\n## Where it is configured\n- Gitea manifests: `services/gitea/`\n- Jenkins manifests: `services/jenkins/`\n- Credential sync helpers: `scripts/gitea_cred_sync.sh`, `scripts/jenkins_cred_sync.sh`\n\n## What users do (typical flow)\n- Create a repo in Gitea.\n- Create/update a Jenkins job/pipeline that can fetch the repo.\n- Configure a webhook (or SCM polling) so pushes trigger builds.\n\n## Troubleshooting (common)\n- \u201cWebhook not firing\u201d: confirm ingress host, webhook URL, and Jenkins job is reachable.\n- \u201cAuth denied cloning\u201d: confirm Keycloak group membership and that Jenkins has a valid token/credential configured." }, + { + "path": "runbooks/comms-verify.md", + "title": "Othrys verification checklist", + "tags": [ + "comms", + "matrix", + "element", + "livekit" + ], + "entrypoints": [ + "https://live.bstein.dev", + "https://matrix.live.bstein.dev" + ], + "source_paths": [], + "body": "1) Guest join:\n- Open a private window and visit:\n `https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join`\n- Confirm the guest join flow works and the displayname becomes `-`.\n\n2) Keycloak login:\n- Log in from `https://live.bstein.dev` and confirm MAS -> Keycloak -> Element redirect.\n\n3) Video rooms:\n- Start an Element Call room and confirm audio/video with a second account.\n- Check that guests can read public rooms but cannot start calls.\n\n4) Well-known:\n- `https://live.bstein.dev/.well-known/matrix/client` returns JSON.\n- `https://matrix.live.bstein.dev/.well-known/matrix/client` returns JSON.\n\n5) TURN reachability:\n- Confirm `turn.live.bstein.dev:3478` and `turns:5349` are reachable from WAN." 
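Note on the Othrys verification checklist above: steps 4 and 5 are easy to script. A minimal sketch follows; it assumes the `requests` package is available, that the client well-known payload exposes the standard `m.homeserver` key, and that the coturn listener also answers on TCP (the probe below is TCP-only, so a UDP-only deployment would need a STUN client instead). Hostnames and ports are taken verbatim from the runbook.

```python
#!/usr/bin/env python3
"""Sketch of the scriptable parts of the Othrys verification checklist.

Assumptions: `requests` is installed and the TURN listener answers on TCP as
well as UDP; a UDP-only coturn setup would need a STUN client instead.
"""
import socket

import requests

WELL_KNOWN_URLS = [
    "https://live.bstein.dev/.well-known/matrix/client",
    "https://matrix.live.bstein.dev/.well-known/matrix/client",
]
TURN_ENDPOINTS = [("turn.live.bstein.dev", 3478), ("turn.live.bstein.dev", 5349)]


def check_well_known() -> None:
    # Checklist item 4: both well-known endpoints must return valid JSON.
    for url in WELL_KNOWN_URLS:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        body = resp.json()  # raises ValueError if the response is not JSON
        assert "m.homeserver" in body, f"{url}: missing m.homeserver"
        print(f"ok: {url}")


def check_turn_tcp() -> None:
    # Checklist item 5: TURN ports reachable from outside (TCP reachability only).
    for host, port in TURN_ENDPOINTS:
        with socket.create_connection((host, port), timeout=5):
            print(f"ok: {host}:{port} reachable (TCP)")


if __name__ == "__main__":
    check_well_known()
    check_turn_tcp()
```

Run from any host with WAN egress this covers the same ground as checklist items 4 and 5; items 1 to 3 (guest join, Keycloak login, video rooms) remain manual.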
+ }, { "path": "runbooks/kb-authoring.md", "title": "KB authoring: what to write (and what not to)", @@ -69,5 +85,13 @@ "clusters/atlas/<...>" ], "body": "# \n\n## What this is\n\n## For users (how to)\n\n## For operators (where configured)\n\n## Troubleshooting (symptoms \u2192 checks)" + }, + { + "path": "software/metis.md", + "title": "metis", + "tags": [], + "entrypoints": [], + "source_paths": [], + "body": "# Metis (node recovery)\n\n## Node classes (current map)\n- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)\n- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)\n- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks)\n- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent)\n- amd64 agents: titan-22/24 (Debian 13, k3s agent)\n- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, plus any newcomers.\n\n### Jetson nodes (titan-20/21)\n- Ubuntu 20.04.6 (Focal), kernel 5.10.104-tegra, CRI containerd 2.0.5-k3s2, arch arm64.\n- Storage: NVMe 232G at / (ext4); onboard mmc partitions present but root on NVMe; 1.9T sda present (unused).\n- k3s agent with drop-in 99-nofile.conf.\n\n## Longhorn disk UUIDs (critical nodes)\n- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4)\n- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4)\n- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4)\n- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4)\n\n## Metis repo (~/Development/metis)\n- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`).\n- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints).\n- `AGENTS.md` in repo is untracked and holds raw notes.\n\n## Next implementation steps\n- Add per-class golden image refs and checksums (Harbor or file://) when ready.\n- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. 
Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths.\n- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection.\n- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited.\n\n## Node OS/Kernel/CRI snapshot (Jan 2026)\n- titan-04: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-05: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-06: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-07: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-08: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-09: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0a: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0b: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0c: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-10: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-11: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-12: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-13: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-14: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-15: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-17: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-18: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-19: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-20: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-21: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-22: Debian 13 (trixie), kernel 6.12.41+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64\n- titan-24: Debian 13 (trixie), kernel 6.12.57+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64\n\n\n### External hosts\n- titan-db: Ubuntu 24.10, kernel 6.11.0-1015-raspi, root on /dev/sda2 ext4 (465G), boot vfat /dev/sda1; PostgreSQL service enabled.\n- titan-jh: Arch Linux ARM (rolling), kernel 6.18.4-2-rpi, NVMe root ext4 238G (/), boot vfat 512M; ~495 packages installed (pacman -Q).\n- titan-23/oceanus: TODO audit (future).\n\n\n### Control plane Pis (titan-0a/0b/0c)\n- Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, containerd 2.0.5-k3s2.\n- Storage: 477G SSD root (/dev/sda2 ext4), /boot/firmware vfat (/dev/sda1). 
fstab uses LABEL=writable and LABEL=system-boot.\n- k3s server (control-plane taint expected); etcd snapshots not yet cataloged (TODO).\n\n\n## k3s versions\n- rpi5 workers/control-plane: k3s v1.33.3+k3s1 (crictl v1.31.0-k3s2)\n- rpi4 nodes: k3s v1.31.5+k3s1 (crictl v1.31.0-k3s2)\n- Jetson titan-20/21: k3s v1.33.3+k3s1 (per node info), crictl v1.31.0-k3s2" } ] diff --git a/services/comms/knowledge/diagrams/atlas-http.mmd b/services/comms/knowledge/diagrams/atlas-http.mmd index ab7c362..1aa7ac8 100644 --- a/services/comms/knowledge/diagrams/atlas-http.mmd +++ b/services/comms/knowledge/diagrams/atlas-http.mmd @@ -17,6 +17,11 @@ flowchart LR host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"] svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend + host_budget_bstein_dev["budget.bstein.dev"] + svc_finance_actual_budget["finance/actual-budget (Service)"] + host_budget_bstein_dev --> svc_finance_actual_budget + wl_finance_actual_budget["finance/actual-budget (Deployment)"] + svc_finance_actual_budget --> wl_finance_actual_budget host_call_live_bstein_dev["call.live.bstein.dev"] svc_comms_element_call["comms/element-call (Service)"] host_call_live_bstein_dev --> svc_comms_element_call @@ -37,6 +42,11 @@ flowchart LR host_cloud_bstein_dev --> svc_nextcloud_nextcloud wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"] svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud + host_health_bstein_dev["health.bstein.dev"] + svc_health_wger["health/wger (Service)"] + host_health_bstein_dev --> svc_health_wger + wl_health_wger["health/wger (Deployment)"] + svc_health_wger --> wl_health_wger host_kit_live_bstein_dev["kit.live.bstein.dev"] svc_comms_livekit_token_service["comms/livekit-token-service (Service)"] host_kit_live_bstein_dev --> svc_comms_livekit_token_service @@ -50,6 +60,14 @@ flowchart LR host_live_bstein_dev --> svc_comms_matrix_wellknown svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"] host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse + svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"] + host_live_bstein_dev --> svc_comms_matrix_guest_register + wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"] + svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register + svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"] + host_live_bstein_dev --> svc_comms_matrix_authentication_service + wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"] + svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service host_logs_bstein_dev["logs.bstein.dev"] svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"] host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs @@ -64,21 +82,20 @@ flowchart LR svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"] host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front host_matrix_live_bstein_dev["matrix.live.bstein.dev"] - svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"] host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service - wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"] - svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service host_matrix_live_bstein_dev --> 
svc_comms_matrix_wellknown host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse - svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"] host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register - wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"] - svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register host_monero_bstein_dev["monero.bstein.dev"] svc_crypto_monerod["crypto/monerod (Service)"] host_monero_bstein_dev --> svc_crypto_monerod wl_crypto_monerod["crypto/monerod (Deployment)"] svc_crypto_monerod --> wl_crypto_monerod + host_money_bstein_dev["money.bstein.dev"] + svc_finance_firefly["finance/firefly (Service)"] + host_money_bstein_dev --> svc_finance_firefly + wl_finance_firefly["finance/firefly (Deployment)"] + svc_finance_firefly --> wl_finance_firefly host_notes_bstein_dev["notes.bstein.dev"] svc_outline_outline["outline/outline (Service)"] host_notes_bstein_dev --> svc_outline_outline @@ -143,19 +160,29 @@ flowchart LR svc_comms_livekit wl_comms_livekit svc_comms_othrys_synapse_matrix_synapse - svc_comms_matrix_authentication_service - wl_comms_matrix_authentication_service svc_comms_matrix_guest_register wl_comms_matrix_guest_register + svc_comms_matrix_authentication_service + wl_comms_matrix_authentication_service end subgraph crypto[crypto] svc_crypto_monerod wl_crypto_monerod end + subgraph finance[finance] + svc_finance_actual_budget + wl_finance_actual_budget + svc_finance_firefly + wl_finance_firefly + end subgraph gitea[gitea] svc_gitea_gitea wl_gitea_gitea end + subgraph health[health] + svc_health_wger + wl_health_wger + end subgraph jellyfin[jellyfin] svc_jellyfin_pegasus wl_jellyfin_pegasus diff --git a/services/comms/knowledge/metis.md b/services/comms/knowledge/metis.md new file mode 100644 index 0000000..5b0d06b --- /dev/null +++ b/services/comms/knowledge/metis.md @@ -0,0 +1,26 @@ +# Metis (node recovery) + +## Node classes (current map) +- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent) +- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint) +- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks) +- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent) +- amd64 agents: titan-22/24 (Debian 13, k3s agent) +- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, future titan-20/21 (when added), plus any newcomers. + +## Longhorn disk UUIDs (critical nodes) +- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4) +- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4) +- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4) +- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4) + +## Metis repo (~/Development/metis) +- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`). +- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints). +- `AGENTS.md` in repo is untracked and holds raw notes. + +## Next implementation steps +- Add per-class golden image refs and checksums (Harbor or file://) when ready. 
+- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths. +- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection. +- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited. diff --git a/services/comms/knowledge/runbooks/comms-verify.md b/services/comms/knowledge/runbooks/comms-verify.md new file mode 100644 index 0000000..8c09d0a --- /dev/null +++ b/services/comms/knowledge/runbooks/comms-verify.md @@ -0,0 +1,30 @@ +--- +title: Othrys verification checklist +tags: + - comms + - matrix + - element + - livekit +entrypoints: + - https://live.bstein.dev + - https://matrix.live.bstein.dev +--- + +1) Guest join: +- Open a private window and visit: + `https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join` +- Confirm the guest join flow works and the displayname becomes `-`. + +2) Keycloak login: +- Log in from `https://live.bstein.dev` and confirm MAS -> Keycloak -> Element redirect. + +3) Video rooms: +- Start an Element Call room and confirm audio/video with a second account. +- Check that guests can read public rooms but cannot start calls. + +4) Well-known: +- `https://live.bstein.dev/.well-known/matrix/client` returns JSON. +- `https://matrix.live.bstein.dev/.well-known/matrix/client` returns JSON. + +5) TURN reachability: +- Confirm `turn.live.bstein.dev:3478` and `turns:5349` are reachable from WAN. diff --git a/services/comms/knowledge/software/metis.md b/services/comms/knowledge/software/metis.md new file mode 100644 index 0000000..7ca3b39 --- /dev/null +++ b/services/comms/knowledge/software/metis.md @@ -0,0 +1,73 @@ +# Metis (node recovery) + +## Node classes (current map) +- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent) +- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint) +- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks) +- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent) +- amd64 agents: titan-22/24 (Debian 13, k3s agent) +- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, plus any newcomers. + +### Jetson nodes (titan-20/21) +- Ubuntu 20.04.6 (Focal), kernel 5.10.104-tegra, CRI containerd 2.0.5-k3s2, arch arm64. +- Storage: NVMe 232G at / (ext4); onboard mmc partitions present but root on NVMe; 1.9T sda present (unused). +- k3s agent with drop-in 99-nofile.conf. + +## Longhorn disk UUIDs (critical nodes) +- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4) +- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4) +- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4) +- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4) + +## Metis repo (~/Development/metis) +- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`). 
+- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints). +- `AGENTS.md` in repo is untracked and holds raw notes. + +## Next implementation steps +- Add per-class golden image refs and checksums (Harbor or file://) when ready. +- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths. +- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection. +- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited. + +## Node OS/Kernel/CRI snapshot (Jan 2026) +- titan-04: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-05: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-06: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-07: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-08: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-09: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-0a: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-0b: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-0c: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-10: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-11: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-12: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-13: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-14: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-15: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-17: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-18: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-19: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64 +- titan-20: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-21: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64 +- titan-22: Debian 13 (trixie), kernel 6.12.41+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64 +- titan-24: Debian 13 (trixie), kernel 6.12.57+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64 + + +### External hosts +- titan-db: Ubuntu 24.10, kernel 6.11.0-1015-raspi, root on /dev/sda2 ext4 (465G), boot vfat /dev/sda1; PostgreSQL service enabled. +- titan-jh: Arch Linux ARM (rolling), kernel 6.18.4-2-rpi, NVMe root ext4 238G (/), boot vfat 512M; ~495 packages installed (pacman -Q). +- titan-23/oceanus: TODO audit (future). + + +### Control plane Pis (titan-0a/0b/0c) +- Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, containerd 2.0.5-k3s2. 
+- Storage: 477G SSD root (/dev/sda2 ext4), /boot/firmware vfat (/dev/sda1). fstab uses LABEL=writable and LABEL=system-boot. +- k3s server (control-plane taint expected); etcd snapshots not yet cataloged (TODO). + + +## k3s versions +- rpi5 workers/control-plane: k3s v1.33.3+k3s1 (crictl v1.31.0-k3s2) +- rpi4 nodes: k3s v1.31.5+k3s1 (crictl v1.31.0-k3s2) +- Jetson titan-20/21: k3s v1.33.3+k3s1 (per node info), crictl v1.31.0-k3s2 diff --git a/services/comms/kustomization.yaml b/services/comms/kustomization.yaml index 3360067..969ca58 100644 --- a/services/comms/kustomization.yaml +++ b/services/comms/kustomization.yaml @@ -14,6 +14,7 @@ resources: - guest-register-deployment.yaml - guest-register-service.yaml - atlasbot-deployment.yaml + - atlasbot-service.yaml - wellknown.yaml - atlasbot-rbac.yaml - mas-secrets-ensure-rbac.yaml @@ -21,23 +22,24 @@ resources: - mas-db-ensure-rbac.yaml - synapse-signingkey-ensure-rbac.yaml - vault-sync-deployment.yaml - - mas-admin-client-secret-ensure-job.yaml - - mas-db-ensure-job.yaml - - comms-secrets-ensure-job.yaml - - synapse-signingkey-ensure-job.yaml - - synapse-seeder-admin-ensure-job.yaml - - synapse-user-seed-job.yaml - - mas-local-users-ensure-job.yaml + - oneoffs/mas-admin-client-secret-ensure-job.yaml + - oneoffs/mas-db-ensure-job.yaml + - oneoffs/comms-secrets-ensure-job.yaml + - oneoffs/synapse-admin-ensure-job.yaml + - oneoffs/synapse-signingkey-ensure-job.yaml + - oneoffs/synapse-seeder-admin-ensure-job.yaml + - oneoffs/synapse-user-seed-job.yaml + - oneoffs/mas-local-users-ensure-job.yaml - mas-deployment.yaml - livekit-token-deployment.yaml - livekit.yaml - coturn.yaml - seed-othrys-room.yaml - guest-name-job.yaml - - othrys-kick-numeric-job.yaml + - oneoffs/othrys-kick-numeric-job.yaml - pin-othrys-job.yaml - reset-othrys-room-job.yaml - - bstein-force-leave-job.yaml + - oneoffs/bstein-force-leave-job.yaml - livekit-ingress.yaml - livekit-middlewares.yaml - matrix-ingress.yaml @@ -73,5 +75,6 @@ configMapGenerator: - INDEX.md=knowledge/INDEX.md - atlas.json=knowledge/catalog/atlas.json - atlas-summary.json=knowledge/catalog/atlas-summary.json + - metrics.json=knowledge/catalog/metrics.json - runbooks.json=knowledge/catalog/runbooks.json - atlas-http.mmd=knowledge/diagrams/atlas-http.mmd diff --git a/services/comms/mas-configmap.yaml b/services/comms/mas-configmap.yaml index 5e6cfdd..9d2c11e 100644 --- a/services/comms/mas-configmap.yaml +++ b/services/comms/mas-configmap.yaml @@ -72,7 +72,7 @@ data: template: "{{ user.name }}" email: action: force - template: "{{ user.email }}" + template: "{{ user.mailu_email }}" policy: data: diff --git a/services/comms/bstein-force-leave-job.yaml b/services/comms/oneoffs/bstein-force-leave-job.yaml similarity index 96% rename from services/comms/bstein-force-leave-job.yaml rename to services/comms/oneoffs/bstein-force-leave-job.yaml index 0286f8c..7efe826 100644 --- a/services/comms/bstein-force-leave-job.yaml +++ b/services/comms/oneoffs/bstein-force-leave-job.yaml @@ -1,10 +1,15 @@ -# services/comms/bstein-force-leave-job.yaml +# services/comms/oneoffs/bstein-force-leave-job.yaml +# One-off job for comms/bstein-leave-rooms-12. +# Purpose: bstein leave rooms 12 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
apiVersion: batch/v1 kind: Job metadata: name: bstein-leave-rooms-12 namespace: comms spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/comms/comms-secrets-ensure-job.yaml b/services/comms/oneoffs/comms-secrets-ensure-job.yaml similarity index 91% rename from services/comms/comms-secrets-ensure-job.yaml rename to services/comms/oneoffs/comms-secrets-ensure-job.yaml index b71dd40..35ca73c 100644 --- a/services/comms/comms-secrets-ensure-job.yaml +++ b/services/comms/oneoffs/comms-secrets-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/comms/comms-secrets-ensure-job.yaml +# services/comms/oneoffs/comms-secrets-ensure-job.yaml +# One-off job for comms/comms-secrets-ensure-7. +# Purpose: comms secrets ensure 7 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: - name: comms-secrets-ensure-6 + name: comms-secrets-ensure-7 namespace: comms spec: + suspend: true backoffLimit: 1 ttlSecondsAfterFinished: 3600 template: diff --git a/services/comms/mas-admin-client-secret-ensure-job.yaml b/services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml similarity index 90% rename from services/comms/mas-admin-client-secret-ensure-job.yaml rename to services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml index 7b05cca..e1d5458 100644 --- a/services/comms/mas-admin-client-secret-ensure-job.yaml +++ b/services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml @@ -1,4 +1,8 @@ -# services/comms/mas-admin-client-secret-ensure-job.yaml +# services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml +# One-off job for comms/mas-admin-client-secret-writer. +# Purpose: mas admin client secret writer (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: v1 kind: ServiceAccount metadata: @@ -41,6 +45,7 @@ metadata: name: mas-admin-client-secret-ensure-11 namespace: comms spec: + suspend: true backoffLimit: 2 template: spec: diff --git a/services/comms/mas-db-ensure-job.yaml b/services/comms/oneoffs/mas-db-ensure-job.yaml similarity index 91% rename from services/comms/mas-db-ensure-job.yaml rename to services/comms/oneoffs/mas-db-ensure-job.yaml index 56707a9..44137da 100644 --- a/services/comms/mas-db-ensure-job.yaml +++ b/services/comms/oneoffs/mas-db-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/comms/mas-db-ensure-job.yaml +# services/comms/oneoffs/mas-db-ensure-job.yaml +# One-off job for comms/mas-db-ensure-22. +# Purpose: mas db ensure 22 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
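Because a Job's pod template is immutable once created, these one-offs are re-run by bumping the numeric suffix in `metadata.name` (comms-secrets-ensure-6 becomes -7 above) rather than editing the existing Job. A minimal sketch of a hypothetical helper that performs the bump as a plain text edit so the header comments survive; it only touches the first numbered `name:` in a file:

```python
# Hypothetical helper (not part of this change): bump the numeric run suffix in a
# one-off Job manifest, e.g. comms-secrets-ensure-6 -> comms-secrets-ensure-7.
# Plain text edit on purpose: round-tripping through a YAML parser would drop the
# header comments. Only the first numbered name: is bumped (fine for single-Job files).
import re
import sys
from pathlib import Path

NAME_RE = re.compile(r"^(\s*name:\s*)([a-z0-9-]+?)-(\d+)\s*$", re.MULTILINE)


def bump_oneoff(path: Path) -> str:
    text = path.read_text(encoding="utf-8")

    def _bump(match: re.Match) -> str:
        prefix, base, run = match.groups()
        return f"{prefix}{base}-{int(run) + 1}"

    new_text, count = NAME_RE.subn(_bump, text, count=1)
    if not count:
        raise SystemExit(f"no numbered metadata.name found in {path}")
    path.write_text(new_text, encoding="utf-8")
    return NAME_RE.search(new_text).group(0).strip()


if __name__ == "__main__":
    print(bump_oneoff(Path(sys.argv[1])))
```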
apiVersion: batch/v1 kind: Job metadata: name: mas-db-ensure-22 namespace: comms spec: + suspend: true backoffLimit: 1 ttlSecondsAfterFinished: 600 template: diff --git a/services/comms/mas-local-users-ensure-job.yaml b/services/comms/oneoffs/mas-local-users-ensure-job.yaml similarity index 96% rename from services/comms/mas-local-users-ensure-job.yaml rename to services/comms/oneoffs/mas-local-users-ensure-job.yaml index 5802009..7b51072 100644 --- a/services/comms/mas-local-users-ensure-job.yaml +++ b/services/comms/oneoffs/mas-local-users-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/comms/mas-local-users-ensure-job.yaml +# services/comms/oneoffs/mas-local-users-ensure-job.yaml +# One-off job for comms/mas-local-users-ensure-18. +# Purpose: mas local users ensure 18 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: - name: mas-local-users-ensure-15 + name: mas-local-users-ensure-18 namespace: comms spec: + suspend: true backoffLimit: 1 ttlSecondsAfterFinished: 3600 template: diff --git a/services/comms/othrys-kick-numeric-job.yaml b/services/comms/oneoffs/othrys-kick-numeric-job.yaml similarity index 96% rename from services/comms/othrys-kick-numeric-job.yaml rename to services/comms/oneoffs/othrys-kick-numeric-job.yaml index 0d3914a..e38a6bb 100644 --- a/services/comms/othrys-kick-numeric-job.yaml +++ b/services/comms/oneoffs/othrys-kick-numeric-job.yaml @@ -1,10 +1,15 @@ -# services/comms/othrys-kick-numeric-job.yaml +# services/comms/oneoffs/othrys-kick-numeric-job.yaml +# One-off job for comms/othrys-kick-numeric-8. +# Purpose: othrys kick numeric 8 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: othrys-kick-numeric-8 namespace: comms spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/comms/oneoffs/synapse-admin-ensure-job.yaml b/services/comms/oneoffs/synapse-admin-ensure-job.yaml new file mode 100644 index 0000000..95bc9f2 --- /dev/null +++ b/services/comms/oneoffs/synapse-admin-ensure-job.yaml @@ -0,0 +1,219 @@ +# services/comms/oneoffs/synapse-admin-ensure-job.yaml +# One-off job for comms/synapse-admin-ensure-3. +# Purpose: synapse admin ensure 3 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
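Every manifest moved under oneoffs/ now carries `spec.suspend: true`. A guard in the style of the existing ci/tests/glue pytest suite could enforce that; the test below is a sketch rather than an existing file, and assumes PyYAML plus the `services/*/oneoffs/*.yaml` layout with the repo root as the working directory:

```python
# Hypothetical guard test (not present in ci/tests today): every manifest under
# services/*/oneoffs/ should keep its Jobs suspended so Flux never runs them
# implicitly on reconcile.
from pathlib import Path

import yaml


def test_oneoff_jobs_are_suspended():
    offenders = []
    for path in Path("services").glob("*/oneoffs/*.yaml"):
        for doc in yaml.safe_load_all(path.read_text(encoding="utf-8")):
            if not isinstance(doc, dict) or doc.get("kind") != "Job":
                continue
            if doc.get("spec", {}).get("suspend") is not True:
                offenders.append(str(path))
    assert not offenders, f"one-off Jobs missing spec.suspend: true: {offenders}"
```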
+apiVersion: batch/v1 +kind: Job +metadata: + name: synapse-admin-ensure-3 + namespace: comms +spec: + suspend: true + backoffLimit: 0 + ttlSecondsAfterFinished: 3600 + template: + spec: + serviceAccountName: comms-secrets-ensure + restartPolicy: Never + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/worker + operator: Exists + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: kubernetes.io/arch + operator: In + values: ["arm64"] + containers: + - name: ensure + image: python:3.11-slim + env: + - name: VAULT_ADDR + value: http://vault.vault.svc.cluster.local:8200 + - name: VAULT_ROLE + value: comms-secrets + - name: SYNAPSE_ADMIN_URL + value: http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008 + command: + - /bin/sh + - -c + - | + set -euo pipefail + pip install --no-cache-dir psycopg2-binary bcrypt + python - <<'PY' + import json + import os + import secrets + import string + import time + import urllib.error + import urllib.request + + import bcrypt + import psycopg2 + + VAULT_ADDR = os.environ.get("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200").rstrip("/") + VAULT_ROLE = os.environ.get("VAULT_ROLE", "comms-secrets") + SA_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token" + PGHOST = "postgres-service.postgres.svc.cluster.local" + PGPORT = 5432 + PGDATABASE = "synapse" + PGUSER = "synapse" + + def log(msg: str) -> None: + print(msg, flush=True) + + def request_json(url: str, payload: dict | None = None) -> dict: + data = None + headers = {"Content-Type": "application/json"} + if payload is not None: + data = json.dumps(payload).encode("utf-8") + req = urllib.request.Request(url, data=data, headers=headers, method="POST" if data else "GET") + with urllib.request.urlopen(req, timeout=30) as resp: + return json.loads(resp.read().decode("utf-8")) + + def vault_login() -> str: + with open(SA_TOKEN_PATH, "r", encoding="utf-8") as f: + jwt = f.read().strip() + payload = {"jwt": jwt, "role": VAULT_ROLE} + resp = request_json(f"{VAULT_ADDR}/v1/auth/kubernetes/login", payload) + token = resp.get("auth", {}).get("client_token") + if not token: + raise RuntimeError("vault login failed") + return token + + def vault_get(token: str, path: str) -> dict: + req = urllib.request.Request( + f"{VAULT_ADDR}/v1/kv/data/atlas/{path}", + headers={"X-Vault-Token": token}, + ) + try: + with urllib.request.urlopen(req, timeout=30) as resp: + payload = json.loads(resp.read().decode("utf-8")) + return payload.get("data", {}).get("data", {}) + except urllib.error.HTTPError as exc: + if exc.code == 404: + return {} + raise + + def vault_put(token: str, path: str, data: dict) -> None: + payload = {"data": data} + req = urllib.request.Request( + f"{VAULT_ADDR}/v1/kv/data/atlas/{path}", + data=json.dumps(payload).encode("utf-8"), + headers={"X-Vault-Token": token, "Content-Type": "application/json"}, + method="POST", + ) + with urllib.request.urlopen(req, timeout=30) as resp: + resp.read() + + def random_password(length: int = 32) -> str: + alphabet = string.ascii_letters + string.digits + return "".join(secrets.choice(alphabet) for _ in range(length)) + + def ensure_admin_creds(token: str) -> dict: + data = vault_get(token, "comms/synapse-admin") + username = (data.get("username") or "").strip() or "synapse-admin" + password = (data.get("password") or "").strip() + if not password: + password = random_password() + 
data["username"] = username + data["password"] = password + vault_put(token, "comms/synapse-admin", data) + return data + + def ensure_user(cur, cols, user_id, password, admin): + now_ms = int(time.time() * 1000) + values = { + "name": user_id, + "password_hash": bcrypt.hashpw(password.encode(), bcrypt.gensalt()).decode(), + "creation_ts": now_ms, + } + + def add_flag(name, flag): + if name not in cols: + return + if cols[name]["type"] in ("smallint", "integer"): + values[name] = int(flag) + else: + values[name] = bool(flag) + + add_flag("admin", admin) + add_flag("deactivated", False) + add_flag("shadow_banned", False) + add_flag("is_guest", False) + + columns = list(values.keys()) + placeholders = ", ".join(["%s"] * len(columns)) + updates = ", ".join([f"{col}=EXCLUDED.{col}" for col in columns if col != "name"]) + query = f"INSERT INTO users ({', '.join(columns)}) VALUES ({placeholders}) ON CONFLICT (name) DO UPDATE SET {updates};" + cur.execute(query, [values[c] for c in columns]) + + def get_cols(cur): + cur.execute( + """ + SELECT column_name, is_nullable, column_default, data_type + FROM information_schema.columns + WHERE table_schema = 'public' AND table_name = 'users' + """ + ) + cols = {} + for name, is_nullable, default, data_type in cur.fetchall(): + cols[name] = { + "nullable": is_nullable == "YES", + "default": default, + "type": data_type, + } + return cols + + def ensure_access_token(cur, user_id, token_value): + cur.execute("SELECT COALESCE(MAX(id), 0) + 1 FROM access_tokens") + token_id = cur.fetchone()[0] + cur.execute( + """ + INSERT INTO access_tokens (id, user_id, token, device_id, valid_until_ms) + VALUES (%s, %s, %s, %s, NULL) + ON CONFLICT (token) DO NOTHING + """, + (token_id, user_id, token_value, "ariadne-admin"), + ) + + vault_token = vault_login() + admin_data = ensure_admin_creds(vault_token) + if admin_data.get("access_token"): + log("synapse admin token already present") + raise SystemExit(0) + + synapse_db = vault_get(vault_token, "comms/synapse-db") + pg_password = synapse_db.get("POSTGRES_PASSWORD") + if not pg_password: + raise RuntimeError("synapse db password missing") + + user_id = f"@{admin_data['username']}:live.bstein.dev" + conn = psycopg2.connect( + host=PGHOST, + port=PGPORT, + dbname=PGDATABASE, + user=PGUSER, + password=pg_password, + ) + token_value = secrets.token_urlsafe(32) + try: + with conn: + with conn.cursor() as cur: + cols = get_cols(cur) + ensure_user(cur, cols, user_id, admin_data["password"], True) + ensure_access_token(cur, user_id, token_value) + finally: + conn.close() + + admin_data["access_token"] = token_value + vault_put(vault_token, "comms/synapse-admin", admin_data) + log("synapse admin token stored") + PY diff --git a/services/comms/synapse-seeder-admin-ensure-job.yaml b/services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml similarity index 92% rename from services/comms/synapse-seeder-admin-ensure-job.yaml rename to services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml index 9905658..1d8972e 100644 --- a/services/comms/synapse-seeder-admin-ensure-job.yaml +++ b/services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/comms/synapse-seeder-admin-ensure-job.yaml +# services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml +# One-off job for comms/synapse-seeder-admin-ensure-9. +# Purpose: synapse seeder admin ensure 9 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. 
+# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: - name: synapse-seeder-admin-ensure-7 + name: synapse-seeder-admin-ensure-9 namespace: comms spec: + suspend: true backoffLimit: 2 template: metadata: diff --git a/services/comms/synapse-signingkey-ensure-job.yaml b/services/comms/oneoffs/synapse-signingkey-ensure-job.yaml similarity index 88% rename from services/comms/synapse-signingkey-ensure-job.yaml rename to services/comms/oneoffs/synapse-signingkey-ensure-job.yaml index 402a820..bbc4595 100644 --- a/services/comms/synapse-signingkey-ensure-job.yaml +++ b/services/comms/oneoffs/synapse-signingkey-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/comms/synapse-signingkey-ensure-job.yaml +# services/comms/oneoffs/synapse-signingkey-ensure-job.yaml +# One-off job for comms/othrys-synapse-signingkey-ensure-7. +# Purpose: othrys synapse signingkey ensure 7 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: othrys-synapse-signingkey-ensure-7 namespace: comms spec: + suspend: true backoffLimit: 2 template: spec: diff --git a/services/comms/synapse-user-seed-job.yaml b/services/comms/oneoffs/synapse-user-seed-job.yaml similarity index 95% rename from services/comms/synapse-user-seed-job.yaml rename to services/comms/oneoffs/synapse-user-seed-job.yaml index 7fef796..a732739 100644 --- a/services/comms/synapse-user-seed-job.yaml +++ b/services/comms/oneoffs/synapse-user-seed-job.yaml @@ -1,10 +1,15 @@ -# services/comms/synapse-user-seed-job.yaml +# services/comms/oneoffs/synapse-user-seed-job.yaml +# One-off job for comms/synapse-user-seed-8. +# Purpose: synapse user seed 8 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
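The synapse-admin-ensure script above stores `username`, `password`, and `access_token` under kv/data/atlas/comms/synapse-admin. A consumer-side sketch (stdlib only) that reads the token back and checks it against the standard Matrix whoami endpoint; the `VAULT_TOKEN` environment variable stands in for the Kubernetes-auth login the job performs, and the default URLs mirror the job's env values:

```python
# Hypothetical consumer sketch (e.g. for a bot that needs the admin token): read the
# credentials that synapse-admin-ensure stores in Vault and verify the access token.
# Assumes VAULT_ADDR/VAULT_TOKEN in the environment and the in-cluster Synapse URL.
import json
import os
import urllib.request

VAULT_ADDR = os.environ.get("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200").rstrip("/")
SYNAPSE_URL = os.environ.get("SYNAPSE_ADMIN_URL", "http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008")


def _get_json(url: str, headers: dict[str, str]) -> dict:
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read().decode("utf-8"))


def admin_token() -> str:
    # KV v2 read: payload nests the secret under data.data.
    data = _get_json(
        f"{VAULT_ADDR}/v1/kv/data/atlas/comms/synapse-admin",
        {"X-Vault-Token": os.environ["VAULT_TOKEN"]},
    )
    return data["data"]["data"]["access_token"]


def whoami(token: str) -> str:
    # Standard Matrix client endpoint; returns the user_id the token belongs to.
    data = _get_json(
        f"{SYNAPSE_URL}/_matrix/client/v3/account/whoami",
        {"Authorization": f"Bearer {token}"},
    )
    return data.get("user_id", "")


if __name__ == "__main__":
    print(whoami(admin_token()))
```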
apiVersion: batch/v1 kind: Job metadata: - name: synapse-user-seed-7 + name: synapse-user-seed-8 namespace: comms spec: + suspend: true backoffLimit: 1 ttlSecondsAfterFinished: 3600 template: diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index e8bd1a8..be256c0 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -3,7 +3,9 @@ import json import os import re import ssl +import threading import time +from http.server import BaseHTTPRequestHandler, HTTPServer from typing import Any from urllib import error, parse, request @@ -14,17 +16,31 @@ PASSWORD = os.environ["BOT_PASS"] ROOM_ALIAS = "#othrys:live.bstein.dev" OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/") -MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b-instruct-q4_0") +MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5:14b-instruct") +MODEL_FAST = os.environ.get("ATLASBOT_MODEL_FAST", "") +MODEL_DEEP = os.environ.get("ATLASBOT_MODEL_DEEP", "") +FALLBACK_MODEL = os.environ.get("OLLAMA_FALLBACK_MODEL", "") API_KEY = os.environ.get("CHAT_API_KEY", "") +OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480")) +ATLASBOT_HTTP_PORT = int(os.environ.get("ATLASBOT_HTTP_PORT", "8090")) +ATLASBOT_INTERNAL_TOKEN = os.environ.get("ATLASBOT_INTERNAL_TOKEN") or os.environ.get("CHAT_API_HOMEPAGE", "") +SNAPSHOT_TTL_SEC = int(os.environ.get("ATLASBOT_SNAPSHOT_TTL_SEC", "30")) KB_DIR = os.environ.get("KB_DIR", "") VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428") +ARIADNE_STATE_URL = os.environ.get("ARIADNE_STATE_URL", "") +ARIADNE_STATE_TOKEN = os.environ.get("ARIADNE_STATE_TOKEN", "") BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{USER},atlas") SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev") MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500")) MAX_TOOL_CHARS = int(os.environ.get("ATLASBOT_MAX_TOOL_CHARS", "2500")) +MAX_FACTS_CHARS = int(os.environ.get("ATLASBOT_MAX_FACTS_CHARS", "8000")) +MAX_CONTEXT_CHARS = int(os.environ.get("ATLASBOT_MAX_CONTEXT_CHARS", "12000")) +THINKING_INTERVAL_SEC = int(os.environ.get("ATLASBOT_THINKING_INTERVAL_SEC", "120")) +OLLAMA_RETRIES = int(os.environ.get("ATLASBOT_OLLAMA_RETRIES", "2")) +OLLAMA_SERIALIZE = os.environ.get("ATLASBOT_OLLAMA_SERIALIZE", "true").lower() != "false" TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9_.-]{1,}", re.IGNORECASE) HOST_RE = re.compile(r"(?i)([a-z0-9-]+(?:\\.[a-z0-9-]+)+)") @@ -52,11 +68,34 @@ STOPWORDS = { "help", "atlas", "othrys", + "system", + "systems", + "service", + "services", + "app", + "apps", + "platform", + "software", + "tool", + "tools", } METRIC_HINT_WORDS = { + "bandwidth", + "connections", + "cpu", + "database", + "db", + "disk", "health", + "memory", + "network", + "node", + "nodes", + "postgres", "status", + "storage", + "usage", "down", "slow", "error", @@ -69,11 +108,221 @@ METRIC_HINT_WORDS = { "pending", "unreachable", "latency", + "pod", + "pods", } +CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL) +TITAN_NODE_RE = re.compile(r"\btitan-[0-9a-z]{2}\b", re.IGNORECASE) +TITAN_RANGE_RE = re.compile(r"\btitan-([0-9a-z]{2})/([0-9a-z]{2})\b", re.IGNORECASE) +_DASH_CHARS = "\u2010\u2011\u2012\u2013\u2014\u2015\u2212\uFE63\uFF0D" +CONFIDENCE_RE = re.compile(r"confidence\s*:\s*(high|medium|low)\b", re.IGNORECASE) + +OPERATION_HINTS = { + "count": ("how many", "count", "number", "total"), + "list": ("list", "which", "what are", 
"show", "names"), + "top": ("top", "hottest", "highest", "most", "largest", "max", "maximum", "busiest", "busy"), + "bottom": ("lowest", "least", "minimum", "min", "smallest"), + "status": ("ready", "not ready", "unready", "down", "missing", "status"), +} + +METRIC_HINTS = { + "cpu": ("cpu",), + "ram": ("ram", "memory", "mem"), + "net": ("net", "network", "bandwidth", "throughput"), + "io": ("io", "disk", "storage"), + "connections": ("connections", "conn", "postgres", "database", "db"), + "pods": ("pods", "pod"), +} + +CLUSTER_HINT_WORDS = { + "atlas", + "titan", + "cluster", + "k8s", + "kubernetes", + "health", + "node", + "nodes", + "hardware", + "architecture", + "worker", + "workers", + "pod", + "pods", + "namespace", + "service", + "deployment", + "daemonset", + "statefulset", + "snapshot", + "anomaly", + "anomalies", + "monitor", + "monitoring", + "runbook", + "runbooks", + "documentation", + "docs", + "playbook", + "utilization", + "usage", + "grafana", + "victoria", + "prometheus", + "ariadne", + "mailu", + "nextcloud", + "vaultwarden", + "firefly", + "wger", + "jellyfin", + "planka", + "budget", + "element", + "synapse", + "mas", + "comms", + "longhorn", + "harbor", + "jenkins", + "gitea", + "flux", + "keycloak", + "postgres", + "database", + "db", + "atlasbot", + "jetson", + "rpi", + "raspberry", + "amd64", + "arm64", +} + +_INSIGHT_HINT_WORDS = { + "interesting", + "unconventional", + "surprising", + "weird", + "odd", + "unusual", + "outlier", + "fun", + "cool", + "unique", + "notable", + "coolest", + "risk", + "risky", + "favorite", + "favourite", + "trivia", + "anomaly", + "anomalies", + "monitor", + "monitoring", + "alert", + "alerts", + "stand out", + "stands out", +} + +_OVERVIEW_HINT_WORDS = { + "overview", + "summary", + "describe", + "explain", + "tell me about", + "what do you know", + "health", +} + +_OLLAMA_LOCK = threading.Lock() + +HARDWARE_HINTS = { + "amd64": ("amd64", "x86", "x86_64", "x86-64"), + "jetson": ("jetson",), + "rpi4": ("rpi4", "raspberry pi 4", "raspberry pi-4"), + "rpi5": ("rpi5", "raspberry pi 5", "raspberry pi-5"), + "rpi": ("rpi", "raspberry"), + "arm64": ("arm64", "aarch64"), +} + +def normalize_query(text: str) -> str: + cleaned = (text or "").lower() + for ch in _DASH_CHARS: + cleaned = cleaned.replace(ch, "-") + cleaned = cleaned.replace("_", " ") + cleaned = re.sub(r"\s+", " ", cleaned).strip() + return cleaned + def _tokens(text: str) -> list[str]: - toks = [t.lower() for t in TOKEN_RE.findall(text or "")] - return [t for t in toks if t not in STOPWORDS and len(t) >= 2] + cleaned = re.sub(r"[\\_/]", " ", text or "") + toks = [t.lower() for t in TOKEN_RE.findall(cleaned)] + expanded: list[str] = [] + synonyms = { + "network": "net", + "net": "network", + "memory": "ram", + "ram": "memory", + "i/o": "io", + } + for token in toks: + expanded.append(token) + if "-" in token: + expanded.extend(part for part in token.split("-") if part) + for token in list(expanded): + if token in synonyms: + expanded.append(synonyms[token]) + if token.endswith("s") and len(token) > 3: + expanded.append(token.rstrip("s")) + return [t for t in expanded if t not in STOPWORDS and len(t) >= 2] + + +def _ensure_confidence(text: str) -> str: + if not text: + return "" + lines = text.strip().splitlines() + for idx, line in enumerate(lines): + match = CONFIDENCE_RE.search(line) + if match: + level = match.group(1).lower() + lines[idx] = CONFIDENCE_RE.sub(f"Confidence: {level}", line) + return "\n".join(lines) + lines.append("Confidence: medium") + return "\n".join(lines) 
+ + +def _ollama_endpoint() -> str: + url = (OLLAMA_URL or "").strip() + if not url: + return "" + if url.endswith("/api/chat"): + return url + return url.rstrip("/") + "/api/chat" + + +def _history_to_messages(lines: list[str]) -> list[dict[str, str]]: + messages: list[dict[str, str]] = [] + for line in lines: + raw = (line or "").strip() + if not raw: + continue + role = "user" + content = raw + lowered = raw.lower() + if lowered.startswith("atlas:"): + role = "assistant" + content = raw.split(":", 1)[1].strip() + elif lowered.startswith("user:"): + role = "user" + content = raw.split(":", 1)[1].strip() + elif ":" in raw: + content = raw.split(":", 1)[1].strip() + if content: + messages.append({"role": role, "content": content}) + return messages # Mention detection (Matrix rich mentions + plain @atlas). @@ -97,15 +346,60 @@ def normalize_user_id(token: str) -> str: MENTION_USER_IDS = {normalize_user_id(t).lower() for t in MENTION_TOKENS if normalize_user_id(t)} +def _body_mentions_token(body: str) -> bool: + lower = (body or "").strip().lower() + if not lower: + return False + for token in MENTION_LOCALPARTS: + for prefix in (token, f"@{token}"): + if lower.startswith(prefix + ":") or lower.startswith(prefix + ",") or lower.startswith(prefix + " "): + return True + return False + def is_mentioned(content: dict, body: str) -> bool: if MENTION_RE.search(body or "") is not None: return True + if _body_mentions_token(body or ""): + return True mentions = content.get("m.mentions", {}) user_ids = mentions.get("user_ids", []) if not isinstance(user_ids, list): return False return any(isinstance(uid, str) and uid.lower() in MENTION_USER_IDS for uid in user_ids) +def _strip_bot_mention(text: str) -> str: + if not text: + return "" + if not MENTION_LOCALPARTS: + return text.strip() + names = [re.escape(name) for name in MENTION_LOCALPARTS if name] + if not names: + return text.strip() + pattern = r"^(?:\s*@?(?:" + "|".join(names) + r")(?::)?\s+)+" + cleaned = re.sub(pattern, "", text, flags=re.IGNORECASE).strip() + return cleaned or text.strip() + + +def _detect_mode_from_body(body: str, *, default: str = "deep") -> str: + lower = normalize_query(body or "") + if "atlas_quick" in lower or "atlas-quick" in lower: + return "fast" + if "atlas_smart" in lower or "atlas-smart" in lower: + return "deep" + if lower.startswith("quick ") or lower.startswith("fast "): + return "fast" + if lower.startswith("smart ") or lower.startswith("deep "): + return "deep" + return default + + +def _model_for_mode(mode: str) -> str: + if mode == "fast" and MODEL_FAST: + return MODEL_FAST + if mode == "deep" and MODEL_DEEP: + return MODEL_DEEP + return MODEL + # Matrix HTTP helper. 
def req(method: str, path: str, token: str | None = None, body=None, timeout=60, base: str | None = None): @@ -149,6 +443,8 @@ def send_msg(token: str, room: str, text: str): KB = {"catalog": {}, "runbooks": []} _HOST_INDEX: dict[str, list[dict]] = {} _NAME_INDEX: set[str] = set() +_METRIC_INDEX: list[dict[str, Any]] = [] +NODE_REGEX = re.compile(r'node=~"([^"]+)"') def _load_json_file(path: str) -> Any | None: try: @@ -158,11 +454,12 @@ def _load_json_file(path: str) -> Any | None: return None def load_kb(): - global KB, _HOST_INDEX, _NAME_INDEX + global KB, _HOST_INDEX, _NAME_INDEX, _METRIC_INDEX if not KB_DIR: return catalog = _load_json_file(os.path.join(KB_DIR, "catalog", "atlas.json")) or {} runbooks = _load_json_file(os.path.join(KB_DIR, "catalog", "runbooks.json")) or [] + metrics = _load_json_file(os.path.join(KB_DIR, "catalog", "metrics.json")) or [] KB = {"catalog": catalog, "runbooks": runbooks} host_index: dict[str, list[dict]] = collections.defaultdict(list) @@ -180,15 +477,16 @@ def load_kb(): if isinstance(w, dict) and w.get("name"): names.add(str(w["name"]).lower()) _NAME_INDEX = names + _METRIC_INDEX = metrics if isinstance(metrics, list) else [] -def kb_retrieve(query: str, *, limit: int = 3) -> str: +def _score_kb_docs(query: str) -> list[dict[str, Any]]: q = (query or "").strip() if not q or not KB.get("runbooks"): - return "" + return [] ql = q.lower() q_tokens = _tokens(q) if not q_tokens: - return "" + return [] scored: list[tuple[int, dict]] = [] for doc in KB.get("runbooks", []): @@ -208,9 +506,16 @@ def kb_retrieve(query: str, *, limit: int = 3) -> str: score += 4 if score: scored.append((score, doc)) - scored.sort(key=lambda x: x[0], reverse=True) - picked = [d for _, d in scored[:limit]] + return [d for _, d in scored] + + +def kb_retrieve(query: str, *, limit: int = 3) -> str: + q = (query or "").strip() + if not q: + return "" + scored = _score_kb_docs(q) + picked = scored[:limit] if not picked: return "" @@ -228,6 +533,1684 @@ def kb_retrieve(query: str, *, limit: int = 3) -> str: used += len(chunk) return "\n".join(parts).strip() + +def kb_retrieve_titles(query: str, *, limit: int = 4) -> str: + scored = _score_kb_docs(query) + picked = scored[:limit] + if not picked: + return "" + parts = ["Relevant runbooks:"] + for doc in picked: + title = doc.get("title") or doc.get("path") or "runbook" + path = doc.get("path") or "" + if path: + parts.append(f"- {title} ({path})") + else: + parts.append(f"- {title}") + return "\n".join(parts) + +def _extract_titan_nodes(text: str) -> list[str]: + cleaned = normalize_query(text) + names = {n.lower() for n in TITAN_NODE_RE.findall(cleaned) if n} + for match in re.finditer(r"titan-([0-9a-z]{2}(?:[/,][0-9a-z]{2})+)", cleaned, re.IGNORECASE): + tail = match.group(1) + for part in re.split(r"[/,]", tail): + part = part.strip() + if part: + names.add(f"titan-{part.lower()}") + for match in TITAN_RANGE_RE.finditer(cleaned): + left, right = match.groups() + if left: + names.add(f"titan-{left.lower()}") + if right: + names.add(f"titan-{right.lower()}") + return sorted(names) + +def _humanize_rate(value: str, *, unit: str) -> str: + try: + val = float(value) + except (TypeError, ValueError): + return value + if unit == "%": + return f"{val:.1f}%" + if val >= 1024 * 1024: + return f"{val / (1024 * 1024):.2f} MB/s" + if val >= 1024: + return f"{val / 1024:.2f} KB/s" + return f"{val:.2f} B/s" + +def _has_any(text: str, phrases: tuple[str, ...]) -> bool: + for phrase in phrases: + if " " in phrase: + if phrase in text: + return True 
+ else: + if re.search(rf"\b{re.escape(phrase)}\b", text): + return True + return False + +def _detect_operation(q: str) -> str | None: + if _has_any(q, OPERATION_HINTS["top"]): + return "top" + if _has_any(q, OPERATION_HINTS["bottom"]): + return "bottom" + for op, phrases in OPERATION_HINTS.items(): + if op in ("top", "bottom"): + continue + if _has_any(q, phrases): + return op + return None + +def _detect_metric(q: str) -> str | None: + q = normalize_query(q) + if _has_any(q, ("disk", "storage")): + return "io" + if _has_any(q, ("io",)) and not _has_any(q, METRIC_HINTS["net"]): + return "io" + for metric, phrases in METRIC_HINTS.items(): + if _has_any(q, phrases): + return metric + tokens = set(_tokens(q)) + expanded: set[str] = set(tokens) + for token in list(tokens): + for part in re.split(r"[-_]", token): + part = part.strip() + if len(part) >= 2: + expanded.add(part) + if part.endswith("s") and len(part) >= 4: + expanded.add(part[:-1]) + tokens = expanded + for metric, phrases in METRIC_HINTS.items(): + for phrase in phrases: + if " " in phrase: + if phrase in q: + return metric + elif phrase in tokens: + return metric + return None + +def _detect_hardware_filters(q: str) -> tuple[set[str], set[str]]: + include: set[str] = set() + exclude: set[str] = set() + if any(term in q for term in ("gpu", "gpus", "accelerator", "accelerators", "cuda", "nvidia")): + include.add("jetson") + rpi_specific = any( + phrase in q + for phrase in ( + "rpi4", + "rpi5", + "raspberry pi 4", + "raspberry pi 5", + "raspberry pi-4", + "raspberry pi-5", + ) + ) + for hardware, phrases in HARDWARE_HINTS.items(): + if hardware == "rpi" and rpi_specific: + continue + for phrase in phrases: + if f"non {phrase}" in q or f"non-{phrase}" in q or f"not {phrase}" in q: + exclude.add(hardware) + elif phrase in q: + include.add(hardware) + return include, exclude + + +def _detect_role_filters(q: str) -> set[str]: + roles: set[str] = set() + if "control-plane" in q or "control plane" in q: + roles.add("control-plane") + if "master" in q: + roles.add("master") + if "accelerator" in q: + roles.add("accelerator") + return roles + +def _detect_entity(q: str) -> str | None: + if ( + "node" in q + or "nodes" in q + or "worker" in q + or "hardware" in q + or "architecture" in q + or "machine" in q + or "machines" in q + or "host" in q + or "hosts" in q + or "hostname" in q + or "hostnames" in q + or TITAN_NODE_RE.search(q) + ): + return "node" + if "pod" in q or "pods" in q: + return "pod" + if "namespace" in q or "namespaces" in q: + return "namespace" + return None + +def _metric_entry_score(entry: dict[str, Any], tokens: list[str], *, metric: str | None, op: str | None) -> int: + hay = _metric_tokens(entry) + score = 0 + for t in set(tokens): + if t in hay: + score += 2 if t in (entry.get("panel_title") or "").lower() else 1 + if metric: + for phrase in METRIC_HINTS.get(metric, (metric,)): + if phrase in hay: + score += 3 + if op == "top" and ("hottest" in hay or "top" in hay): + score += 3 + if "node" in hay: + score += 1 + return score + +def _select_metric_entry(tokens: list[str], *, metric: str | None, op: str | None) -> dict[str, Any] | None: + scored: list[tuple[int, dict[str, Any]]] = [] + for entry in _METRIC_INDEX: + if not isinstance(entry, dict): + continue + score = _metric_entry_score(entry, tokens, metric=metric, op=op) + if score: + scored.append((score, entry)) + if not scored: + return None + scored.sort(key=lambda item: item[0], reverse=True) + return scored[0][1] + +def _apply_node_filter(expr: str, 
node_regex: str | None) -> str: + if not node_regex: + return expr + needle = 'node_uname_info{nodename!=""}' + replacement = f'node_uname_info{{nodename!=\"\",nodename=~\"{node_regex}\"}}' + return expr.replace(needle, replacement) + +def _metric_expr_uses_percent(entry: dict[str, Any]) -> bool: + exprs = entry.get("exprs") + expr = exprs[0] if isinstance(exprs, list) and exprs else "" + return "* 100" in expr or "*100" in expr + + +def _format_metric_value(value: str, *, percent: bool, rate: bool = False) -> str: + try: + num = float(value) + except (TypeError, ValueError): + return value + if percent: + return f"{num:.1f}%" + if rate: + return _humanize_rate(value, unit="rate") + if abs(num) >= 1: + return f"{num:.2f}".rstrip("0").rstrip(".") + return f"{num:.4f}".rstrip("0").rstrip(".") + + +def _format_metric_label(metric: dict[str, Any]) -> str: + label_parts = [] + for k in ("namespace", "pod", "container", "node", "instance", "job", "phase"): + if metric.get(k): + label_parts.append(f"{k}={metric.get(k)}") + if not label_parts: + for k in sorted(metric.keys()): + if k.startswith("__"): + continue + label_parts.append(f"{k}={metric.get(k)}") + if len(label_parts) >= 4: + break + return ", ".join(label_parts) if label_parts else "series" + + +def _primary_series_metric(res: dict | None) -> tuple[str | None, str | None]: + series = _vm_value_series(res or {}) + if not series: + return (None, None) + first = series[0] + metric = first.get("metric") if isinstance(first, dict) else {} + value = first.get("value") if isinstance(first, dict) else [] + node = metric.get("node") if isinstance(metric, dict) else None + val = value[1] if isinstance(value, list) and len(value) > 1 else None + return (node, val) + + +def _format_metric_answer(entry: dict[str, Any], res: dict | None) -> str: + series = _vm_value_series(res) + panel = entry.get("panel_title") or "Metric" + if not series: + return "" + percent = _metric_expr_uses_percent(entry) + lines: list[str] = [] + for r in series[:5]: + if not isinstance(r, dict): + continue + metric = r.get("metric") or {} + value = r.get("value") or [] + val = value[1] if isinstance(value, list) and len(value) > 1 else "" + label = _format_metric_label(metric if isinstance(metric, dict) else {}) + lines.append(f"{label}: {_format_metric_value(val, percent=percent)}") + if not lines: + return "" + if len(lines) == 1: + return f"{panel}: {lines[0]}." 
+ return f"{panel}:\n" + "\n".join(f"- {line}" for line in lines) + +def _inventory_filter( + inventory: list[dict[str, Any]], + *, + include_hw: set[str], + exclude_hw: set[str], + only_workers: bool, + only_ready: bool | None, + nodes_in_query: list[str], +) -> list[dict[str, Any]]: + results = inventory + if nodes_in_query: + results = [node for node in results if node.get("name") in nodes_in_query] + if only_workers: + results = [node for node in results if node.get("is_worker") is True] + if only_ready is True: + results = [node for node in results if node.get("ready") is True] + if only_ready is False: + results = [node for node in results if node.get("ready") is False] + if include_hw: + results = [node for node in results if _hardware_match(node, include_hw)] + if exclude_hw: + results = [node for node in results if not _hardware_match(node, exclude_hw)] + return results + +def _hardware_match(node: dict[str, Any], filters: set[str]) -> bool: + hw = node.get("hardware") or "" + arch = node.get("arch") or "" + for f in filters: + if f == "rpi" and hw in ("rpi4", "rpi5", "rpi"): + return True + if f == "arm64" and arch == "arm64": + return True + if hw == f: + return True + if f == "amd64" and arch == "amd64": + return True + return False + +def _node_roles(labels: dict[str, Any]) -> list[str]: + roles: list[str] = [] + for key in labels.keys(): + if key.startswith("node-role.kubernetes.io/"): + role = key.split("/", 1)[-1] + if role: + roles.append(role) + return sorted(set(roles)) + +def _hardware_class(labels: dict[str, Any]) -> str: + if str(labels.get("jetson") or "").lower() == "true": + return "jetson" + hardware = (labels.get("hardware") or "").strip().lower() + if hardware in ("rpi4", "rpi5", "rpi"): + return hardware + arch = labels.get("kubernetes.io/arch") or labels.get("beta.kubernetes.io/arch") or "" + if arch == "amd64": + return "amd64" + if arch == "arm64": + return "arm64-unknown" + return "unknown" + +def node_inventory_live() -> list[dict[str, Any]]: + try: + data = k8s_get("/api/v1/nodes?limit=500") + except Exception: + return [] + items = data.get("items") or [] + inventory: list[dict[str, Any]] = [] + for node in items if isinstance(items, list) else []: + meta = node.get("metadata") or {} + labels = meta.get("labels") or {} + name = meta.get("name") or "" + if not name: + continue + inventory.append( + { + "name": name, + "arch": labels.get("kubernetes.io/arch") or labels.get("beta.kubernetes.io/arch") or "", + "hardware": _hardware_class(labels), + "roles": _node_roles(labels), + "is_worker": _node_is_worker(node), + "ready": _node_ready_status(node), + } + ) + return sorted(inventory, key=lambda item: item["name"]) + + +def node_inventory() -> list[dict[str, Any]]: + snapshot = _snapshot_state() + inventory = _snapshot_inventory(snapshot) + if inventory: + return inventory + return node_inventory_live() + +def _group_nodes(inventory: list[dict[str, Any]]) -> dict[str, list[str]]: + grouped: dict[str, list[str]] = collections.defaultdict(list) + for node in inventory: + grouped[node.get("hardware") or "unknown"].append(node["name"]) + return {k: sorted(v) for k, v in grouped.items()} + +def node_inventory_context(query: str, inventory: list[dict[str, Any]] | None = None) -> str: + q = normalize_query(query) + if not any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster")): + return "" + if inventory is None: + inventory = node_inventory() + if not inventory: + return "" + groups = _group_nodes(inventory) + 
total = len(inventory) + ready = sum(1 for node in inventory if node.get("ready") is True) + not_ready = sum(1 for node in inventory if node.get("ready") is False) + lines: list[str] = [ + "Node inventory (live):", + f"- total: {total}, ready: {ready}, not ready: {not_ready}", + ] + for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"): + if key in groups: + lines.append(f"- {key}: {', '.join(groups[key])}") + non_rpi = sorted(set(groups.get("jetson", [])) | set(groups.get("amd64", []))) + if non_rpi: + lines.append(f"- non_raspberry_pi (derived): {', '.join(non_rpi)}") + unknowns = groups.get("arm64-unknown", []) + groups.get("unknown", []) + if unknowns: + lines.append("- note: nodes labeled arm64-unknown/unknown may still be Raspberry Pi unless tagged.") + expected_workers = expected_worker_nodes_from_metrics() + if expected_workers: + ready_workers, not_ready_workers = worker_nodes_status() + missing = sorted(set(expected_workers) - set(ready_workers + not_ready_workers)) + lines.append(f"- expected_workers (grafana): {', '.join(expected_workers)}") + lines.append(f"- workers_ready: {', '.join(ready_workers)}") + if not_ready_workers: + lines.append(f"- workers_not_ready: {', '.join(not_ready_workers)}") + if missing: + lines.append(f"- workers_missing (derived): {', '.join(missing)}") + return "\n".join(lines) + +def node_inventory_for_prompt(prompt: str) -> list[dict[str, Any]]: + q = normalize_query(prompt) + if any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster", "worker")): + return node_inventory() + return [] + +def _nodes_by_arch(inventory: list[dict[str, Any]]) -> dict[str, list[str]]: + grouped: dict[str, list[str]] = collections.defaultdict(list) + for node in inventory: + grouped[(node.get("arch") or "unknown")].append(node["name"]) + return {k: sorted(v) for k, v in grouped.items()} + +def _node_usage_table(metrics: dict[str, Any], *, allowed_nodes: set[str] | None = None) -> list[dict[str, Any]]: + usage = metrics.get("node_usage") if isinstance(metrics.get("node_usage"), dict) else {} + per_node: dict[str, dict[str, Any]] = {} + for metric_name, entries in usage.items() if isinstance(usage, dict) else []: + if not isinstance(entries, list): + continue + for entry in entries: + if not isinstance(entry, dict): + continue + node = entry.get("node") + if not isinstance(node, str) or not node: + continue + if allowed_nodes and node not in allowed_nodes: + continue + per_node.setdefault(node, {})[metric_name] = entry.get("value") + return [{"node": node, **vals} for node, vals in sorted(per_node.items())] + +def _usage_extremes(usage_table: list[dict[str, Any]]) -> dict[str, tuple[str, float]]: + extremes: dict[str, tuple[str, float]] = {} + for metric in ("cpu", "ram", "net", "io"): + values: list[tuple[str, float]] = [] + for entry in usage_table: + node = entry.get("node") + raw = entry.get(metric) + if not node or raw is None: + continue + try: + value = float(raw) + except (TypeError, ValueError): + continue + values.append((node, value)) + if not values: + continue + lowest = min(values, key=lambda item: item[1]) + highest = max(values, key=lambda item: item[1]) + extremes[f"min_{metric}"] = lowest + extremes[f"max_{metric}"] = highest + return extremes + +def _workloads_for_facts(workloads: list[dict[str, Any]], limit: int = 25) -> list[dict[str, Any]]: + cleaned: list[dict[str, Any]] = [] + for entry in workloads: + if not isinstance(entry, dict): + continue + cleaned.append( + { + "namespace": 
entry.get("namespace"), + "workload": entry.get("workload"), + "pods_total": entry.get("pods_total"), + "pods_running": entry.get("pods_running"), + "primary_node": entry.get("primary_node"), + "nodes": entry.get("nodes"), + } + ) + cleaned.sort( + key=lambda item: ( + -(item.get("pods_total") or 0), + str(item.get("namespace") or ""), + str(item.get("workload") or ""), + ) + ) + return cleaned[:limit] + +def _workloads_for_prompt(prompt: str, workloads: list[dict[str, Any]], limit: int = 12) -> list[dict[str, Any]]: + tokens = set(_tokens(prompt)) + if tokens: + matched: list[dict[str, Any]] = [] + for entry in workloads: + if not isinstance(entry, dict): + continue + entry_tokens = _workload_tokens(entry) + if entry_tokens & tokens: + matched.append(entry) + if matched: + return _workloads_for_facts(matched, limit=limit) + return _workloads_for_facts(workloads, limit=limit) + +def facts_context( + prompt: str, + *, + inventory: list[dict[str, Any]] | None, + snapshot: dict[str, Any] | None, + workloads: list[dict[str, Any]] | None, +) -> str: + inv = inventory or [] + nodes_in_query = _extract_titan_nodes(prompt) + metrics = _snapshot_metrics(snapshot) + nodes = snapshot.get("nodes") if isinstance(snapshot, dict) else {} + summary = snapshot.get("nodes_summary") if isinstance(snapshot, dict) else {} + expected_workers = expected_worker_nodes_from_metrics() + ready_workers, not_ready_workers = worker_nodes_status(inv) if inv else ([], []) + total = summary.get("total") if isinstance(summary, dict) and summary.get("total") is not None else nodes.get("total") + ready = summary.get("ready") if isinstance(summary, dict) and summary.get("ready") is not None else nodes.get("ready") + not_ready = summary.get("not_ready") if isinstance(summary, dict) and summary.get("not_ready") is not None else nodes.get("not_ready") + not_ready_names = summary.get("not_ready_names") if isinstance(summary, dict) else nodes.get("not_ready_names") + by_hardware = _group_nodes(inv) if inv else {} + by_arch = _nodes_by_arch(inv) if inv else {} + control_plane_nodes = [ + node["name"] + for node in inv + if any(role in ("control-plane", "master") for role in (node.get("roles") or [])) + ] + worker_nodes = [node["name"] for node in inv if node.get("is_worker") is True] + + lines: list[str] = ["Facts (live snapshot):"] + if total is not None: + lines.append(f"- nodes_total={total}, ready={ready}, not_ready={not_ready}") + if isinstance(summary, dict): + by_arch_counts = summary.get("by_arch") + if isinstance(by_arch_counts, dict) and by_arch_counts: + parts = [f"{arch}={count}" for arch, count in sorted(by_arch_counts.items())] + lines.append(f"- nodes_by_arch: {', '.join(parts)}") + if not_ready_names: + lines.append(f"- nodes_not_ready: {', '.join(not_ready_names)}") + for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"): + nodes_list = by_hardware.get(key) or [] + if nodes_list: + lines.append(f"- {key}: {', '.join(nodes_list)}") + if by_hardware: + counts = {key: len(nodes_list) for key, nodes_list in by_hardware.items() if nodes_list} + if counts: + parts = [f"{key}={count}" for key, count in sorted(counts.items())] + lines.append(f"- nodes_by_hardware_count: {', '.join(parts)}") + non_rpi = sorted(set(by_hardware.get("jetson", [])) | set(by_hardware.get("amd64", []))) + if non_rpi: + lines.append(f"- non_raspberry_pi: {', '.join(non_rpi)}") + for key, nodes_list in sorted(by_arch.items()): + if nodes_list: + lines.append(f"- arch {key}: {', '.join(nodes_list)}") + if control_plane_nodes: + 
lines.append(f"- control_plane_nodes: {', '.join(control_plane_nodes)}") + control_plane_by_hw: dict[str, list[str]] = collections.defaultdict(list) + for node in inv: + if node.get("name") in control_plane_nodes: + control_plane_by_hw[node.get("hardware") or "unknown"].append(node["name"]) + parts = [f"{hw}={', '.join(sorted(nodes))}" for hw, nodes in sorted(control_plane_by_hw.items())] + if parts: + lines.append(f"- control_plane_by_hardware: {', '.join(parts)}") + if worker_nodes: + lines.append(f"- worker_nodes: {', '.join(worker_nodes)}") + if ready_workers or not_ready_workers: + lines.append(f"- workers_ready: {', '.join(ready_workers) if ready_workers else 'none'}") + if not_ready_workers: + lines.append(f"- workers_not_ready: {', '.join(not_ready_workers)}") + if expected_workers and any(word in normalize_query(prompt) for word in ("missing", "expected", "should", "not ready", "unready")): + missing = sorted( + set(expected_workers) + - {n.get("name") for n in inv if isinstance(n, dict) and n.get("name")} + ) + lines.append(f"- expected_workers: {', '.join(expected_workers)}") + if missing: + lines.append(f"- expected_workers_missing: {', '.join(missing)}") + + hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {} + usage_metrics = metrics.get("node_usage") if isinstance(metrics.get("node_usage"), dict) else {} + for key in ("cpu", "ram", "net", "io"): + entry = hottest.get(key) if isinstance(hottest.get(key), dict) else {} + node = entry.get("node") + value = entry.get("value") + if not node or value is None: + usage = usage_metrics.get(key) if isinstance(usage_metrics.get(key), list) else [] + pick = _node_usage_top(usage, allowed_nodes=None) + if pick: + node, value = pick + if node and value is not None: + value_fmt = _format_metric_value( + str(value), + percent=key in ("cpu", "ram"), + rate=key in ("net", "io"), + ) + lines.append(f"- hottest_{key}: {node} ({value_fmt})") + + postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {} + if isinstance(postgres, dict) and postgres: + used = postgres.get("used") + max_conn = postgres.get("max") + if used is not None and max_conn is not None: + lines.append(f"- postgres_connections: {used} used / {max_conn} max") + hottest_db = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {} + if hottest_db.get("label"): + lines.append( + f"- postgres_hottest_db: {hottest_db.get('label')} ({hottest_db.get('value')})" + ) + + for key in ("pods_running", "pods_pending", "pods_failed", "pods_succeeded"): + value = metrics.get(key) + if value is not None: + lines.append(f"- {key}: {value}") + if workloads: + ns_counts: dict[str, int] = collections.defaultdict(int) + for entry in workloads: + if not isinstance(entry, dict): + continue + ns = entry.get("namespace") or "" + pods = entry.get("pods_running") + if pods is None: + pods = entry.get("pods_total") + try: + pods_val = int(pods) + except (TypeError, ValueError): + pods_val = 0 + if ns: + ns_counts[ns] += pods_val + if ns_counts: + top_ns = sorted(ns_counts.items(), key=lambda item: item[1], reverse=True)[:5] + parts = [f"{ns}={count}" for ns, count in top_ns] + lines.append(f"- pods_by_namespace: {', '.join(parts)}") + + top_restarts = metrics.get("top_restarts_1h") if isinstance(metrics.get("top_restarts_1h"), list) else [] + if top_restarts: + items = [] + for entry in top_restarts[:5]: + if not isinstance(entry, dict): + continue + metric = entry.get("metric") or 
{} + pod = metric.get("pod") or metric.get("name") or "" + ns = metric.get("namespace") or "" + value = entry.get("value") + label = f"{ns}/{pod}".strip("/") + if label and value is not None: + items.append(f"{label}={value}") + if items: + lines.append(f"- top_restarts_1h: {', '.join(items)}") + + allowed_nodes = {node.get("name") for node in inv if isinstance(node, dict) and node.get("name")} + usage_table = _node_usage_table(metrics, allowed_nodes=allowed_nodes or None) + if usage_table: + lines.append("- node_usage (cpu/ram/net/io):") + for entry in usage_table: + node = entry.get("node") + if not node: + continue + cpu = _format_metric_value(str(entry.get("cpu")), percent=True) if entry.get("cpu") is not None else "" + ram = _format_metric_value(str(entry.get("ram")), percent=True) if entry.get("ram") is not None else "" + net = ( + _format_metric_value(str(entry.get("net")), percent=False, rate=True) + if entry.get("net") is not None + else "" + ) + io_val = ( + _format_metric_value(str(entry.get("io")), percent=False, rate=True) + if entry.get("io") is not None + else "" + ) + lines.append(f" - {node}: cpu={cpu}, ram={ram}, net={net}, io={io_val}") + extremes = _usage_extremes(usage_table) + for metric in ("cpu", "ram", "net", "io"): + min_key = f"min_{metric}" + if min_key not in extremes: + continue + node, value = extremes[min_key] + value_fmt = _format_metric_value( + str(value), + percent=metric in ("cpu", "ram"), + rate=metric in ("net", "io"), + ) + lines.append(f"- lowest_{metric}: {node} ({value_fmt})") + for metric in ("cpu", "ram"): + hottest_parts: list[str] = [] + lowest_parts: list[str] = [] + for hw, nodes_list in sorted(by_hardware.items()): + entries = [] + for entry in usage_table: + node = entry.get("node") + if node in nodes_list and entry.get(metric) is not None: + try: + value = float(entry.get(metric)) + except (TypeError, ValueError): + continue + entries.append((node, value)) + if not entries: + continue + max_node, max_val = max(entries, key=lambda item: item[1]) + min_node, min_val = min(entries, key=lambda item: item[1]) + hottest_parts.append( + f"{hw}={max_node} ({_format_metric_value(str(max_val), percent=True)})" + ) + lowest_parts.append( + f"{hw}={min_node} ({_format_metric_value(str(min_val), percent=True)})" + ) + if hottest_parts: + lines.append(f"- hottest_{metric}_by_hardware: {', '.join(hottest_parts)}") + if lowest_parts: + lines.append(f"- lowest_{metric}_by_hardware: {', '.join(lowest_parts)}") + + if nodes_in_query: + lines.append("- node_details:") + for name in nodes_in_query: + detail = next((n for n in inv if n.get("name") == name), None) + if not detail: + lines.append(f" - {name}: not found in snapshot") + continue + roles = ",".join(detail.get("roles") or []) or "none" + lines.append( + f" - {name}: hardware={detail.get('hardware')}, arch={detail.get('arch')}, " + f"ready={detail.get('ready')}, roles={roles}" + ) + + workload_entries = _workloads_for_prompt(prompt, workloads or []) + if workload_entries: + lines.append("- workloads:") + for entry in workload_entries: + if not isinstance(entry, dict): + continue + ns = entry.get("namespace") or "" + wl = entry.get("workload") or "" + primary = entry.get("primary_node") or "" + pods_total = entry.get("pods_total") + pods_running = entry.get("pods_running") + label = f"{ns}/{wl}" if ns and wl else (wl or ns) + if not label: + continue + if primary: + lines.append( + f" - {label}: primary_node={primary}, pods_total={pods_total}, pods_running={pods_running}" + ) + else: + lines.append(f" 
- {label}: pods_total={pods_total}, pods_running={pods_running}") + top = max( + (entry for entry in workload_entries if isinstance(entry.get("pods_total"), (int, float))), + key=lambda item: item.get("pods_total", 0), + default=None, + ) + if isinstance(top, dict) and top.get("pods_total") is not None: + label = f"{top.get('namespace')}/{top.get('workload')}".strip("/") + lines.append(f"- workload_most_pods: {label} ({top.get('pods_total')})") + zero_running = [ + entry + for entry in workload_entries + if isinstance(entry.get("pods_running"), (int, float)) and entry.get("pods_running") == 0 + ] + if zero_running: + labels = [] + for entry in zero_running: + label = f"{entry.get('namespace')}/{entry.get('workload')}".strip("/") + if label: + labels.append(label) + if labels: + lines.append(f"- workloads_zero_running: {', '.join(labels)}") + + rendered = "\n".join(lines) + return rendered[:MAX_FACTS_CHARS] + +def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]: + names = [node["name"] for node in inventory] + ready = [node["name"] for node in inventory if node.get("ready") is True] + not_ready = [node["name"] for node in inventory if node.get("ready") is False] + groups = _group_nodes(inventory) + workers = [node for node in inventory if node.get("is_worker") is True] + worker_names = [node["name"] for node in workers] + worker_ready = [node["name"] for node in workers if node.get("ready") is True] + worker_not_ready = [node["name"] for node in workers if node.get("ready") is False] + expected_workers = expected_worker_nodes_from_metrics() + expected_ready = [n for n in expected_workers if n in ready] if expected_workers else [] + expected_not_ready = [n for n in expected_workers if n in not_ready] if expected_workers else [] + expected_missing = [n for n in expected_workers if n not in names] if expected_workers else [] + return { + "names": sorted(names), + "ready": sorted(ready), + "not_ready": sorted(not_ready), + "groups": groups, + "worker_names": sorted(worker_names), + "worker_ready": sorted(worker_ready), + "worker_not_ready": sorted(worker_not_ready), + "expected_workers": expected_workers, + "expected_ready": sorted(expected_ready), + "expected_not_ready": sorted(expected_not_ready), + "expected_missing": sorted(expected_missing), + } + + +def _workload_tokens(entry: dict[str, Any]) -> set[str]: + tokens: set[str] = set() + for key in ("workload", "namespace"): + value = entry.get(key) + if isinstance(value, str) and value: + tokens.update(_tokens(value)) + return tokens + + +def _workload_query_target(prompt: str) -> str: + tokens = set(_tokens(prompt)) + matches = sorted(tokens & _NAME_INDEX) if _NAME_INDEX else [] + return matches[0] if matches else "" + + +def _select_workload(prompt: str, workloads: list[dict[str, Any]]) -> dict[str, Any] | None: + q_tokens = set(_tokens(prompt)) + if not q_tokens: + return None + scored: list[tuple[int, dict[str, Any]]] = [] + for entry in workloads: + if not isinstance(entry, dict): + continue + tokens = _workload_tokens(entry) + score = len(tokens & q_tokens) + name = (entry.get("workload") or "").lower() + namespace = (entry.get("namespace") or "").lower() + if name and name in q_tokens: + score += 5 + if namespace and namespace in q_tokens: + score += 3 + if score: + scored.append((score, entry)) + if not scored: + return None + scored.sort(key=lambda item: item[0], reverse=True) + return scored[0][1] + + +def _format_confidence(answer: str, confidence: str) -> str: + if not answer: + return "" + return 
f"{answer}\nConfidence: {confidence}." + + +def workload_answer(prompt: str, workloads: list[dict[str, Any]]) -> str: + q = normalize_query(prompt) + if not any(word in q for word in ("where", "which", "node", "run", "running", "host", "located")): + return "" + target = _workload_query_target(prompt) + entry = _select_workload(prompt, workloads) + if not entry: + return "" + workload = entry.get("workload") or "" + namespace = entry.get("namespace") or "" + if target: + workload_l = str(workload).lower() + namespace_l = str(namespace).lower() + if workload_l != target and namespace_l == target and "namespace" not in q and "workload" not in q: + return "" + nodes = entry.get("nodes") if isinstance(entry.get("nodes"), dict) else {} + primary = entry.get("primary_node") or "" + if not workload or not nodes: + return "" + parts = [] + if primary: + parts.append(f"{primary} (primary)") + for node, count in sorted(nodes.items(), key=lambda item: (-item[1], item[0])): + if node == primary: + continue + parts.append(f"{node} ({count} pod{'s' if count != 1 else ''})") + node_text = ", ".join(parts) if parts else primary + answer = f"{workload} runs in {namespace}. Nodes: {node_text}." + return _format_confidence(answer, "medium") + + +def _snapshot_metrics(snapshot: dict[str, Any] | None) -> dict[str, Any]: + if not snapshot: + return {} + metrics = snapshot.get("metrics") + return metrics if isinstance(metrics, dict) else {} + + +def _node_usage_top( + usage: list[dict[str, Any]], + *, + allowed_nodes: set[str] | None, +) -> tuple[str, float] | None: + best_node = "" + best_val = None + for item in usage if isinstance(usage, list) else []: + if not isinstance(item, dict): + continue + node = item.get("node") or "" + if allowed_nodes and node not in allowed_nodes: + continue + value = item.get("value") + try: + numeric = float(value) + except (TypeError, ValueError): + continue + if best_val is None or numeric > best_val: + best_val = numeric + best_node = node + if best_node and best_val is not None: + return best_node, best_val + return None + + +def _node_usage_bottom( + usage: list[dict[str, Any]], + *, + allowed_nodes: set[str] | None, +) -> tuple[str, float] | None: + best_node: str | None = None + best_val: float | None = None + for item in usage: + if not isinstance(item, dict): + continue + node = item.get("node") + if not node or not isinstance(node, str): + continue + if allowed_nodes and node not in allowed_nodes: + continue + value = item.get("value") + try: + numeric = float(value) + except (TypeError, ValueError): + continue + if best_val is None or numeric < best_val: + best_val = numeric + best_node = node + if best_node and best_val is not None: + return best_node, best_val + return None + + +def snapshot_metric_answer( + prompt: str, + *, + snapshot: dict[str, Any] | None, + inventory: list[dict[str, Any]], +) -> str: + if not snapshot: + return "" + metrics = _snapshot_metrics(snapshot) + if not metrics: + return "" + q = normalize_query(prompt) + metric = _detect_metric(q) + op = _detect_operation(q) + if op == "list" and metric in {"cpu", "ram", "net", "io"}: + op = "top" + include_hw, exclude_hw = _detect_hardware_filters(q) + nodes_in_query = _extract_titan_nodes(q) + only_workers = "worker" in q or "workers" in q + + filtered = _inventory_filter( + inventory, + include_hw=include_hw, + exclude_hw=exclude_hw, + only_workers=only_workers, + only_ready=None, + nodes_in_query=nodes_in_query, + ) + allowed_nodes = {node["name"] for node in filtered} if filtered else None + + if 
metric in {"cpu", "ram", "net", "io"} and op in {"top", "bottom", "status", None}: + usage = metrics.get("node_usage", {}).get(metric, []) + pick = _node_usage_bottom if op == "bottom" else _node_usage_top + chosen = pick(usage, allowed_nodes=allowed_nodes) + if chosen: + node, val = chosen + percent = metric in {"cpu", "ram"} + value = _format_metric_value(str(val), percent=percent, rate=metric in {"net", "io"}) + scope = "" + if include_hw: + scope = f" among {' and '.join(sorted(include_hw))}" + label = "Lowest" if op == "bottom" else "Hottest" + answer = f"{label} node{scope}: {node} ({value})." + if allowed_nodes and len(allowed_nodes) != len(inventory) and op != "bottom": + overall = _node_usage_top(usage, allowed_nodes=None) + if overall and overall[0] != node: + overall_val = _format_metric_value( + str(overall[1]), + percent=percent, + rate=metric in {"net", "io"}, + ) + answer += f" Overall hottest: {overall[0]} ({overall_val})." + return _format_confidence(answer, "high") + + if metric == "connections" or "postgres" in q: + postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {} + used = postgres.get("used") + max_conn = postgres.get("max") + hottest = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {} + parts: list[str] = [] + if used is not None and max_conn is not None: + free = max_conn - used + if any(word in q for word in ("free", "available", "remaining", "remain", "left")): + parts.append(f"Postgres connections: {used:.0f} used / {max_conn:.0f} max ({free:.0f} free).") + else: + parts.append(f"Postgres connections: {used:.0f} used / {max_conn:.0f} max.") + if hottest.get("label"): + hot_val = hottest.get("value") + hot_val_str = _format_metric_value(str(hot_val), percent=False) if hot_val is not None else "" + parts.append(f"Hottest DB: {hottest.get('label')} ({hot_val_str}).") + if parts: + return _format_confidence(" ".join(parts), "high") + + if metric == "pods": + running = metrics.get("pods_running") + pending = metrics.get("pods_pending") + failed = metrics.get("pods_failed") + succeeded = metrics.get("pods_succeeded") + status_terms = ("running", "pending", "failed", "succeeded", "completed") + if ("most pods" in q or ("most" in q and "pod" in q and "node" in q)) and not nodes_in_query: + return _format_confidence( + "I don't have per-node pod counts in the snapshot.", + "medium", + ) + if "total" in q or "sum" in q: + values = [v for v in (running, pending, failed, succeeded) if isinstance(v, (int, float))] + if values: + return _format_confidence(f"Total pods: {sum(values):.0f}.", "high") + if "not running" in q or "not in running" in q or "non running" in q: + parts = [v for v in (pending, failed, succeeded) if isinstance(v, (int, float))] + if parts: + return _format_confidence(f"Pods not running: {sum(parts):.0f}.", "high") + if sum(1 for term in status_terms if term in q) > 1: + parts = [] + if "running" in q and running is not None: + parts.append(f"running {running:.0f}") + if "pending" in q and pending is not None: + parts.append(f"pending {pending:.0f}") + if "failed" in q and failed is not None: + parts.append(f"failed {failed:.0f}") + if ("succeeded" in q or "completed" in q) and succeeded is not None: + parts.append(f"succeeded {succeeded:.0f}") + if parts: + return _format_confidence(f"Pods: {', '.join(parts)}.", "high") + if "pending" in q and pending is not None: + return _format_confidence(f"Pending pods: {pending:.0f}.", "high") + if "failed" in q and failed 
is not None: + return _format_confidence(f"Failed pods: {failed:.0f}.", "high") + if "succeeded" in q or "completed" in q: + if succeeded is not None: + return _format_confidence(f"Succeeded pods: {succeeded:.0f}.", "high") + if "running" in q and running is not None: + return _format_confidence(f"Running pods: {running:.0f}.", "high") + parts = [] + if running is not None: + parts.append(f"running {running:.0f}") + if pending is not None: + parts.append(f"pending {pending:.0f}") + if failed is not None: + parts.append(f"failed {failed:.0f}") + if succeeded is not None: + parts.append(f"succeeded {succeeded:.0f}") + if parts: + return _format_confidence(f"Pods: {', '.join(parts)}.", "high") + + return "" + +def structured_answer( + prompt: str, + *, + inventory: list[dict[str, Any]], + metrics_summary: str, + snapshot: dict[str, Any] | None = None, + workloads: list[dict[str, Any]] | None = None, +) -> str: + q = normalize_query(prompt) + if not q: + return "" + + if workloads: + workload_resp = workload_answer(prompt, workloads) + if workload_resp: + return workload_resp + + snap_resp = snapshot_metric_answer(prompt, snapshot=snapshot, inventory=inventory) + if snap_resp: + return snap_resp + + tokens = _tokens(q) + op = _detect_operation(q) + metric = _detect_metric(q) + if op == "list" and metric in {"cpu", "ram", "net", "io"}: + op = "top" + entity = _detect_entity(q) + include_hw, exclude_hw = _detect_hardware_filters(q) + if entity is None and (include_hw or exclude_hw): + entity = "node" + nodes_in_query = _extract_titan_nodes(q) + only_workers = "worker" in q or "workers" in q + role_filters = _detect_role_filters(q) + only_ready: bool | None = None + if ( + "not ready" in q + or "notready" in q + or "not-ready" in q + or "unready" in q + or "down" in q + or "missing" in q + ): + only_ready = False + elif "ready" in q: + only_ready = True + + if entity == "node" and only_ready is not None and op != "count": + op = "status" + if entity == "node" and only_ready is not None and op == "count": + if not any(term in q for term in ("how many", "count", "number")): + op = "status" + + if not op and entity == "node": + op = "list" if (include_hw or exclude_hw or nodes_in_query) else "count" + + if entity == "node" and "total" in q and "ready" in q: + summary = _nodes_summary_line(inventory, snapshot) + if summary: + return _format_confidence(summary, "high") + + if entity == "node" and ("hardware mix" in q or "architecture" in q): + hw_line = _hardware_mix_line(inventory) + if hw_line: + return _format_confidence(hw_line, "high") + + if ( + entity == "node" + and op == "status" + and metric is None + and not (include_hw or exclude_hw or nodes_in_query or only_workers or role_filters) + ): + summary = _nodes_summary_line(inventory, snapshot) + if summary: + return _format_confidence(summary, "high") + + if entity == "node" and metric is None and any(word in q for word in ("hardware", "architecture", "class", "mix")): + hw_line = _hardware_mix_line(inventory) + if hw_line: + return _format_confidence(hw_line, "medium") + + if ( + entity == "node" + and any(term in q for term in ("arm64", "amd64")) + and any(term in q for term in ("mostly", "majority", "more")) + ): + arm64_count = len([n for n in inventory if n.get("arch") == "arm64"]) + amd64_count = len([n for n in inventory if n.get("arch") == "amd64"]) + if arm64_count or amd64_count: + majority = "arm64" if arm64_count >= amd64_count else "amd64" + return _format_confidence( + f"arm64 nodes: {arm64_count}, amd64 nodes: {amd64_count}. 
Mostly {majority}.", + "high", + ) + + if op == "top" and metric is None and not any(word in q for word in ("hardware", "architecture", "class")): + metric = "cpu" + + # Metrics-first when a metric or top operation is requested. + if metric or op == "top": + entry = _select_metric_entry(tokens, metric=metric, op=op) + if entry and isinstance(entry.get("exprs"), list) and entry["exprs"]: + expr = entry["exprs"][0] + if inventory: + scoped = _inventory_filter( + inventory, + include_hw=include_hw, + exclude_hw=exclude_hw, + only_workers=only_workers, + only_ready=None, + nodes_in_query=nodes_in_query, + ) + if scoped: + node_regex = "|".join([n["name"] for n in scoped]) + expr = _apply_node_filter(expr, node_regex) + res = vm_query(expr, timeout=20) + answer = "" + if op == "top" or "hottest" in (entry.get("panel_title") or "").lower(): + node, val = _primary_series_metric(res) + if node and val is not None: + percent = _metric_expr_uses_percent(entry) + rate = metric in {"net", "io"} + value_fmt = _format_metric_value(val or "", percent=percent, rate=rate) + metric_label = (metric or "").upper() + label = f"{metric_label} node" if metric_label else "node" + answer = f"Hottest {label}: {node} ({value_fmt})." + if not answer: + answer = _format_metric_answer(entry, res) + if answer: + scope_parts: list[str] = [] + if include_hw: + scope_parts.append(" and ".join(sorted(include_hw))) + if exclude_hw: + scope_parts.append(f"excluding {' and '.join(sorted(exclude_hw))}") + if only_workers: + scope_parts.append("worker") + if scope_parts: + scope = " ".join(scope_parts) + overall_note = "" + base_expr = entry["exprs"][0] + if inventory: + all_nodes = "|".join([n["name"] for n in inventory]) + if all_nodes: + base_expr = _apply_node_filter(base_expr, all_nodes) + base_res = vm_query(base_expr, timeout=20) + base_node, base_val = _primary_series_metric(base_res) + scoped_node, scoped_val = _primary_series_metric(res) + if base_node and scoped_node and base_node != scoped_node: + percent = _metric_expr_uses_percent(entry) + rate = metric in {"net", "io"} + base_val_fmt = _format_metric_value(base_val or "", percent=percent, rate=rate) + overall_note = f" Overall hottest node: {base_node} ({base_val_fmt})." + return _format_confidence(f"Among {scope} nodes, {answer}{overall_note}", "high") + return _format_confidence(answer, "high") + if metrics_summary: + return metrics_summary + + if entity != "node" or not inventory: + if any(word in q for word in METRIC_HINT_WORDS) and not metrics_summary: + return "I don't have data to answer that right now." 
+ return "" + + expected_workers = expected_worker_nodes_from_metrics() + filtered = _inventory_filter( + inventory, + include_hw=include_hw, + exclude_hw=exclude_hw, + only_workers=only_workers, + only_ready=only_ready if op in ("status", "count") else None, + nodes_in_query=nodes_in_query, + ) + if role_filters: + filtered = [ + node + for node in filtered + if role_filters.intersection(set(node.get("roles") or [])) + ] + names = [node["name"] for node in filtered] + + if op == "status": + scope_label = "nodes" + if include_hw: + scope_label = f"{' and '.join(sorted(include_hw))} nodes" + elif only_workers: + scope_label = "worker nodes" + if "missing" in q and ("ready" in q or "readiness" in q): + return _format_confidence( + f"Not ready {scope_label}: " + (", ".join(names) if names else "none") + ".", + "high", + ) + if "missing" in q and expected_workers: + missing = sorted(set(expected_workers) - {n["name"] for n in inventory}) + return _format_confidence( + "Missing nodes: " + (", ".join(missing) if missing else "none") + ".", + "high", + ) + if only_ready is False: + return _format_confidence( + f"Not ready {scope_label}: " + (", ".join(names) if names else "none") + ".", + "high", + ) + if only_ready is True: + return _format_confidence( + f"Ready {scope_label} ({len(names)}): " + (", ".join(names) if names else "none") + ".", + "high", + ) + + if op == "count": + scope_label = "nodes" + if include_hw: + scope_label = f"{' and '.join(sorted(include_hw))} nodes" + elif only_workers: + scope_label = "worker nodes" + if only_workers and "ready" in q and ("total" in q or "vs" in q or "versus" in q): + total_workers = _inventory_filter( + inventory, + include_hw=include_hw, + exclude_hw=exclude_hw, + only_workers=True, + only_ready=None, + nodes_in_query=nodes_in_query, + ) + ready_workers = _inventory_filter( + inventory, + include_hw=include_hw, + exclude_hw=exclude_hw, + only_workers=True, + only_ready=True, + nodes_in_query=nodes_in_query, + ) + return _format_confidence( + f"Worker nodes ready: {len(ready_workers)} / {len(total_workers)} total.", + "high", + ) + if expected_workers and ("expected" in q or "should" in q): + missing = sorted(set(expected_workers) - {n["name"] for n in inventory}) + msg = f"Grafana inventory expects {len(expected_workers)} worker nodes." + if missing: + msg += f" Missing: {', '.join(missing)}." 
+            return _format_confidence(msg, "high")
+        if only_ready is True:
+            return _format_confidence(f"Ready {scope_label}: {len(names)}.", "high")
+        if only_ready is False:
+            return _format_confidence(f"Not ready {scope_label}: {len(names)}.", "high")
+        if not (include_hw or exclude_hw or nodes_in_query or only_workers or role_filters):
+            return _format_confidence(f"Atlas has {len(names)} nodes.", "high")
+        return _format_confidence(f"Matching nodes: {len(names)}.", "high")
+
+    if op == "list":
+        if nodes_in_query:
+            parts = []
+            existing = {n["name"] for n in inventory}
+            for node in nodes_in_query:
+                parts.append(f"{node}: {'present' if node in existing else 'not present'}")
+            return _format_confidence("Node presence: " + ", ".join(parts) + ".", "high")
+        if not names:
+            return _format_confidence("Matching nodes: none.", "high")
+        shown = names[:30]
+        suffix = f", … (+{len(names) - 30} more)" if len(names) > 30 else ""
+        return _format_confidence("Matching nodes: " + ", ".join(shown) + suffix + ".", "high")
+
+    return ""
+
+
+def _nodes_summary_line(inventory: list[dict[str, Any]], snapshot: dict[str, Any] | None) -> str:
+    summary = snapshot.get("nodes_summary") if isinstance(snapshot, dict) else {}
+    nodes = snapshot.get("nodes") if isinstance(snapshot, dict) else {}
+    if not isinstance(nodes, dict):
+        # Guard against snapshots without a "nodes" mapping so the fallbacks below
+        # do not dereference None.
+        nodes = {}
+    total = summary.get("total") if isinstance(summary, dict) and summary.get("total") is not None else nodes.get("total")
+    ready = summary.get("ready") if isinstance(summary, dict) and summary.get("ready") is not None else nodes.get("ready")
+    not_ready = summary.get("not_ready") if isinstance(summary, dict) and summary.get("not_ready") is not None else nodes.get("not_ready")
+    if total is None:
+        total = len(inventory)
+        ready = len([n for n in inventory if n.get("ready") is True])
+        not_ready = len([n for n in inventory if n.get("ready") is False])
+    if total is None:
+        return ""
+    if not_ready:
+        names = []
+        summary_names = summary.get("not_ready_names") if isinstance(summary, dict) else []
+        if isinstance(summary_names, list):
+            names = [name for name in summary_names if isinstance(name, str)]
+        if not names and snapshot:
+            details = snapshot.get("nodes_detail") if isinstance(snapshot.get("nodes_detail"), list) else []
+            names = [node.get("name") for node in details if isinstance(node, dict) and node.get("ready") is False]
+            names = [name for name in names if isinstance(name, str) and name]
+        suffix = f" (not ready: {', '.join(names)})" if names else ""
+        return f"Atlas has {total} nodes; {ready} ready, {not_ready} not ready{suffix}."
+    return f"Atlas has {total} nodes and all are Ready."
+
+
+def _hardware_mix_line(inventory: list[dict[str, Any]]) -> str:
+    if not inventory:
+        return ""
+    groups = _group_nodes(inventory)
+    parts: list[str] = []
+    for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"):
+        nodes = groups.get(key) or []
+        if nodes:
+            parts.append(f"{key}={len(nodes)}")
+    if not parts:
+        return ""
+    return "Hardware mix includes " + ", ".join(parts) + "."
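For reference, a minimal sketch (illustrative only, not part of the patch) of how these inventory helpers phrase their answers. The module name `assistant` and the node names are assumptions, and `_group_nodes` is assumed to bucket nodes by the `hardware` field; the dict keys mirror the inventory entries that `_snapshot_inventory` builds later in this diff.

```python
# Illustrative sketch; module name "assistant" and node names are hypothetical.
from assistant import _hardware_mix_line, _nodes_summary_line

inventory = [
    {"name": "titan-a", "arch": "arm64", "hardware": "rpi5", "roles": ["worker"], "is_worker": True, "ready": True},
    {"name": "titan-b", "arch": "arm64", "hardware": "rpi4", "roles": ["worker"], "is_worker": True, "ready": False},
    {"name": "titan-c", "arch": "amd64", "hardware": "amd64", "roles": ["control-plane"], "is_worker": False, "ready": True},
]

# Counts per hardware class in the fixed key order above
# (assuming _group_nodes buckets by the "hardware" field):
print(_hardware_mix_line(inventory))
# -> "Hardware mix includes rpi5=1, rpi4=1, amd64=1."

# With no snapshot totals available, the summary falls back to the inventory:
print(_nodes_summary_line(inventory, None))
# -> "Atlas has 3 nodes; 2 ready, 1 not ready."
```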
+ + +def _os_mix_line(snapshot: dict[str, Any] | None) -> str: + if not snapshot: + return "" + details = snapshot.get("nodes_detail") if isinstance(snapshot.get("nodes_detail"), list) else [] + counts: dict[str, int] = collections.Counter() + for node in details: + if not isinstance(node, dict): + continue + os_name = (node.get("os") or "").strip() + if os_name: + counts[os_name] += 1 + if not counts or (len(counts) == 1 and "linux" in counts): + return "" + parts = [f"{os_name}={count}" for os_name, count in sorted(counts.items(), key=lambda item: (-item[1], item[0]))] + return "OS mix: " + ", ".join(parts[:5]) + "." + + +def _pods_summary_line(metrics: dict[str, Any]) -> str: + if not metrics: + return "" + running = metrics.get("pods_running") + pending = metrics.get("pods_pending") + failed = metrics.get("pods_failed") + succeeded = metrics.get("pods_succeeded") + if running is None and pending is None and failed is None and succeeded is None: + return "" + parts: list[str] = [] + if running is not None: + parts.append(f"{running:.0f} running") + if pending is not None: + parts.append(f"{pending:.0f} pending") + if failed is not None: + parts.append(f"{failed:.0f} failed") + if succeeded is not None: + parts.append(f"{succeeded:.0f} succeeded") + return "There are " + ", ".join(parts) + " pods." + + +def _postgres_summary_line(metrics: dict[str, Any]) -> str: + if not metrics: + return "" + postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {} + if not postgres: + return "" + used = postgres.get("used") + max_conn = postgres.get("max") + hottest = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {} + parts: list[str] = [] + if used is not None and max_conn is not None: + parts.append(f"{used:.0f}/{max_conn:.0f} connections") + if hottest.get("label"): + hot_val = hottest.get("value") + hot_val_str = _format_metric_value(str(hot_val), percent=False) if hot_val is not None else "" + parts.append(f"hottest {hottest.get('label')} ({hot_val_str})") + if not parts: + return "" + return "Postgres is at " + ", ".join(parts) + "." + + +def _hottest_summary_line(metrics: dict[str, Any]) -> str: + if not metrics: + return "" + hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {} + if not hottest: + return "" + parts: list[str] = [] + for key in ("cpu", "ram", "net", "io"): + entry = hottest.get(key) if isinstance(hottest.get(key), dict) else {} + node = entry.get("node") + value = entry.get("value") + if node and value is not None: + value_fmt = _format_metric_value( + str(value), + percent=key in ("cpu", "ram"), + rate=key in ("net", "io"), + ) + parts.append(f"{key.upper()} {node} ({value_fmt})") + if not parts: + return "" + return "Hot spots: " + "; ".join(parts) + "." 
+ + +_FOLLOWUP_HINTS = ( + "what about", + "how about", + "and what", + "and how", + "tell me more", + "anything else", + "something else", + "that one", + "those", + "them", + "it", + "this", + "that", + "else", + "another", + "again", +) + + +def _is_followup_query(query: str) -> bool: + q = normalize_query(query) + if not q: + return False + if any(hint in q for hint in _FOLLOWUP_HINTS): + return True + if len(q.split()) <= 3 and not any(word in q for word in _INSIGHT_HINT_WORDS): + return True + return False + + +def _is_subjective_query(query: str) -> bool: + q = normalize_query(query) + if not q: + return False + return any(word in q for word in _INSIGHT_HINT_WORDS) or any( + phrase in q + for phrase in ( + "what do you think", + "your favorite", + "your favourite", + "your opinion", + ) + ) + + +def _is_overview_query(query: str) -> bool: + q = normalize_query(query) + if not q: + return False + return any(word in q for word in _OVERVIEW_HINT_WORDS) + + +def _doc_intent(query: str) -> bool: + q = normalize_query(query) + if not q: + return False + return any( + phrase in q + for phrase in ( + "runbook", + "documentation", + "docs", + "guide", + "how do i", + "how to", + "instructions", + "playbook", + "next step", + "next steps", + "what should", + "what do i", + "what to do", + "troubleshoot", + "triage", + "recover", + "remediate", + ) + ) + + +def cluster_overview_answer( + prompt: str, + *, + inventory: list[dict[str, Any]], + snapshot: dict[str, Any] | None, +) -> str: + if not inventory and not snapshot: + return "" + q = normalize_query(prompt) + metrics = _snapshot_metrics(snapshot) + sentences: list[str] = [] + + nodes_line = _nodes_summary_line(inventory, snapshot) + if nodes_line: + sentences.append(nodes_line) + + wants_overview = _is_overview_query(q) or any(word in q for word in ("atlas", "cluster", "titan", "lab")) + wants_hardware = any(word in q for word in ("hardware", "architecture", "nodes", "node")) or wants_overview + wants_metrics = any( + word in q + for word in ( + "status", + "health", + "overview", + "summary", + "pods", + "postgres", + "connections", + "hottest", + "cpu", + "ram", + "memory", + "net", + "network", + "io", + "disk", + "busy", + "load", + "usage", + "utilization", + ) + ) or wants_overview + + if wants_hardware: + hw_line = _hardware_mix_line(inventory) + if hw_line: + sentences.append(hw_line) + os_line = _os_mix_line(snapshot) + if os_line: + sentences.append(os_line) + + if wants_metrics: + pods_line = _pods_summary_line(metrics) + if pods_line: + sentences.append(pods_line) + postgres_line = _postgres_summary_line(metrics) + if postgres_line: + sentences.append(postgres_line) + hottest_line = _hottest_summary_line(metrics) + if hottest_line: + sentences.append(hottest_line) + + if not sentences: + return "" + if len(sentences) > 3 and not wants_overview: + sentences = sentences[:3] + return "Based on the latest snapshot, " + " ".join(sentences) + + +def cluster_answer( + prompt: str, + *, + inventory: list[dict[str, Any]], + snapshot: dict[str, Any] | None, + workloads: list[dict[str, Any]] | None, + history_lines: list[str] | None = None, +) -> str: + metrics_summary = snapshot_context(prompt, snapshot) + structured = structured_answer( + prompt, + inventory=inventory, + metrics_summary=metrics_summary, + snapshot=snapshot, + workloads=workloads, + ) + if structured: + return structured + + q = normalize_query(prompt) + workload_target = _workload_query_target(prompt) + if workload_target and any(word in q for word in ("where", "run", 
"running", "host", "node")): + return _format_confidence( + f"I don't have workload placement data for {workload_target} in the current snapshot.", + "low", + ) + + overview = cluster_overview_answer(prompt, inventory=inventory, snapshot=snapshot) + if overview: + kb_titles = kb_retrieve_titles(prompt, limit=4) if _doc_intent(prompt) else "" + if kb_titles: + overview = overview + "\n" + kb_titles + return _format_confidence(overview, "medium") + + kb_titles = kb_retrieve_titles(prompt, limit=4) + if kb_titles: + return _format_confidence(kb_titles, "low") + + if metrics_summary: + return _format_confidence(metrics_summary, "low") + + return "" + +def _metric_tokens(entry: dict[str, Any]) -> str: + parts: list[str] = [] + for key in ("panel_title", "dashboard", "description"): + val = entry.get(key) + if isinstance(val, str) and val: + parts.append(val.lower()) + tags = entry.get("tags") + if isinstance(tags, list): + parts.extend(str(t).lower() for t in tags if t) + return " ".join(parts) + +def metrics_lookup(query: str, limit: int = 3) -> list[dict[str, Any]]: + q_tokens = _tokens(query) + if not q_tokens or not _METRIC_INDEX: + return [] + scored: list[tuple[int, dict[str, Any]]] = [] + for entry in _METRIC_INDEX: + if not isinstance(entry, dict): + continue + hay = _metric_tokens(entry) + if not hay: + continue + score = 0 + for t in set(q_tokens): + if t in hay: + score += 2 if t in (entry.get("panel_title") or "").lower() else 1 + if score: + scored.append((score, entry)) + scored.sort(key=lambda item: item[0], reverse=True) + return [entry for _, entry in scored[:limit]] + +def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]: + if not allow_tools: + return "", "" + lower = (prompt or "").lower() + if not any(word in lower for word in METRIC_HINT_WORDS): + return "", "" + matches = metrics_lookup(prompt, limit=1) + if not matches: + return "", "" + entry = matches[0] + dashboard = entry.get("dashboard") or "dashboard" + panel = entry.get("panel_title") or "panel" + exprs = entry.get("exprs") if isinstance(entry.get("exprs"), list) else [] + if not exprs: + return "", "" + rendered_parts: list[str] = [] + for expr in exprs[:2]: + res = vm_query(expr, timeout=20) + rendered = vm_render_result(res, limit=10) + if rendered: + rendered_parts.append(rendered) + if not rendered_parts: + return "", "" + summary = "\n".join(rendered_parts) + context = f"Metrics (from {dashboard} / {panel}):\n{summary}" + return context, "" + def catalog_hints(query: str) -> tuple[str, list[tuple[str, str]]]: q = (query or "").strip() if not q or not KB.get("catalog"): @@ -295,6 +2278,73 @@ def k8s_get(path: str, timeout: int = 8) -> dict: raw = resp.read() return json.loads(raw.decode()) if raw else {} +def _ariadne_state(timeout: int = 5) -> dict | None: + if not ARIADNE_STATE_URL: + return None + headers = {} + if ARIADNE_STATE_TOKEN: + headers["X-Internal-Token"] = ARIADNE_STATE_TOKEN + r = request.Request(ARIADNE_STATE_URL, headers=headers, method="GET") + try: + with request.urlopen(r, timeout=timeout) as resp: + raw = resp.read() + payload = json.loads(raw.decode()) if raw else {} + return payload if isinstance(payload, dict) else None + except Exception: + return None + + +_SNAPSHOT_CACHE: dict[str, Any] = {"payload": None, "ts": 0.0} + + +def _snapshot_state() -> dict[str, Any] | None: + now = time.monotonic() + cached = _SNAPSHOT_CACHE.get("payload") + ts = _SNAPSHOT_CACHE.get("ts") or 0.0 + if cached and now - ts < max(5, SNAPSHOT_TTL_SEC): + return cached + payload = 
_ariadne_state(timeout=10) + if isinstance(payload, dict) and payload: + _SNAPSHOT_CACHE["payload"] = payload + _SNAPSHOT_CACHE["ts"] = now + return payload + return cached if isinstance(cached, dict) else None + + +def _snapshot_inventory(snapshot: dict[str, Any] | None) -> list[dict[str, Any]]: + if not snapshot: + return [] + items = snapshot.get("nodes_detail") + if not isinstance(items, list): + return [] + inventory: list[dict[str, Any]] = [] + for node in items: + if not isinstance(node, dict): + continue + labels = node.get("labels") if isinstance(node.get("labels"), dict) else {} + name = node.get("name") or "" + if not name: + continue + hardware = node.get("hardware") or _hardware_class(labels) + inventory.append( + { + "name": name, + "arch": node.get("arch") or labels.get("kubernetes.io/arch") or labels.get("beta.kubernetes.io/arch") or "", + "hardware": hardware, + "roles": node.get("roles") or [], + "is_worker": node.get("is_worker") is True, + "ready": node.get("ready") is True, + } + ) + return sorted(inventory, key=lambda item: item["name"]) + + +def _snapshot_workloads(snapshot: dict[str, Any] | None) -> list[dict[str, Any]]: + if not snapshot: + return [] + workloads = snapshot.get("workloads") + return workloads if isinstance(workloads, list) else [] + def k8s_pods(namespace: str) -> list[dict]: data = k8s_get(f"/api/v1/namespaces/{parse.quote(namespace)}/pods?limit=500") items = data.get("items") or [] @@ -404,6 +2454,86 @@ def vm_render_result(res: dict | None, limit: int = 12) -> str: out.append(f"- {labels}: {val}") return "\n".join(out) +def _parse_metric_lines(summary: str) -> dict[str, str]: + parsed: dict[str, str] = {} + for line in (summary or "").splitlines(): + line = line.strip() + if not line.startswith("-"): + continue + try: + label, value = line.lstrip("-").split(":", 1) + except ValueError: + continue + parsed[label.strip()] = value.strip() + return parsed + +def _metrics_fallback_summary(panel: str, summary: str) -> str: + parsed = _parse_metric_lines(summary) + panel_l = (panel or "").lower() + if parsed: + items = list(parsed.items()) + if len(items) == 1: + label, value = items[0] + return f"{panel}: {label} = {value}." + compact = "; ".join(f"{k}={v}" for k, v in items) + return f"{panel}: {compact}." 
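+    # No label/value lines could be parsed from the summary: echo the raw panel text instead.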
+ if panel_l: + return f"{panel}: {summary}" + return summary + +def _node_ready_status(node: dict) -> bool | None: + conditions = node.get("status", {}).get("conditions") or [] + for cond in conditions if isinstance(conditions, list) else []: + if cond.get("type") == "Ready": + if cond.get("status") == "True": + return True + if cond.get("status") == "False": + return False + return None + return None + +def _node_is_worker(node: dict) -> bool: + labels = (node.get("metadata") or {}).get("labels") or {} + if labels.get("node-role.kubernetes.io/control-plane") is not None: + return False + if labels.get("node-role.kubernetes.io/master") is not None: + return False + if labels.get("node-role.kubernetes.io/worker") is not None: + return True + return True + +def worker_nodes_status(inventory: list[dict[str, Any]] | None = None) -> tuple[list[str], list[str]]: + if inventory is None: + inventory = node_inventory() + ready_nodes = [n["name"] for n in inventory if n.get("is_worker") and n.get("ready") is True] + not_ready_nodes = [n["name"] for n in inventory if n.get("is_worker") and n.get("ready") is False] + return (sorted(ready_nodes), sorted(not_ready_nodes)) + +def expected_worker_nodes_from_metrics() -> list[str]: + for entry in _METRIC_INDEX: + panel = (entry.get("panel_title") or "").lower() + if "worker nodes ready" not in panel: + continue + exprs = entry.get("exprs") if isinstance(entry.get("exprs"), list) else [] + for expr in exprs: + if not isinstance(expr, str): + continue + match = NODE_REGEX.search(expr) + if not match: + continue + raw = match.group(1) + nodes = [n.strip() for n in raw.split("|") if n.strip()] + return sorted(nodes) + return [] + +def _context_fallback(context: str) -> str: + if not context: + return "" + trimmed = context.strip() + if len(trimmed) > MAX_TOOL_CHARS: + trimmed = trimmed[: MAX_TOOL_CHARS - 3].rstrip() + "..." 
+ return "Here is what I found:\n" + trimmed + def vm_top_restarts(hours: int = 1) -> str: q = f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{hours}h])))" res = vm_query(q) @@ -442,6 +2572,1832 @@ def vm_cluster_snapshot() -> str: parts.append(pr) return "\n".join(parts).strip() +def _strip_code_fence(text: str) -> str: + cleaned = (text or "").strip() + match = CODE_FENCE_RE.match(cleaned) + if match: + return match.group(1).strip() + return cleaned + +def _normalize_reply(value: Any) -> str: + if isinstance(value, dict): + for key in ("content", "response", "reply", "message"): + if key in value: + return _normalize_reply(value[key]) + for v in value.values(): + if isinstance(v, (str, dict, list)): + return _normalize_reply(v) + return json.dumps(value, ensure_ascii=False) + if isinstance(value, list): + parts = [_normalize_reply(item) for item in value] + return " ".join(p for p in parts if p) + if value is None: + return "" + text = _strip_code_fence(str(value)) + if text.startswith("{") and text.endswith("}"): + try: + return _normalize_reply(json.loads(text)) + except Exception: + return _ensure_confidence(text) + return _ensure_confidence(text) + + +def _history_payload_lines(history_payload: list[Any]) -> list[str]: + lines: list[str] = [] + if not isinstance(history_payload, list): + return lines + for item in history_payload[-12:]: + if isinstance(item, dict): + for key in ("content", "message", "text", "prompt", "question", "body", "answer", "reply", "response"): + val = item.get(key) + if isinstance(val, str) and val.strip(): + lines.append(val.strip()) + elif isinstance(item, str) and item.strip(): + lines.append(item.strip()) + return [line for line in lines if line] + + +def _append_history_context(context: str, history_lines: list[str]) -> str: + lines = [line.strip() for line in history_lines if isinstance(line, str) and line.strip()] + if not lines: + return context + snippet = "\n".join(lines[-6:]) + combined = context + "\nRecent chat:\n" + snippet if context else "Recent chat:\n" + snippet + if len(combined) > MAX_CONTEXT_CHARS: + combined = combined[: MAX_CONTEXT_CHARS - 3].rstrip() + "..." + return combined + + +class ThoughtState: + def __init__(self, total_steps: int = 0): + self._lock = threading.Lock() + self.stage = "starting" + self.note = "" + self.step = 0 + self.total_steps = total_steps + + def update(self, stage: str, *, note: str = "", step: int | None = None) -> None: + with self._lock: + self.stage = stage + if note: + self.note = note + if step is not None: + self.step = step + + def status_line(self) -> str: + with self._lock: + stage = self.stage + note = self.note + step = self.step + total = self.total_steps + step_part = f"{step}/{total}" if total else str(step) if step else "" + detail = f"Stage {step_part}: {stage}".strip() + if note: + return f"Still thinking ({detail}). Latest insight: {note}" + return f"Still thinking ({detail})." + + +def _ollama_json_call( + prompt: str, + *, + context: str, + retries: int = 2, + model: str | None = None, +) -> dict[str, Any]: + system = ( + "System: You are Atlas, a reasoning assistant. " + "Return strict JSON only (no code fences, no trailing commentary). " + "If you cannot comply, return {}. " + "Only use facts from the provided context. " + "If you make an inference, label it as 'inference' in the JSON." 
+ ) + last_exc: Exception | None = None + for attempt in range(max(1, retries + 1)): + try: + raw = _ollama_call( + ("json", "internal"), + prompt, + context=context, + use_history=False, + system_override=system, + model=model, + ) + cleaned = _strip_code_fence(raw).strip() + if cleaned.startswith("{") and cleaned.endswith("}"): + return json.loads(cleaned) + last = json.loads(_strip_code_fence(cleaned)) + if isinstance(last, dict): + return last + except Exception as exc: # noqa: BLE001 + last_exc = exc + time.sleep(min(2, 2 ** attempt)) + if last_exc: + return {} + return {} + + +def _fact_pack_lines( + prompt: str, + *, + inventory: list[dict[str, Any]], + snapshot: dict[str, Any] | None, + workloads: list[dict[str, Any]] | None, +) -> list[str]: + raw = facts_context(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads) + lines: list[str] = [] + for line in raw.splitlines(): + trimmed = line.strip() + if not trimmed or trimmed.lower().startswith("facts"): + continue + lines.append(trimmed) + if _knowledge_intent(prompt) or _doc_intent(prompt) or _is_overview_query(prompt): + kb_titles = kb_retrieve_titles(prompt, limit=4) + if kb_titles: + for kb_line in kb_titles.splitlines(): + if kb_line.strip(): + lines.append(kb_line.strip()) + return lines + + +def _fact_pack_text(lines: list[str], fact_meta: dict[str, dict[str, Any]]) -> str: + labeled: list[str] = [] + for idx, line in enumerate(lines): + fid = f"F{idx + 1}" + tags = fact_meta.get(fid, {}).get("tags") or [] + tag_text = f" [tags: {', '.join(tags)}]" if tags else "" + labeled.append(f"{fid}{tag_text}: {line}") + return "Fact pack:\n" + "\n".join(labeled) + + +def _tool_fact_lines(prompt: str, *, allow_tools: bool) -> list[str]: + if not allow_tools: + return [] + metrics_context, _ = metrics_query_context(prompt, allow_tools=True) + lines: list[str] = [] + if metrics_context: + for line in metrics_context.splitlines(): + trimmed = line.strip() + if trimmed: + lines.append(f"tool_metrics: {trimmed}") + return lines + + +_ALLOWED_INSIGHT_TAGS = { + "availability", + "architecture", + "database", + "hardware", + "inventory", + "node_detail", + "os", + "pods", + "utilization", + "workloads", + "workers", +} + +_DYNAMIC_TAGS = {"availability", "database", "pods", "utilization", "workloads"} +_INVENTORY_TAGS = {"hardware", "architecture", "inventory", "workers", "node_detail", "os"} +_SUBJECTIVE_TAG_PRIORITY = ( + "utilization", + "database", + "pods", + "workloads", + "availability", + "hardware", + "inventory", + "architecture", + "node_detail", + "os", +) + + +def _fact_line_tags(line: str) -> set[str]: + text = (line or "").lower() + tags: set[str] = set() + if any(key in text for key in ("nodes_total", "ready", "not_ready", "workers_ready", "workers_not_ready")): + tags.add("availability") + if "nodes_by_arch" in text or "arch " in text or "architecture" in text: + tags.add("architecture") + if any(key in text for key in ("rpi", "jetson", "amd64", "arm64", "non_raspberry_pi")): + tags.update({"hardware", "inventory"}) + if "control_plane_nodes" in text or "control_plane_by_hardware" in text or "worker_nodes" in text: + tags.add("inventory") + if any(key in text for key in ("hottest_", "lowest_", "node_usage", "cpu=", "ram=", "net=", "io=")): + tags.add("utilization") + if "postgres_" in text or "postgres connections" in text: + tags.add("database") + if "pods_" in text or "pod phases" in text or "restarts" in text: + tags.add("pods") + if "namespace" in text: + tags.add("workloads") + if "workloads" in text or 
"primary_node" in text or "workload_" in text: + tags.add("workloads") + if "node_details" in text: + tags.add("node_detail") + if "os mix" in text or "os" in text: + tags.add("os") + return tags & _ALLOWED_INSIGHT_TAGS + + +def _fact_pack_meta(lines: list[str]) -> dict[str, dict[str, Any]]: + meta: dict[str, dict[str, Any]] = {} + for idx, line in enumerate(lines): + fid = f"F{idx + 1}" + tags = sorted(_fact_line_tags(line)) + meta[fid] = {"tags": tags} + return meta + + +def _history_tags(history_lines: list[str]) -> set[str]: + tags: set[str] = set() + for line in history_lines[-6:]: + tags.update(_fact_line_tags(line)) + return tags & _ALLOWED_INSIGHT_TAGS + + +def _normalize_fraction(value: Any, *, default: float = 0.5) -> float: + if isinstance(value, (int, float)): + score = float(value) + if score > 1: + score = score / 100.0 + return max(0.0, min(1.0, score)) + return default + + +def _seed_insights( + lines: list[str], + fact_meta: dict[str, dict[str, Any]], + *, + limit: int = 6, +) -> list[dict[str, Any]]: + priority = [ + "utilization", + "database", + "pods", + "workloads", + "availability", + "hardware", + "architecture", + "inventory", + ] + seeds: list[dict[str, Any]] = [] + used_tags: set[str] = set() + for tag in priority: + for idx, line in enumerate(lines): + fid = f"F{idx + 1}" + tags = set(fact_meta.get(fid, {}).get("tags") or []) + if tag not in tags or fid in {s["fact_ids"][0] for s in seeds}: + continue + summary = line.lstrip("- ").strip() + seeds.append( + { + "summary": summary, + "fact_ids": [fid], + "relevance": 0.5, + "novelty": 0.5, + "rationale": "seeded from fact pack", + "tags": sorted(tags), + } + ) + used_tags.update(tags) + if len(seeds) >= limit: + return seeds + return seeds + + +def _insight_tags(insight: dict[str, Any], fact_meta: dict[str, dict[str, Any]]) -> set[str]: + tags: set[str] = set() + for fid in insight.get("fact_ids") if isinstance(insight.get("fact_ids"), list) else []: + tags.update(fact_meta.get(fid, {}).get("tags") or []) + raw_tags = insight.get("tags") if isinstance(insight.get("tags"), list) else [] + tags.update(t for t in raw_tags if isinstance(t, str)) + summary = insight.get("summary") or insight.get("claim") or "" + if isinstance(summary, str): + tags.update(_fact_line_tags(summary)) + return tags & _ALLOWED_INSIGHT_TAGS + + +def _insight_score( + insight: dict[str, Any], + *, + preference: str, + prefer_tags: set[str], + avoid_tags: set[str], + history_tags: set[str], + fact_meta: dict[str, dict[str, Any]], +) -> float: + base = _score_insight(insight, preference) + tags = _insight_tags(insight, fact_meta) + if prefer_tags and tags: + base += 0.15 * len(tags & prefer_tags) + if avoid_tags and tags: + base -= 0.12 * len(tags & avoid_tags) + if history_tags and tags: + base -= 0.08 * len(tags & history_tags) + if preference == "novelty": + if tags & _DYNAMIC_TAGS: + base += 0.12 + if tags & _INVENTORY_TAGS: + base -= 0.08 + return base + + +def _score_insight(insight: dict[str, Any], preference: str) -> float: + relevance = _normalize_fraction(insight.get("relevance"), default=0.6) + novelty = _normalize_fraction(insight.get("novelty"), default=0.5) + if preference == "novelty": + return novelty * 0.6 + relevance * 0.4 + return relevance * 0.6 + novelty * 0.4 + + +def _select_diverse_insights( + candidates: list[dict[str, Any]], + *, + preference: str, + prefer_tags: set[str], + avoid_tags: set[str], + history_tags: set[str], + fact_meta: dict[str, dict[str, Any]], + count: int = 2, +) -> list[dict[str, Any]]: + scored: 
list[tuple[float, dict[str, Any]]] = [] + for item in candidates: + tags = _insight_tags(item, fact_meta) + item["tags"] = sorted(tags) + score = _insight_score( + item, + preference=preference, + prefer_tags=prefer_tags, + avoid_tags=avoid_tags, + history_tags=history_tags, + fact_meta=fact_meta, + ) + scored.append((score, item)) + scored.sort(key=lambda pair: pair[0], reverse=True) + picked: list[dict[str, Any]] = [] + used_tags: set[str] = set() + for _, item in scored: + tags = set(item.get("tags") or []) + if used_tags and tags and tags <= used_tags and len(picked) < count: + continue + picked.append(item) + used_tags.update(tags) + if len(picked) >= count: + break + if len(picked) < count: + for _, item in scored: + if item in picked: + continue + picked.append(item) + if len(picked) >= count: + break + return picked + + +def _open_ended_system() -> str: + return ( + "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " + "Use ONLY the provided fact pack and recent chat as your evidence. " + "You may draw light inferences if you label them as such. " + "Write concise, human sentences with a helpful, calm tone (not a list). " + "Be willing to take a light stance; do not over-hedge. " + "If the question is subjective (cool/interesting/unconventional), pick a standout fact, explain why it stands out, " + "and use 2-3 sentences. " + "If the question asks for a list, embed the list inline in a sentence (comma-separated). " + "If the question is ambiguous, pick a reasonable interpretation and state it briefly. " + "Avoid repeating the exact same observation as the last response if possible; vary across metrics, workload, or hardware details. " + "Always include at least one substantive answer sentence before the score lines. " + "Do not mention fact IDs or internal labels (e.g., F1/F2) in your response. " + "When the fact pack includes hottest_cpu/ram/net/io lines, use them to answer hottest/busiest node questions. " + "When the fact pack includes postgres_hottest_db, use it for questions about the busiest database. " + "Do not convert counts into percentages or claim 100% unless a fact explicitly states a percentage. " + "Do not invent numbers or facts. " + "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), HallucinationRisk (low|medium|high)." 
+ ) + + +def _ollama_call_safe( + hist_key, + prompt: str, + *, + context: str, + fallback: str, + system_override: str | None = None, + model: str | None = None, +) -> str: + try: + return _ollama_call( + hist_key, + prompt, + context=context, + use_history=False, + system_override=system_override, + model=model, + ) + except Exception: + return fallback + + +def _candidate_note(candidate: dict[str, Any]) -> str: + claim = str(candidate.get("focus") or candidate.get("answer") or "") + return claim[:160] + ("…" if len(claim) > 160 else "") + + +def _ensure_scores(answer: str) -> str: + text = answer.strip() + lines = [line.strip() for line in text.splitlines() if line.strip()] + score_map: dict[str, str] = {} + body_lines: list[str] = [] + + def _score_key(line: str) -> str: + cleaned = line.strip().lstrip("-•* ").strip() + return cleaned.lower() + + def _extract_value(line: str) -> str: + cleaned = line.strip().lstrip("-•* ").strip() + if ":" in cleaned: + return cleaned.split(":", 1)[1].strip() + parts = cleaned.split() + return parts[1] if len(parts) > 1 else "" + + def _record_score(key: str, value: str): + if not value: + return + value = value.strip().rstrip("%") + score_map.setdefault(key, value) + + for line in lines: + cleaned = line.strip().lstrip("-•* ").strip() + lowered = cleaned.lower() + if lowered.startswith("confidence,") or ( + "confidence" in lowered and "relevance" in lowered and "satisfaction" in lowered + ): + for key in ("confidence", "relevance", "satisfaction"): + match = re.search(rf"{key}\s*[:=]?\s*(\d{{1,3}}|high|medium|low)", lowered) + if match: + _record_score(key, match.group(1)) + risk_match = re.search(r"hallucination\s*risk\s*[:=]?\s*(low|medium|high)", lowered) + if risk_match: + _record_score("hallucinationrisk", risk_match.group(1)) + continue + if lowered.startswith("confidence"): + _record_score("confidence", _extract_value(cleaned)) + continue + if lowered.startswith("relevance"): + _record_score("relevance", _extract_value(cleaned)) + continue + if lowered.startswith("satisfaction"): + _record_score("satisfaction", _extract_value(cleaned)) + continue + if lowered.replace(" ", "").startswith("hallucinationrisk") or lowered.startswith( + "hallucination risk" + ): + _record_score("hallucinationrisk", _extract_value(cleaned)) + continue + cleaned_body = re.sub( + r"\bconfidence\s*:\s*(high|medium|low)\b\.?\s*", + "", + line, + flags=re.IGNORECASE, + ).strip() + cleaned_body = re.sub( + r"\bconfident\s*level\s*:\s*(high|medium|low)\b\.?\s*", + "", + cleaned_body, + flags=re.IGNORECASE, + ).strip() + cleaned_body = re.sub(r"\bF\d+\b", "", cleaned_body).strip() + if cleaned_body: + body_lines.append(cleaned_body) + + confidence = score_map.get("confidence") or "medium" + relevance = score_map.get("relevance") or "70" + satisfaction = score_map.get("satisfaction") or "70" + risk = score_map.get("hallucinationrisk") or "low" + + final_lines = body_lines + [ + f"Confidence: {confidence}", + f"Relevance: {relevance}", + f"Satisfaction: {satisfaction}", + f"HallucinationRisk: {risk}", + ] + return "\n".join(final_lines) + + +def _open_ended_plan( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], + focus_tags: set[str], + avoid_tags: set[str], + count: int, + state: ThoughtState | None, + step: int, + model: str | None, +) -> list[dict[str, Any]]: + if state: + state.update("planning", step=step, note="mapping angles") + count = max(1, count) + focus_hint = ", ".join(sorted(focus_tags)) if focus_tags else "any" + avoid_hint = ", 
".join(sorted(avoid_tags)) if avoid_tags else "none" + prompt_text = ( + "Analyze the question and propose up to " + f"{count} distinct answer angles that can be supported by the fact pack. " + "Keep them diverse (e.g., metrics, hardware, workload placement, recent changes). " + "If the question is subjective, propose at least one angle that surfaces a standout detail. " + f"Prefer angles that align with these tags: {focus_hint}. " + f"Avoid angles that overlap these tags if possible: {avoid_hint}. " + "Avoid repeating the same angle as the most recent response if possible. " + "Return JSON: {\"angles\":[{\"focus\":\"...\",\"reason\":\"...\",\"tags\":[\"tag\"],\"priority\":1-5}]}." + ) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call( + prompt_text + f" Question: {prompt}", + context=context, + model=model, + ) + angles = result.get("angles") if isinstance(result, dict) else None + cleaned: list[dict[str, Any]] = [] + seen: set[str] = set() + if isinstance(angles, list): + for item in angles: + if not isinstance(item, dict): + continue + focus = str(item.get("focus") or "").strip() + if not focus or focus.lower() in seen: + continue + seen.add(focus.lower()) + priority = item.get("priority") + if not isinstance(priority, (int, float)): + priority = 3 + tags = _sanitize_focus_tags(item.get("tags") or []) + cleaned.append( + { + "focus": focus, + "reason": str(item.get("reason") or ""), + "tags": tags, + "priority": int(max(1, min(5, priority))), + } + ) + if not cleaned: + cleaned = [{"focus": "Direct answer", "reason": "Default fallback", "priority": 3}] + cleaned.sort(key=lambda item: item.get("priority", 3), reverse=True) + if state: + state.update("planning", step=1, note=_candidate_note(cleaned[0])) + return cleaned + + +def _sanitize_focus_tags(raw_tags: list[Any]) -> list[str]: + tags: list[str] = [] + for tag in raw_tags: + if not isinstance(tag, str): + continue + tag = tag.strip() + if tag in _ALLOWED_INSIGHT_TAGS and tag not in tags: + tags.append(tag) + return tags + + +def _open_ended_interpret( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], + state: ThoughtState | None, + model: str | None, +) -> dict[str, Any]: + if state: + state.update("interpreting", step=1, note="reading question") + allowed_tags = ", ".join(sorted(_ALLOWED_INSIGHT_TAGS)) + prompt_text = ( + "Classify how to answer the question using only the fact pack. " + "Return JSON: {\"style\":\"objective|subjective\"," + "\"tone\":\"neutral|curious|enthusiastic\"," + "\"focus_tags\":[\"tag\"]," + "\"focus_label\":\"short phrase\"," + "\"allow_list\":true|false}. " + "Use allow_list=true only if the question explicitly asks for names or lists. " + f"Only use tags from: {allowed_tags}." 
+ ) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call( + prompt_text + f" Question: {prompt}", + context=context, + model=model, + ) + if not isinstance(result, dict): + result = {} + style = str(result.get("style") or "").strip().lower() + if style not in ("objective", "subjective"): + style = "subjective" if _is_subjective_query(prompt) else "objective" + tone = str(result.get("tone") or "neutral").strip().lower() + if tone not in ("neutral", "curious", "enthusiastic"): + tone = "neutral" + focus_tags = _sanitize_focus_tags(result.get("focus_tags") or []) + focus_label = str(result.get("focus_label") or "").strip() + allow_list = result.get("allow_list") + if not isinstance(allow_list, bool): + q = normalize_query(prompt) + allow_list = any(phrase in q for phrase in ("list", "which", "what are", "names")) + return { + "style": style, + "tone": tone, + "focus_tags": focus_tags, + "focus_label": focus_label, + "allow_list": allow_list, + } + + +def _preferred_tags_for_prompt(prompt: str) -> set[str]: + q = normalize_query(prompt) + tokens = set(_tokens(prompt)) + tags: set[str] = set() + if tokens & {"cpu", "ram", "memory", "net", "network", "io", "disk", "hottest", "busy", "usage", "utilization", "load"}: + tags.add("utilization") + if tokens & {"postgres", "database", "db", "connections"}: + tags.add("database") + if tokens & {"pod", "pods", "deployment", "job", "cronjob"}: + tags.add("pods") + if tokens & {"workload", "service", "namespace"}: + tags.add("workloads") + if tokens & {"ready", "down", "unreachable", "availability"} or "not ready" in q: + tags.add("availability") + if tokens & {"node", "nodes", "hardware", "arch", "architecture", "rpi", "jetson", "amd64", "arm64", "worker", "control-plane"}: + tags.update({"hardware", "inventory", "architecture"}) + return tags & _ALLOWED_INSIGHT_TAGS + + +def _primary_tags_for_prompt(prompt: str) -> set[str]: + q = normalize_query(prompt) + tokens = set(_tokens(prompt)) + if tokens & {"cpu", "ram", "memory", "net", "network", "io", "disk", "hottest", "busy", "usage", "utilization", "load"}: + return {"utilization"} + if tokens & {"postgres", "database", "db", "connections"}: + return {"database"} + if tokens & {"pod", "pods", "deployment", "job", "cronjob"}: + return {"pods"} + if tokens & {"workload", "service", "namespace"}: + return {"workloads"} + if tokens & {"ready", "down", "unreachable", "availability"} or "not ready" in q: + return {"availability"} + if tokens & {"node", "nodes", "hardware", "arch", "architecture", "rpi", "jetson", "amd64", "arm64", "worker", "control-plane"}: + return {"hardware", "inventory", "architecture"} + return set() + + +_TAG_KEYWORDS: dict[str, tuple[str, ...]] = { + "utilization": ("cpu", "ram", "memory", "net", "network", "io", "disk", "usage", "utilization", "hottest", "busy"), + "database": ("postgres", "db", "database", "connections"), + "pods": ("pod", "pods", "deployment", "daemonset", "job", "cron", "workload"), + "hardware": ("hardware", "architecture", "arch", "rpi", "raspberry", "jetson", "amd64", "arm64", "node", "nodes"), + "availability": ("ready", "not ready", "unready", "down", "missing"), + "workloads": ("workload", "service", "namespace", "app"), + "os": ("os", "kernel", "kubelet", "containerd", "runtime"), +} + + +def _tags_from_text(text: str) -> set[str]: + q = normalize_query(text) + if not q: + return set() + tokens = set(_tokens(text)) + tags: set[str] = set() + for tag, keywords in _TAG_KEYWORDS.items(): + if any(word in tokens for word in 
keywords): + tags.add(tag) + return tags & _ALLOWED_INSIGHT_TAGS + + +def _history_focus_tags(history_lines: list[str]) -> set[str]: + if not history_lines: + return set() + recent = " ".join(line for line in history_lines[-6:] if isinstance(line, str)) + return _tags_from_text(recent) + + +def _open_ended_insights( + prompt: str, + *, + fact_pack: str, + fact_meta: dict[str, dict[str, Any]], + history_lines: list[str], + count: int, + state: ThoughtState | None, + step: int, + model: str | None, +) -> list[dict[str, Any]]: + if state: + state.update("analyzing", step=step, note="scouting insights") + count = max(1, count) + allowed_tags = ", ".join(sorted(_ALLOWED_INSIGHT_TAGS)) + prompt_text = ( + "Review the fact pack and propose up to " + f"{count} insights that could answer the question. " + "Each insight should be grounded in the facts. " + "Return JSON: {\"insights\":[{\"summary\":\"...\",\"fact_ids\":[\"F1\"]," + "\"relevance\":0-1,\"novelty\":0-1,\"tags\":[\"tag\"],\"rationale\":\"...\"}]}. " + f"Only use tags from: {allowed_tags}." + ) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call( + prompt_text + f" Question: {prompt}", + context=context, + model=model, + ) + insights = result.get("insights") if isinstance(result, dict) else None + cleaned: list[dict[str, Any]] = [] + valid_ids = set(fact_meta.keys()) + if isinstance(insights, list): + for item in insights: + if not isinstance(item, dict): + continue + summary = str(item.get("summary") or item.get("claim") or "").strip() + if not summary: + continue + raw_ids = item.get("fact_ids") if isinstance(item.get("fact_ids"), list) else [] + fact_ids = [fid for fid in raw_ids if isinstance(fid, str) and fid in valid_ids] + if not fact_ids: + continue + cleaned.append( + { + "summary": summary, + "fact_ids": fact_ids, + "relevance": _normalize_fraction(item.get("relevance"), default=0.6), + "novelty": _normalize_fraction(item.get("novelty"), default=0.5), + "rationale": str(item.get("rationale") or ""), + "tags": [t for t in (item.get("tags") or []) if isinstance(t, str)], + } + ) + if cleaned and state: + state.update("analyzing", note=_candidate_note(cleaned[0])) + return cleaned + + +def _rank_insights( + insights: list[dict[str, Any]], + *, + focus_tags: set[str], + avoid_tags: set[str], + count: int, +) -> list[dict[str, Any]]: + if not insights: + return [] + ranked: list[tuple[float, dict[str, Any]]] = [] + for insight in insights: + relevance = _normalize_fraction(insight.get("relevance"), default=0.6) + novelty = _normalize_fraction(insight.get("novelty"), default=0.5) + tags = set(insight.get("tags") or []) + score = relevance * 0.65 + novelty * 0.35 + if focus_tags and tags & focus_tags: + score += 0.1 + if avoid_tags and tags & avoid_tags: + score -= 0.2 + ranked.append((score, insight)) + ranked.sort(key=lambda item: item[0], reverse=True) + return [item for _, item in ranked[:count]] + + +def _fallback_fact_ids( + fact_meta: dict[str, dict[str, Any]], + *, + focus_tags: set[str], + avoid_tags: set[str], + count: int, +) -> list[str]: + if not fact_meta: + return [] + if focus_tags: + tagged = [ + fid + for fid, meta in fact_meta.items() + if focus_tags & set(meta.get("tags") or []) + ] + if avoid_tags: + tagged = [fid for fid in tagged if not (avoid_tags & set(fact_meta.get(fid, {}).get("tags") or []))] + if tagged: + return tagged[:count] + all_ids = list(fact_meta.keys()) + if avoid_tags: + filtered = [fid for fid in all_ids if not (avoid_tags & set(fact_meta.get(fid, 
{}).get("tags") or []))] + if filtered: + return filtered[:count] + return all_ids[:count] + + +def _open_ended_select_facts( + prompt: str, + *, + fact_pack: str, + fact_meta: dict[str, dict[str, Any]], + history_lines: list[str], + focus_tags: set[str], + avoid_tags: set[str], + avoid_fact_ids: list[str], + count: int, + subjective: bool, + state: ThoughtState | None, + step: int, + model: str | None, +) -> list[str]: + if state: + state.update("selecting facts", step=step, note="picking evidence") + focus_hint = ", ".join(sorted(focus_tags)) if focus_tags else "any" + avoid_tag_hint = ", ".join(sorted(avoid_tags)) if avoid_tags else "none" + avoid_hint = ", ".join(avoid_fact_ids) if avoid_fact_ids else "none" + prompt_text = ( + "Select the fact IDs that best answer the question. " + f"Pick up to {count} fact IDs. " + f"Focus tags: {focus_hint}. " + f"Avoid these tags if possible: {avoid_tag_hint}. " + f"Avoid these fact IDs: {avoid_hint}. " + "If the question is subjective, pick standout or unusual facts; " + "if objective, pick the minimal facts needed. " + "Return JSON: {\"fact_ids\":[\"F1\"...],\"note\":\"...\"}." + ) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call( + prompt_text + f" Question: {prompt}", + context=context, + model=model, + ) + fact_ids = result.get("fact_ids") if isinstance(result, dict) else None + selected: list[str] = [] + if isinstance(fact_ids, list): + for fid in fact_ids: + if isinstance(fid, str) and fid in fact_meta and fid not in selected: + selected.append(fid) + if len(selected) >= count: + break + if avoid_tags: + selected = [ + fid + for fid in selected + if not (avoid_tags & set(fact_meta.get(fid, {}).get("tags") or [])) + ] or selected + seed = _fallback_fact_ids( + fact_meta, + focus_tags=focus_tags, + avoid_tags=avoid_tags, + count=count, + ) + if selected: + for fid in seed: + if fid not in selected: + selected.append(fid) + if len(selected) >= count: + break + else: + selected = seed + return selected + + +def _normalize_score(value: Any, *, default: int = 60) -> int: + if isinstance(value, (int, float)): + return int(max(0, min(100, value))) + return default + + +def _confidence_score(value: Any) -> int: + text = str(value or "").strip().lower() + if text.startswith("high"): + return 85 + if text.startswith("low"): + return 35 + return 60 + + +def _risk_penalty(value: Any) -> int: + text = str(value or "").strip().lower() + if text.startswith("high"): + return 20 + if text.startswith("medium"): + return 10 + return 0 + + +def _open_ended_candidate( + prompt: str, + *, + focus: str, + fact_pack: str, + history_lines: list[str], + subjective: bool, + tone: str, + allow_list: bool, + state: ThoughtState | None, + step: int, + fact_hints: list[str] | None = None, + model: str | None = None, +) -> dict[str, Any]: + if state: + state.update("drafting", step=step, note=focus) + hint_text = "" + if fact_hints: + hint_text = " Prioritize these fact IDs if relevant: " + ", ".join(fact_hints) + "." + style_hint = ( + "Offer a brief opinion grounded in facts and explain why it stands out. " + if subjective + else "Answer directly and succinctly. " + ) + list_hint = ( + "If a list is requested, embed it inline in a sentence (comma-separated). " + if allow_list + else "Avoid bullet lists. " + ) + prompt_text = ( + "Using ONLY the fact pack, answer the question focusing on this angle: " + f"{focus}. " + f"Tone: {tone}. " + + style_hint + + list_hint + + "Write 2-4 sentences in plain prose." 
+ + hint_text + + " " + "If you infer, label it as inference. " + "List which fact pack IDs you used. " + "Return JSON: {\"answer\":\"...\",\"facts_used\":[\"F1\"],\"confidence\":\"high|medium|low\"," + "\"relevance\":0-100,\"satisfaction\":0-100,\"risk\":\"low|medium|high\"}." + ) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call( + prompt_text + f" Question: {prompt}", + context=context, + model=model, + ) + if not isinstance(result, dict): + result = {} + answer = str(result.get("answer") or "").strip() + if not answer: + answer = "I don't have enough data to answer that from the current snapshot." + facts_used = result.get("facts_used") + if not isinstance(facts_used, list): + facts_used = [] + candidate = { + "focus": focus, + "answer": answer, + "facts_used": facts_used, + "confidence": result.get("confidence", "medium"), + "relevance": _normalize_score(result.get("relevance"), default=60), + "satisfaction": _normalize_score(result.get("satisfaction"), default=60), + "risk": result.get("risk", "medium"), + } + candidate["score"] = _candidate_score(candidate) + return candidate + + +def _candidate_score(candidate: dict[str, Any]) -> float: + relevance = _normalize_score(candidate.get("relevance"), default=60) + satisfaction = _normalize_score(candidate.get("satisfaction"), default=60) + confidence = _confidence_score(candidate.get("confidence")) + score = relevance * 0.45 + satisfaction * 0.35 + confidence * 0.2 + if not candidate.get("facts_used"): + score -= 5 + return score - _risk_penalty(candidate.get("risk")) + + +def _select_candidates(candidates: list[dict[str, Any]], *, count: int) -> list[dict[str, Any]]: + if not candidates: + return [] + ranked = sorted(candidates, key=lambda item: item.get("score", 0), reverse=True) + picked: list[dict[str, Any]] = [] + seen_focus: set[str] = set() + for item in ranked: + focus = str(item.get("focus") or "").strip().lower() + if focus and focus in seen_focus: + continue + picked.append(item) + if focus: + seen_focus.add(focus) + if len(picked) >= count: + break + return picked or ranked[:count] + + +def _open_ended_synthesize( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], + candidates: list[dict[str, Any]], + subjective: bool, + tone: str, + allow_list: bool, + state: ThoughtState | None, + step: int, + model: str | None, + critique: str | None = None, +) -> str: + if state: + state.update("synthesizing", step=step, note="composing answer") + critique_block = f"\nCritique guidance: {critique}\n" if critique else "\n" + style_hint = ( + "If the question is subjective, share a light opinion grounded in facts and explain why it stands out. " + if subjective + else "Answer directly without extra caveats. " + ) + list_hint = ( + "If a list is requested, embed it inline in a sentence (comma-separated). " + if allow_list + else "Avoid bullet lists. " + ) + synth_prompt = ( + "Compose the final answer to the question using the candidate answers below. " + "Select the best 1-2 candidates, blend them if helpful, and keep 2-4 sentences. " + "Use only the fact pack as evidence. " + "If you infer, label it as inference. " + "Do not claim nodes are missing or not ready unless the fact pack explicitly lists " + "nodes_not_ready or expected_workers_missing. " + f"Tone: {tone}. " + + style_hint + + list_hint + + "Keep the tone conversational and answer the user's intent directly. " + "Avoid repeating the last response if possible. 
" + "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), " + "HallucinationRisk (low|medium|high).\n" + f"Question: {prompt}\n" + f"{critique_block}" + f"Candidates: {json.dumps(candidates, ensure_ascii=False)}" + ) + context = _append_history_context(fact_pack, history_lines) + reply = _ollama_call_safe( + ("open", "synth"), + synth_prompt, + context=context, + fallback="I don't have enough data to answer that.", + system_override=_open_ended_system(), + model=model, + ) + return _ensure_scores(reply) + + +def _open_ended_critique( + prompt: str, + *, + fact_pack: str, + history_lines: list[str], + candidates: list[dict[str, Any]], + state: ThoughtState | None, + step: int, + model: str | None, +) -> str: + if state: + state.update("reviewing", step=step, note="quality check") + critique_prompt = ( + "Review the candidate answers against the fact pack. " + "Identify any missing important detail or risky inference and give one sentence of guidance. " + "Return JSON: {\"guidance\":\"...\",\"risk\":\"low|medium|high\"}." + ) + context = _append_history_context(fact_pack, history_lines) + result = _ollama_json_call( + critique_prompt + f" Question: {prompt} Candidates: {json.dumps(candidates, ensure_ascii=False)}", + context=context, + model=model, + ) + if isinstance(result, dict): + guidance = str(result.get("guidance") or "").strip() + if guidance: + return guidance + return "" + + +def _open_ended_multi( + prompt: str, + *, + fact_pack: str, + fact_lines: list[str], + fact_meta: dict[str, dict[str, Any]], + history_lines: list[str], + state: ThoughtState | None = None, +) -> str: + model = _model_for_mode("deep") + total_steps = _open_ended_total_steps("deep") + if state: + state.total_steps = total_steps + + interpretation = _open_ended_interpret( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + state=state, + model=model, + ) + style = interpretation.get("style") or "objective" + subjective = style == "subjective" or _is_subjective_query(prompt) + tone = str(interpretation.get("tone") or "").strip().lower() + if tone not in ("neutral", "curious", "enthusiastic"): + tone = "curious" if subjective else "neutral" + allow_list = bool(interpretation.get("allow_list")) + focus_tags = set(interpretation.get("focus_tags") or []) or _preferred_tags_for_prompt(prompt) + if not focus_tags and subjective: + focus_tags = set(_ALLOWED_INSIGHT_TAGS) + avoid_tags = _history_focus_tags(history_lines) if (subjective or _is_followup_query(prompt)) else set() + + angles = _open_ended_plan( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + focus_tags=focus_tags, + avoid_tags=avoid_tags, + count=5, + state=state, + step=2, + model=model, + ) + if state and avoid_tags: + state.update("planning", step=2, note=f"avoiding {', '.join(sorted(avoid_tags))}") + + insights = _open_ended_insights( + prompt, + fact_pack=fact_pack, + fact_meta=fact_meta, + history_lines=history_lines, + count=7, + state=state, + step=3, + model=model, + ) + ranked_insights = _rank_insights( + insights, + focus_tags=focus_tags, + avoid_tags=avoid_tags, + count=3, + ) + + candidates: list[dict[str, Any]] = [] + step = 4 + for insight in ranked_insights: + candidates.append( + _open_ended_candidate( + prompt, + focus=insight.get("summary") or "insight", + fact_pack=fact_pack, + history_lines=history_lines, + subjective=subjective, + tone=str(tone), + allow_list=allow_list, + state=state, + step=step, + fact_hints=insight.get("fact_ids") or [], + model=model, + ) + ) + step += 1 + if not 
candidates and angles: + for angle in angles[:2]: + angle_tags = set(angle.get("tags") or []) or _tags_from_text(angle.get("focus") or "") + fact_ids = _open_ended_select_facts( + prompt, + fact_pack=fact_pack, + fact_meta=fact_meta, + history_lines=history_lines, + focus_tags=angle_tags or focus_tags, + avoid_tags=avoid_tags, + avoid_fact_ids=[], + count=4, + subjective=subjective, + state=state, + step=step, + model=model, + ) + candidates.append( + _open_ended_candidate( + prompt, + focus=angle.get("focus") or "alternate angle", + fact_pack=fact_pack, + history_lines=history_lines, + subjective=subjective, + tone=str(tone), + allow_list=allow_list, + state=state, + step=step, + fact_hints=fact_ids, + model=model, + ) + ) + step += 1 + if len(candidates) >= 2: + break + + if state: + state.update("evaluating", step=step, note="ranking candidates") + selected = _select_candidates(candidates, count=2) + step += 1 + critique = _open_ended_critique( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + candidates=selected or candidates, + state=state, + step=step, + model=model, + ) + step += 1 + reply = _open_ended_synthesize( + prompt, + fact_pack=fact_pack, + history_lines=history_lines, + candidates=selected or candidates, + subjective=subjective, + tone=str(tone), + allow_list=allow_list, + state=state, + step=step, + model=model, + critique=critique, + ) + if state: + state.update("done", step=total_steps) + return reply + + +def _open_ended_total_steps(mode: str) -> int: + if mode == "fast": + return 2 + return 9 + + +def _fast_fact_lines( + fact_lines: list[str], + fact_meta: dict[str, dict[str, Any]], + *, + focus_tags: set[str], + avoid_tags: set[str], + primary_tags: set[str] | None = None, + limit: int = 10, +) -> list[str]: + if not fact_lines: + return [] + primary_tags = primary_tags or set() + scored: list[tuple[int, int, str]] = [] + priority_map = {tag: idx for idx, tag in enumerate(_SUBJECTIVE_TAG_PRIORITY)} + use_priority = not primary_tags and focus_tags == _ALLOWED_INSIGHT_TAGS + for idx, line in enumerate(fact_lines): + fid = f"F{idx + 1}" + tags = set(fact_meta.get(fid, {}).get("tags") or []) + if avoid_tags and (avoid_tags & tags): + continue + score = 0 + if primary_tags: + score += 4 * len(tags & primary_tags) + if focus_tags: + score += 2 * len(tags & focus_tags) + if use_priority and tags: + bonus = 0 + for tag in tags: + if tag in priority_map: + bonus = max(bonus, len(priority_map) - priority_map[tag]) + score += bonus + scored.append((score, idx, line)) + scored.sort(key=lambda item: (-item[0], item[1])) + selected: list[str] = [] + for score, _, line in scored: + if score <= 0 and selected: + break + if score > 0: + selected.append(line) + if len(selected) >= limit: + break + if not selected: + selected = [line for _, _, line in scored[:limit]] + elif len(selected) < limit: + for _, _, line in scored: + if line in selected: + continue + selected.append(line) + if len(selected) >= limit: + break + return selected + + +def _has_body_lines(answer: str) -> bool: + lines = [line.strip() for line in (answer or "").splitlines() if line.strip()] + for line in lines: + lowered = line.lower() + if lowered.startswith("confidence"): + continue + if lowered.startswith("relevance"): + continue + if lowered.startswith("satisfaction"): + continue + if lowered.startswith("hallucinationrisk"): + continue + if lowered.startswith("hallucination risk"): + continue + return True + return False + + +def _fallback_fact_answer(prompt: str, context: str) -> str: + facts: 
list[str] = [] + parsed_facts: list[tuple[str, str | None, str | None]] = [] + q = normalize_query(prompt) + tokens = set(_tokens(prompt)) + for line in (context or "").splitlines(): + trimmed = line.strip() + if not trimmed: + continue + if trimmed.startswith("F"): + match = re.match(r"^F\d+.*?\]:\s*(.*)$", trimmed) + if not match: + match = re.match(r"^F\d+:\s*(.*)$", trimmed) + if not match: + continue + fact = match.group(1).strip() + else: + if trimmed.lower().startswith("fact pack") or trimmed.lower().startswith("facts"): + continue + if trimmed.startswith("-"): + fact = trimmed.lstrip("-").strip() + else: + fact = trimmed + if fact.startswith("-"): + fact = fact.lstrip("-").strip() + if fact and (":" in fact or "=" in fact): + facts.append(fact) + key_match = re.match(r"^([\w\s/.-]+):\s*(.+)$", fact) + if not key_match: + key_match = re.match(r"^([\w\s/.-]+)=\s*(.+)$", fact) + if key_match: + parsed_facts.append((fact, key_match.group(1).strip(), key_match.group(2).strip())) + else: + parsed_facts.append((fact, None, None)) + if not facts: + return "" + + def _norm_key(text: str) -> str: + return normalize_query(text).replace(" ", "_") + + def _find_value(target: str) -> str | None: + for _fact, key, val in parsed_facts: + if key and _norm_key(key) == target: + return val + return None + + def _parse_counts(text: str) -> dict[str, int]: + counts: dict[str, int] = {} + for part in (text or "").split(","): + if "=" not in part: + continue + k, v = part.split("=", 1) + k = k.strip() + v = v.strip() + if not k or not v: + continue + try: + counts[k] = int(float(v)) + except ValueError: + continue + return counts + + def _parse_map(text: str) -> dict[str, str]: + mapping: dict[str, str] = {} + pattern = re.compile(r"(\w+)\s*=\s*([^=]+?)(?=(?:\s*,\s*\w+\s*=)|$)") + for match in pattern.finditer(text or ""): + mapping[match.group(1).strip()] = match.group(2).strip().strip(",") + return mapping + + list_intent = _is_list_prompt(prompt) or "name" in tokens + count_intent = _is_quantitative_prompt(prompt) and ("how many" in q or "count" in tokens or "number" in tokens) + hottest_intent = any(word in q for word in ("hottest", "highest", "most", "top", "busiest")) + metric = _detect_metric(q) + include_hw, _exclude_hw = _detect_hardware_filters(q) + + if hottest_intent and metric in {"cpu", "ram", "net", "io"}: + hottest_val = _find_value(f"hottest_{metric}") + if hottest_val: + return f"Hottest {metric} is {hottest_val}." + if hottest_intent and tokens & {"postgres", "database", "db", "connections"}: + hottest_db = _find_value("postgres_hottest_db") + if hottest_db: + return f"Hottest database is {hottest_db}." + + if count_intent and tokens & {"pods", "pod"}: + pending = _find_value("pods_pending") + failed = _find_value("pods_failed") + running = _find_value("pods_running") + succeeded = _find_value("pods_succeeded") + if "pending" in q and "failed" in q: + try: + total = float(pending or 0) + float(failed or 0) + return f"Pods pending or failed: {total:.0f}." + except ValueError: + pass + if "pending" in q and pending is not None: + return f"Pods pending is {pending}." + if "failed" in q and failed is not None: + return f"Pods failed is {failed}." + if "succeeded" in q and succeeded is not None: + return f"Pods succeeded is {succeeded}." + if "running" in q and running is not None: + return f"Pods running is {running}." 
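The count/map parsing in this fallback path is easiest to see with concrete values. Below is a simplified, self-contained restatement of the _parse_counts and _parse_map helpers defined above, exercised against hypothetical fact-pack values (the hardware names and node names are made up for illustration, not taken from the repo):

import re

def parse_counts(text: str) -> dict[str, int]:
    # "rpi5=6, rpi4=3" -> {"rpi5": 6, "rpi4": 3}
    counts: dict[str, int] = {}
    for part in (text or "").split(","):
        if "=" not in part:
            continue
        key, value = part.split("=", 1)
        key, value = key.strip(), value.strip()
        if not key or not value:
            continue
        try:
            counts[key] = int(float(value))
        except ValueError:
            continue
    return counts

def parse_map(text: str) -> dict[str, str]:
    # Same regex as _parse_map above: values may contain spaces, entries are
    # separated by ", <key>=".
    pattern = re.compile(r"(\w+)\s*=\s*([^=]+?)(?=(?:\s*,\s*\w+\s*=)|$)")
    return {m.group(1).strip(): m.group(2).strip().strip(",") for m in pattern.finditer(text or "")}

assert parse_counts("rpi5=6, rpi4=3") == {"rpi5": 6, "rpi4": 3}
assert parse_map("rpi5=node-a node-b, jetson=node-c") == {"rpi5": "node-a node-b", "jetson": "node-c"}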
+ + if count_intent and tokens & {"nodes", "node"} and "not ready" in q: + nodes_total = _find_value("nodes_total") + if nodes_total and "not_ready" in nodes_total: + match = re.search(r"not_ready=([0-9.]+)", nodes_total) + if match: + return f"Not ready nodes: {match.group(1)}." + + if count_intent and include_hw: + counts_line = _find_value("nodes_by_hardware_count") + if counts_line: + counts = _parse_counts(counts_line) + for hw in include_hw: + if hw in counts: + return f"{hw} nodes: {counts[hw]}." + for hw in include_hw: + hw_line = _find_value(hw) + if hw_line: + items = [item.strip() for item in hw_line.split(",") if item.strip()] + return f"{hw} nodes: {len(items)}." + + if list_intent and include_hw: + if "control" in q: + cp_by_hw = _find_value("control_plane_by_hardware") + if cp_by_hw: + mapping = _parse_map(cp_by_hw) + for hw in include_hw: + if hw in mapping: + return f"{hw} control-plane nodes: {mapping[hw]}." + cp_nodes = _find_value("control_plane_nodes") + if cp_nodes: + return f"Control-plane nodes: {cp_nodes}." + for hw in include_hw: + hw_line = _find_value(hw) + if hw_line: + return f"{hw} nodes: {hw_line}." + + if list_intent and "control" in q: + cp_nodes = _find_value("control_plane_nodes") + if cp_nodes: + return f"Control-plane nodes: {cp_nodes}." + + preferred = tokens & { + "node", + "nodes", + "pod", + "pods", + "postgres", + "db", + "database", + "namespace", + "workload", + "worker", + "workers", + "cpu", + "ram", + "memory", + "net", + "network", + "io", + "disk", + "connection", + "connections", + } + best_fact = "" + best_score = -1 + for fact in facts: + key_match = re.match(r"^([\w\s/.-]+):\s*(.+)$", fact) + if not key_match: + key_match = re.match(r"^([\w\s/.-]+)=\s*(.+)$", fact) + key_tokens: set[str] = set() + if key_match: + key_tokens = set(_tokens(key_match.group(1))) + score = len(tokens & set(_tokens(fact))) + 2 * len(tokens & key_tokens) + if preferred: + score += 3 * len(preferred & key_tokens) + if not (preferred & key_tokens): + score -= 1 + if list_intent and key_match and "count" in key_tokens: + score -= 3 + if score > best_score: + best_score = score + best_fact = fact + if best_score <= 0: + return "" + key_match = re.match(r"^([\w\s/.-]+):\s*(.+)$", best_fact) + if not key_match: + key_match = re.match(r"^([\w\s/.-]+)=\s*(.+)$", best_fact) + if key_match: + key = key_match.group(1).strip().replace("_", " ") + val = key_match.group(2).strip() + sentence = f"{key.capitalize()} is {val}" + else: + sentence = f"Based on the snapshot, {best_fact}" + if not sentence.endswith((".", "!", "?")): + sentence += "." 
+ return sentence + + +def _is_quantitative_prompt(prompt: str) -> bool: + q = normalize_query(prompt) + if not q: + return False + tokens = set(_tokens(prompt)) + if "how many" in q or "count" in tokens or "total" in tokens: + return True + if tokens & {"highest", "lowest", "hottest", "most", "least"}: + return True + return False + + +def _is_list_prompt(prompt: str) -> bool: + q = normalize_query(prompt) + if not q: + return False + if any(phrase in q for phrase in ("list", "names", "name", "show")): + return True + if any(phrase in q for phrase in ("which nodes", "what nodes", "what are the nodes")): + return True + return False + + +def _needs_full_fact_pack(prompt: str) -> bool: + q = normalize_query(prompt) + tokens = set(_tokens(prompt)) + if _is_quantitative_prompt(prompt) or _is_list_prompt(prompt): + return True + if tokens & {"workload", "pods", "namespace", "worker", "workers"}: + return True + if tokens & {"arch", "architecture", "hardware"}: + return True + if tokens & METRIC_HINT_WORDS: + return True + if _NAME_INDEX and tokens & _NAME_INDEX: + return True + if any(phrase in q for phrase in ("where does", "where is", "where are", "running", "run on", "hosted on", "primary node")): + return True + return False + + +def _open_ended_fast_single( + prompt: str, + *, + context: str, + history_lines: list[str] | None = None, + state: ThoughtState | None = None, + model: str, +) -> str: + if state: + state.update("drafting", step=1, note="summarizing") + working_context = _append_history_context(context, history_lines or []) if history_lines else context + reply = _ollama_call( + ("atlasbot_fast", "atlasbot_fast"), + prompt, + context=working_context, + use_history=False, + system_override=_open_ended_system(), + model=model, + ) + if not _has_body_lines(reply): + reply = _ollama_call( + ("atlasbot_fast", "atlasbot_fast"), + prompt + " Provide one clear sentence before the score lines.", + context=working_context, + use_history=False, + system_override=_open_ended_system(), + model=model, + ) + fallback = _fallback_fact_answer(prompt, context) + if fallback and (_is_quantitative_prompt(prompt) or not _has_body_lines(reply)): + reply = fallback + if not _has_body_lines(reply): + reply = "I don't have enough data in the current snapshot to answer that." 
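As a side note on the guard rails used here: the fast path re-prompts (and then falls back to the deterministic fact answer) whenever the model returns only the trailing score lines. A minimal sketch of that body-detection check, restated outside the bot module with hypothetical replies:

def has_body_lines(answer: str) -> bool:
    # Simplified restatement of _has_body_lines: a reply "has a body" if any
    # non-empty line is not one of the trailing score lines.
    score_prefixes = ("confidence", "relevance", "satisfaction", "hallucinationrisk", "hallucination risk")
    for line in (answer or "").splitlines():
        stripped = line.strip()
        if not stripped:
            continue
        if stripped.lower().startswith(score_prefixes):
            continue
        return True
    return False

# Hypothetical replies for illustration.
scores_only = "Confidence: high\nRelevance: 80\nSatisfaction: 75\nHallucinationRisk: low"
with_body = "Pods pending is 2.\n" + scores_only
assert not has_body_lines(scores_only)  # would trigger the re-prompt / fallback path
assert has_body_lines(with_body)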
+ if state: + state.update("done", step=_open_ended_total_steps("fast")) + return _ensure_scores(reply) + + +def _open_ended_fast( + prompt: str, + *, + fact_pack: str, + fact_lines: list[str], + fact_meta: dict[str, dict[str, Any]], + history_lines: list[str], + state: ThoughtState | None = None, +) -> str: + model = _model_for_mode("fast") + subjective = _is_subjective_query(prompt) + primary_tags = _primary_tags_for_prompt(prompt) + focus_tags = _preferred_tags_for_prompt(prompt) + if not focus_tags and subjective: + focus_tags = set(_ALLOWED_INSIGHT_TAGS) + avoid_tags = _history_focus_tags(history_lines) if (subjective or _is_followup_query(prompt)) else set() + selected_lines = _fast_fact_lines( + fact_lines, + fact_meta, + focus_tags=focus_tags, + avoid_tags=avoid_tags, + primary_tags=primary_tags, + ) + selected_meta = _fact_pack_meta(selected_lines) + selected_pack = _fact_pack_text(selected_lines, selected_meta) + if _needs_full_fact_pack(prompt) or not selected_lines: + selected_pack = fact_pack + if not subjective and _needs_full_fact_pack(prompt): + fallback = _fallback_fact_answer(prompt, fact_pack) + if fallback: + return _ensure_scores(fallback) + if state: + state.total_steps = _open_ended_total_steps("fast") + return _open_ended_fast_single( + prompt, + context=selected_pack, + history_lines=history_lines, + state=state, + model=model, + ) + + +def _open_ended_deep( + prompt: str, + *, + fact_pack: str, + fact_lines: list[str], + fact_meta: dict[str, dict[str, Any]], + history_lines: list[str], + state: ThoughtState | None = None, +) -> str: + return _open_ended_multi( + prompt, + fact_pack=fact_pack, + fact_lines=fact_lines, + fact_meta=fact_meta, + history_lines=history_lines, + state=state, + ) + + +def open_ended_answer( + prompt: str, + *, + inventory: list[dict[str, Any]], + snapshot: dict[str, Any] | None, + workloads: list[dict[str, Any]], + history_lines: list[str], + mode: str, + allow_tools: bool, + state: ThoughtState | None = None, +) -> str: + lines = _fact_pack_lines(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads) + if _knowledge_intent(prompt) or _doc_intent(prompt): + kb_detail = kb_retrieve(prompt) + if kb_detail: + for line in kb_detail.splitlines(): + if line.strip(): + lines.append(line.strip()) + tool_lines = _tool_fact_lines(prompt, allow_tools=allow_tools) + if tool_lines: + lines.extend(tool_lines) + if not lines: + return _ensure_scores("I don't have enough data to answer that.") + fact_meta = _fact_pack_meta(lines) + fact_pack = _fact_pack_text(lines, fact_meta) + if mode == "fast": + return _open_ended_fast( + prompt, + fact_pack=fact_pack, + fact_lines=lines, + fact_meta=fact_meta, + history_lines=history_lines, + state=state, + ) + return _open_ended_deep( + prompt, + fact_pack=fact_pack, + fact_lines=lines, + fact_meta=fact_meta, + history_lines=history_lines, + state=state, + ) + + +def _non_cluster_reply(prompt: str, *, history_lines: list[str], mode: str) -> str: + system = ( + "System: You are Atlas, a helpful general assistant. " + "Answer using common knowledge when possible, and say when you're unsure. " + "Be concise and avoid unnecessary caveats. " + "Respond in plain sentences (no lists unless asked). " + "End every response with a line: 'Confidence: high|medium|low'." 
+ ) + model = _model_for_mode(mode) + context = _append_history_context("", history_lines) if history_lines else "" + reply = _ollama_call( + ("general", "reply"), + prompt, + context=context, + use_history=False, + system_override=system, + model=model, + ) + reply = re.sub(r"\bconfidence\s*:\s*(high|medium|low)\b\.?\s*", "", reply, flags=re.IGNORECASE).strip() + return _ensure_scores(reply) + + +# Internal HTTP endpoint for cluster answers (website uses this). +class _AtlasbotHandler(BaseHTTPRequestHandler): + server_version = "AtlasbotHTTP/1.0" + + def _write_json(self, status: int, payload: dict[str, Any]): + body = json.dumps(payload).encode("utf-8") + self.send_response(status) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def _authorized(self) -> bool: + if not ATLASBOT_INTERNAL_TOKEN: + return True + token = self.headers.get("X-Internal-Token", "") + return token == ATLASBOT_INTERNAL_TOKEN + + def do_GET(self): # noqa: N802 + if self.path == "/health": + self._write_json(200, {"status": "ok"}) + return + self._write_json(404, {"error": "not_found"}) + + def do_POST(self): # noqa: N802 + if self.path != "/v1/answer": + self._write_json(404, {"error": "not_found"}) + return + if not self._authorized(): + self._write_json(401, {"error": "unauthorized"}) + return + try: + length = int(self.headers.get("Content-Length", "0")) + except ValueError: + length = 0 + raw = self.rfile.read(length) if length > 0 else b"" + try: + payload = json.loads(raw.decode("utf-8")) if raw else {} + except json.JSONDecodeError: + self._write_json(400, {"error": "invalid_json"}) + return + prompt = str(payload.get("prompt") or payload.get("question") or "").strip() + if not prompt: + self._write_json(400, {"error": "missing_prompt"}) + return + cleaned = _strip_bot_mention(prompt) + mode = str(payload.get("mode") or "deep").lower() + if mode in ("quick", "fast"): + mode = "fast" + elif mode in ("smart", "deep"): + mode = "deep" + else: + mode = "deep" + snapshot = _snapshot_state() + inventory = _snapshot_inventory(snapshot) or node_inventory_live() + workloads = _snapshot_workloads(snapshot) + history_payload = payload.get("history") or [] + history_lines = _history_payload_lines(history_payload) + history_cluster = _history_mentions_cluster( + history_lines, + inventory=inventory, + workloads=workloads, + ) + followup = _is_followup_query(cleaned) + cleaned_q = normalize_query(cleaned) + cluster_affinity = _is_cluster_query(cleaned, inventory=inventory, workloads=workloads) + subjective = _is_subjective_query(cleaned) + followup_affinity = any(word in cleaned_q for word in METRIC_HINT_WORDS) + contextual = history_cluster and (followup or followup_affinity) + cluster_query = cluster_affinity or contextual + context = "" + if cluster_query: + context = build_context( + cleaned, + allow_tools=True, + targets=[], + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + ) + if cluster_query: + answer = open_ended_answer( + cleaned, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + history_lines=history_lines, + mode=mode, + allow_tools=True, + state=None, + ) + else: + answer = _non_cluster_reply(cleaned, history_lines=history_lines, mode=mode) + self._write_json(200, {"answer": answer}) + + +def _start_http_server(): + server = HTTPServer(("0.0.0.0", ATLASBOT_HTTP_PORT), _AtlasbotHandler) + thread = threading.Thread(target=server.serve_forever, daemon=True) + 
thread.start() + # Conversation state. history = collections.defaultdict(list) # (room_id, sender|None) -> list[str] (short transcript) @@ -449,17 +4405,56 @@ history = collections.defaultdict(list) # (room_id, sender|None) -> list[str] ( def key_for(room_id: str, sender: str, is_dm: bool): return (room_id, None) if is_dm else (room_id, sender) -def build_context(prompt: str, *, allow_tools: bool, targets: list[tuple[str, str]]) -> str: + +def _history_mentions_cluster( + history_lines: list[str], + *, + inventory: list[dict[str, Any]] | None = None, + workloads: list[dict[str, Any]] | None = None, +) -> bool: + recent = [line for line in history_lines[-8:] if isinstance(line, str)] + for line in recent: + cleaned = normalize_query(line) + if not cleaned: + continue + if _is_cluster_query(cleaned, inventory=inventory, workloads=workloads): + return True + return False + +def build_context( + prompt: str, + *, + allow_tools: bool, + targets: list[tuple[str, str]], + inventory: list[dict[str, Any]] | None = None, + snapshot: dict[str, Any] | None = None, + workloads: list[dict[str, Any]] | None = None, +) -> str: parts: list[str] = [] - kb = kb_retrieve(prompt) - if kb: - parts.append(kb) + facts = facts_context(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads) + if facts: + parts.append(facts) + + snapshot_json = snapshot_compact_context( + prompt, + snapshot, + inventory=inventory, + workloads=workloads, + ) + if snapshot_json: + parts.append(snapshot_json) endpoints, edges = catalog_hints(prompt) if endpoints: parts.append(endpoints) + kb = kb_retrieve(prompt) + if not kb and _knowledge_intent(prompt): + kb = kb_retrieve_titles(prompt, limit=4) + if kb: + parts.append(kb) + if allow_tools: # Scope pod summaries to relevant namespaces/workloads when possible. prefixes_by_ns: dict[str, set[str]] = collections.defaultdict(set) @@ -478,44 +4473,352 @@ def build_context(prompt: str, *, allow_tools: bool, targets: list[tuple[str, st if flux_bad: parts.append("Flux (not ready):\n" + flux_bad) - p_l = (prompt or "").lower() - if any(w in p_l for w in METRIC_HINT_WORDS): - restarts = vm_top_restarts(1) - if restarts: - parts.append("VictoriaMetrics (top restarts 1h):\n" + restarts) - snap = vm_cluster_snapshot() - if snap: - parts.append("VictoriaMetrics (cluster snapshot):\n" + snap) - return "\n\n".join([p for p in parts if p]).strip() -def ollama_reply(hist_key, prompt: str, *, context: str) -> str: - try: - system = ( - "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " - "Be helpful, direct, and concise. " - "Prefer answering with exact repo paths and Kubernetes resource names. " - "Never include or request secret values." - ) - transcript_parts = [system] - if context: - transcript_parts.append("Context (grounded):\n" + context[:MAX_KB_CHARS]) - transcript_parts.extend(history[hist_key][-24:]) - transcript_parts.append(f"User: {prompt}") - transcript = "\n".join(transcript_parts) - payload = {"model": MODEL, "message": transcript} - headers = {"Content-Type": "application/json"} - if API_KEY: - headers["x-api-key"] = API_KEY - r = request.Request(OLLAMA_URL, data=json.dumps(payload).encode(), headers=headers) - with request.urlopen(r, timeout=20) as resp: - data = json.loads(resp.read().decode()) - reply = data.get("message") or data.get("response") or data.get("reply") or "I'm here to help." 
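For reference, the internal answer endpoint started above accepts a small JSON contract: POST /v1/answer with a prompt, an optional mode ("fast"/"quick" or "deep"/"smart"), optional history, and an X-Internal-Token header when a token is configured; it returns {"answer": "..."} and exposes GET /health. A minimal client sketch, assuming a placeholder in-cluster URL and token (the real service name, port, and token are deployment-specific and not taken from this repo):

import json
import os
from urllib import request

# Placeholder URL/token for illustration only.
ATLASBOT_URL = os.environ.get("ATLASBOT_URL", "http://atlasbot.comms.svc:8080")
token = os.environ.get("ATLASBOT_INTERNAL_TOKEN", "")

payload = {
    "prompt": "how many pods are pending?",
    "mode": "fast",
    # "history" is optional; its entries are normalized by _history_payload_lines.
}
req = request.Request(
    f"{ATLASBOT_URL}/v1/answer",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json", "X-Internal-Token": token},
)
with request.urlopen(req, timeout=60) as resp:
    print(json.loads(resp.read().decode("utf-8"))["answer"])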
+def snapshot_context(prompt: str, snapshot: dict[str, Any] | None) -> str: + if not snapshot: + return "" + metrics = _snapshot_metrics(snapshot) + workloads = _snapshot_workloads(snapshot) + q = normalize_query(prompt) + parts: list[str] = [] + nodes = snapshot.get("nodes") if isinstance(snapshot.get("nodes"), dict) else {} + if nodes.get("total") is not None: + parts.append( + f"Snapshot: nodes_total={nodes.get('total')}, ready={nodes.get('ready')}, not_ready={nodes.get('not_ready')}." + ) + if any(word in q for word in ("postgres", "connections", "db")): + postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {} + if postgres: + parts.append(f"Snapshot: postgres_connections={postgres}.") + if any(word in q for word in ("hottest", "cpu", "ram", "memory", "net", "network", "io", "disk")): + hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {} + if hottest: + parts.append(f"Snapshot: hottest_nodes={hottest}.") + if workloads and any(word in q for word in ("run", "running", "host", "node", "where", "which")): + match = _select_workload(prompt, workloads) + if match: + parts.append(f"Snapshot: workload={match}.") + return "\n".join(parts).strip() + +def _compact_nodes_detail(snapshot: dict[str, Any]) -> list[dict[str, Any]]: + details = snapshot.get("nodes_detail") if isinstance(snapshot.get("nodes_detail"), list) else [] + output: list[dict[str, Any]] = [] + for node in details: + if not isinstance(node, dict): + continue + name = node.get("name") + if not name: + continue + output.append( + { + "name": name, + "ready": node.get("ready"), + "hardware": node.get("hardware"), + "arch": node.get("arch"), + "roles": node.get("roles"), + "is_worker": node.get("is_worker"), + "os": node.get("os"), + "kernel": node.get("kernel"), + "kubelet": node.get("kubelet"), + "container_runtime": node.get("container_runtime"), + } + ) + return output + +def _compact_metrics(snapshot: dict[str, Any]) -> dict[str, Any]: + metrics = snapshot.get("metrics") if isinstance(snapshot.get("metrics"), dict) else {} + return { + "pods_running": metrics.get("pods_running"), + "pods_pending": metrics.get("pods_pending"), + "pods_failed": metrics.get("pods_failed"), + "pods_succeeded": metrics.get("pods_succeeded"), + "postgres_connections": metrics.get("postgres_connections"), + "hottest_nodes": metrics.get("hottest_nodes"), + "node_usage": metrics.get("node_usage"), + "top_restarts_1h": metrics.get("top_restarts_1h"), + } + +def snapshot_compact_context( + prompt: str, + snapshot: dict[str, Any] | None, + *, + inventory: list[dict[str, Any]] | None, + workloads: list[dict[str, Any]] | None, +) -> str: + if not snapshot: + return "" + compact = { + "collected_at": snapshot.get("collected_at"), + "nodes_summary": snapshot.get("nodes_summary"), + "expected_workers": expected_worker_nodes_from_metrics(), + "nodes_detail": _compact_nodes_detail(snapshot), + "workloads": _workloads_for_prompt(prompt, workloads or [], limit=40) if workloads else [], + "metrics": _compact_metrics(snapshot), + "flux": snapshot.get("flux"), + "errors": snapshot.get("errors"), + } + text = json.dumps(compact, ensure_ascii=False) + if len(text) > MAX_FACTS_CHARS: + text = text[: MAX_FACTS_CHARS - 3].rstrip() + "..." 
+ return "Cluster snapshot (JSON):\n" + text + + +def _knowledge_intent(prompt: str) -> bool: + q = normalize_query(prompt) + return any( + phrase in q + for phrase in ( + "what do you know", + "tell me about", + "interesting", + "overview", + "summary", + "describe", + "explain", + ) + ) + + +def _is_cluster_query( + prompt: str, + *, + inventory: list[dict[str, Any]] | None, + workloads: list[dict[str, Any]] | None, +) -> bool: + q = normalize_query(prompt) + if not q: + return False + if TITAN_NODE_RE.search(q): + return True + if any(word in q for word in CLUSTER_HINT_WORDS): + return True + if any(word in q for word in METRIC_HINT_WORDS): + return True + for host_match in HOST_RE.finditer(q): + host = host_match.group(1).lower() + if host.endswith("bstein.dev"): + return True + tokens = set(_tokens(q)) + if _NAME_INDEX and tokens & _NAME_INDEX: + return True + return False + + +def _inventory_summary(inventory: list[dict[str, Any]]) -> str: + if not inventory: + return "" + groups = _group_nodes(inventory) + total = len(inventory) + ready = [n for n in inventory if n.get("ready") is True] + not_ready = [n for n in inventory if n.get("ready") is False] + parts = [f"Atlas cluster: {total} nodes ({len(ready)} ready, {len(not_ready)} not ready)."] + for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"): + nodes = groups.get(key) or [] + if nodes: + parts.append(f"- {key}: {len(nodes)} nodes ({', '.join(nodes)})") + return "\n".join(parts) + + +def knowledge_summary(prompt: str, inventory: list[dict[str, Any]]) -> str: + parts: list[str] = [] + inv = _inventory_summary(inventory) + if inv: + parts.append(inv) + kb_titles = kb_retrieve_titles(prompt, limit=4) + if kb_titles: + parts.append(kb_titles) + summary = "\n".join(parts).strip() + return _format_confidence(summary, "medium") if summary else "" + +def _ollama_call( + hist_key, + prompt: str, + *, + context: str, + use_history: bool = True, + system_override: str | None = None, + model: str | None = None, +) -> str: + system = system_override or ( + "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " + "Be helpful, direct, and concise. " + "Use the provided context and facts as your source of truth. " + "If the context includes a cluster snapshot, treat the question as about the Atlas/Othrys cluster even if the prompt is ambiguous. " + "When a cluster snapshot is provided, never answer about unrelated meanings of 'Atlas' (maps, mythology, Apache Atlas, etc). " + "Treat 'hottest' as highest utilization (CPU/RAM/NET/IO) rather than temperature. " + "If you infer or synthesize, say 'Based on the snapshot' and keep it brief. " + "For subjective prompts (interesting, favorite, unconventional), pick one or two observations from the context, explain why they stand out in 1-2 sentences, and avoid repeating the same observation as the last response if you can. " + "Prefer exact repo paths and Kubernetes resource names when relevant. " + "Never include or request secret values. " + "Do not suggest commands unless explicitly asked. " + "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. " + "Translate metrics into natural language instead of echoing raw label/value pairs. " + "When providing counts or totals, use the exact numbers from the context; do not invent or truncate. " + "Avoid bare lists unless the user asked for a list; weave numbers into sentences. 
" + "Do not answer by only listing runbooks; if the question is about Atlas/Othrys, summarize the cluster first and mention docs only if useful. " + "If the question is not about Atlas/Othrys and no cluster context is provided, answer using general knowledge and say when you are unsure. " + "If the answer is not grounded in the provided context or tool data, say you do not know. " + "End every response with a line: 'Confidence: high|medium|low'." + ) + endpoint = _ollama_endpoint() + if not endpoint: + raise RuntimeError("ollama endpoint missing") + messages: list[dict[str, str]] = [{"role": "system", "content": system}] + if context: + messages.append({"role": "user", "content": "Context (grounded):\n" + context[:MAX_CONTEXT_CHARS]}) + if use_history: + messages.extend(_history_to_messages(history[hist_key][-24:])) + messages.append({"role": "user", "content": prompt}) + + model_name = model or MODEL + payload = {"model": model_name, "messages": messages, "stream": False} + headers = {"Content-Type": "application/json"} + if API_KEY: + headers["x-api-key"] = API_KEY + r = request.Request(endpoint, data=json.dumps(payload).encode(), headers=headers) + lock = _OLLAMA_LOCK if OLLAMA_SERIALIZE else None + if lock: + lock.acquire() + try: + try: + with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: + data = json.loads(resp.read().decode()) + except error.HTTPError as exc: + if exc.code == 404 and FALLBACK_MODEL and FALLBACK_MODEL != payload["model"]: + payload["model"] = FALLBACK_MODEL + r = request.Request(endpoint, data=json.dumps(payload).encode(), headers=headers) + with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: + data = json.loads(resp.read().decode()) + else: + raise + msg = data.get("message") if isinstance(data, dict) else None + if isinstance(msg, dict): + raw_reply = msg.get("content") + else: + raw_reply = data.get("response") or data.get("reply") or data + reply = _normalize_reply(raw_reply) or "I'm here to help." + if use_history: history[hist_key].append(f"Atlas: {reply}") - return reply - except Exception: - return "I’m here — but I couldn’t reach the model backend." + return reply + finally: + if lock: + lock.release() + +def ollama_reply( + hist_key, + prompt: str, + *, + context: str, + fallback: str = "", + use_history: bool = True, + model: str | None = None, +) -> str: + last_error = None + for attempt in range(max(1, OLLAMA_RETRIES + 1)): + try: + return _ollama_call( + hist_key, + prompt, + context=context, + use_history=use_history, + model=model, + ) + except Exception as exc: # noqa: BLE001 + last_error = exc + time.sleep(min(4, 2 ** attempt)) + if fallback: + if use_history: + history[hist_key].append(f"Atlas: {fallback}") + return fallback + return "I don't have enough data to answer that." 
+ +def ollama_reply_with_thinking( + token: str, + room: str, + hist_key, + prompt: str, + *, + context: str, + fallback: str, + use_history: bool = True, + model: str | None = None, +) -> str: + result: dict[str, str] = {"reply": ""} + done = threading.Event() + + def worker(): + result["reply"] = ollama_reply( + hist_key, + prompt, + context=context, + fallback=fallback, + use_history=use_history, + model=model, + ) + done.set() + + thread = threading.Thread(target=worker, daemon=True) + thread.start() + if not done.wait(2.0): + send_msg(token, room, "Thinking…") + prompt_hint = " ".join((prompt or "").split()) + if len(prompt_hint) > 160: + prompt_hint = prompt_hint[:157] + "…" + heartbeat = max(10, THINKING_INTERVAL_SEC) + next_heartbeat = time.monotonic() + heartbeat + while not done.wait(max(0, next_heartbeat - time.monotonic())): + if prompt_hint: + send_msg(token, room, f"Still thinking about: {prompt_hint} (gathering context)") + else: + send_msg(token, room, "Still thinking (gathering context)…") + next_heartbeat += heartbeat + thread.join(timeout=1) + return result["reply"] or fallback or "Model backend is busy. Try again in a moment." + + +def open_ended_with_thinking( + token: str, + room: str, + prompt: str, + *, + inventory: list[dict[str, Any]], + snapshot: dict[str, Any] | None, + workloads: list[dict[str, Any]], + history_lines: list[str], + mode: str, + allow_tools: bool, +) -> str: + result: dict[str, str] = {"reply": ""} + done = threading.Event() + total_steps = _open_ended_total_steps(mode) + state = ThoughtState(total_steps=total_steps) + + def worker(): + result["reply"] = open_ended_answer( + prompt, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + history_lines=history_lines, + mode=mode, + allow_tools=allow_tools, + state=state, + ) + done.set() + + thread = threading.Thread(target=worker, daemon=True) + thread.start() + if not done.wait(2.0): + send_msg(token, room, "Thinking…") + heartbeat = max(10, THINKING_INTERVAL_SEC) + next_heartbeat = time.monotonic() + heartbeat + while not done.wait(max(0, next_heartbeat - time.monotonic())): + send_msg(token, room, state.status_line()) + next_heartbeat += heartbeat + thread.join(timeout=1) + return result["reply"] or "Model backend is busy. Try again in a moment." def sync_loop(token: str, room_id: str): since = None @@ -569,7 +4872,11 @@ def sync_loop(token: str, room_id: str): if not (is_dm or mentioned): continue - # Only do live cluster/metrics introspection in DMs. + cleaned_body = _strip_bot_mention(body) + lower_body = cleaned_body.lower() + mode = _detect_mode_from_body(body, default="deep" if is_dm else "deep") + + # Only do live cluster introspection in DMs. allow_tools = is_dm promql = "" @@ -580,7 +4887,7 @@ def sync_loop(token: str, room_id: str): # Attempt to scope tools to the most likely workloads when hostnames are mentioned. 
targets: list[tuple[str, str]] = [] - for m in HOST_RE.finditer(body.lower()): + for m in HOST_RE.finditer(lower_body): host = m.group(1).lower() for ep in _HOST_INDEX.get(host, []): backend = ep.get("backend") or {} @@ -589,14 +4896,60 @@ def sync_loop(token: str, room_id: str): if isinstance(w, dict) and w.get("name"): targets.append((ns, str(w["name"]))) - context = build_context(body, allow_tools=allow_tools, targets=targets) + snapshot = _snapshot_state() + inventory = node_inventory_for_prompt(cleaned_body) + if not inventory: + inventory = _snapshot_inventory(snapshot) + workloads = _snapshot_workloads(snapshot) + history_cluster = _history_mentions_cluster( + history[hist_key], + inventory=inventory, + workloads=workloads, + ) + followup = _is_followup_query(cleaned_body) + cleaned_q = normalize_query(cleaned_body) + cluster_affinity = _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads) + subjective = _is_subjective_query(cleaned_body) + followup_affinity = any(word in cleaned_q for word in METRIC_HINT_WORDS) + contextual = history_cluster and (followup or followup_affinity) + cluster_query = cluster_affinity or contextual + context = "" + if cluster_query: + context = build_context( + cleaned_body, + allow_tools=allow_tools, + targets=targets, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + ) if allow_tools and promql: res = vm_query(promql, timeout=20) rendered = vm_render_result(res, limit=15) or "(no results)" extra = "VictoriaMetrics (PromQL result):\n" + rendered - context = (context + "\n\n" + extra).strip() if context else extra - reply = ollama_reply(hist_key, body, context=context) + send_msg(token, rid, extra) + continue + if cluster_query: + reply = open_ended_with_thinking( + token, + rid, + cleaned_body, + inventory=inventory, + snapshot=snapshot, + workloads=workloads, + history_lines=history[hist_key], + mode=mode if mode in ("fast", "deep") else "deep", + allow_tools=allow_tools, + ) + else: + reply = _non_cluster_reply( + cleaned_body, + history_lines=history[hist_key], + mode=mode if mode in ("fast", "deep") else "deep", + ) send_msg(token, rid, reply) + history[hist_key].append(f"Atlas: {reply}") + history[hist_key] = history[hist_key][-80:] def login_with_retry(): last_err = None @@ -610,6 +4963,7 @@ def login_with_retry(): def main(): load_kb() + _start_http_server() token = login_with_retry() try: room_id = resolve_alias(token, ROOM_ALIAS) diff --git a/services/comms/secretproviderclass.yaml b/services/comms/secretproviderclass.yaml index 69d4b2b..0a89552 100644 --- a/services/comms/secretproviderclass.yaml +++ b/services/comms/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "comms" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/comms" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/crypto/xmr-miner/secretproviderclass.yaml b/services/crypto/xmr-miner/secretproviderclass.yaml index a72097f..12e4ba1 100644 --- a/services/crypto/xmr-miner/secretproviderclass.yaml +++ b/services/crypto/xmr-miner/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "crypto" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/crypto" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/finance/actual-budget-deployment.yaml 
b/services/finance/actual-budget-deployment.yaml index 55186b2..637e9ae 100644 --- a/services/finance/actual-budget-deployment.yaml +++ b/services/finance/actual-budget-deployment.yaml @@ -90,6 +90,8 @@ spec: value: openid - name: ACTUAL_MULTIUSER value: "true" + - name: ACTUAL_USER_CREATION_MODE + value: login - name: ACTUAL_OPENID_DISCOVERY_URL value: https://sso.bstein.dev/realms/atlas - name: ACTUAL_OPENID_AUTHORIZATION_ENDPOINT @@ -128,6 +130,8 @@ spec: value: openid - name: ACTUAL_MULTIUSER value: "true" + - name: ACTUAL_USER_CREATION_MODE + value: login - name: ACTUAL_OPENID_DISCOVERY_URL value: https://sso.bstein.dev/realms/atlas - name: ACTUAL_OPENID_AUTHORIZATION_ENDPOINT diff --git a/services/finance/firefly-cronjob.yaml b/services/finance/firefly-cronjob.yaml index 6c4d507..9e5c852 100644 --- a/services/finance/firefly-cronjob.yaml +++ b/services/finance/firefly-cronjob.yaml @@ -6,6 +6,7 @@ metadata: namespace: finance spec: schedule: "0 3 * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 diff --git a/services/finance/kustomization.yaml b/services/finance/kustomization.yaml index e4c414f..1559f5c 100644 --- a/services/finance/kustomization.yaml +++ b/services/finance/kustomization.yaml @@ -9,7 +9,7 @@ resources: - finance-secrets-ensure-rbac.yaml - actual-budget-data-pvc.yaml - firefly-storage-pvc.yaml - - finance-secrets-ensure-job.yaml + - oneoffs/finance-secrets-ensure-job.yaml - actual-budget-deployment.yaml - firefly-deployment.yaml - firefly-user-sync-cronjob.yaml diff --git a/services/finance/finance-secrets-ensure-job.yaml b/services/finance/oneoffs/finance-secrets-ensure-job.yaml similarity index 83% rename from services/finance/finance-secrets-ensure-job.yaml rename to services/finance/oneoffs/finance-secrets-ensure-job.yaml index 67f06cb..e8c8f58 100644 --- a/services/finance/finance-secrets-ensure-job.yaml +++ b/services/finance/oneoffs/finance-secrets-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/finance/finance-secrets-ensure-job.yaml +# services/finance/oneoffs/finance-secrets-ensure-job.yaml +# One-off job for finance/finance-secrets-ensure-5. +# Purpose: finance secrets ensure 5 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
apiVersion: batch/v1 kind: Job metadata: name: finance-secrets-ensure-5 namespace: finance spec: + suspend: true backoffLimit: 1 ttlSecondsAfterFinished: 3600 template: diff --git a/services/finance/portal-rbac.yaml b/services/finance/portal-rbac.yaml index 2fb7ede..66eafea 100644 --- a/services/finance/portal-rbac.yaml +++ b/services/finance/portal-rbac.yaml @@ -29,3 +29,17 @@ subjects: - kind: ServiceAccount name: bstein-dev-home namespace: bstein-dev-home +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: ariadne-firefly-user-sync + namespace: finance +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: bstein-dev-home-firefly-user-sync +subjects: + - kind: ServiceAccount + name: ariadne + namespace: maintenance diff --git a/services/gitea/deployment.yaml b/services/gitea/deployment.yaml index 9dc0c87..da188c3 100644 --- a/services/gitea/deployment.yaml +++ b/services/gitea/deployment.yaml @@ -169,6 +169,8 @@ spec: value: "trace" - name: GITEA__service__REQUIRE_SIGNIN_VIEW value: "false" + - name: GITEA__webhook__ALLOWED_HOST_LIST + value: "ci.bstein.dev" - name: GITEA__server__PROXY_HEADERS value: "X-Forwarded-For, X-Forwarded-Proto, X-Forwarded-Host" - name: GITEA__session__COOKIE_SECURE diff --git a/services/harbor/helmrelease.yaml b/services/harbor/helmrelease.yaml index b0cbdbd..16b81a8 100644 --- a/services/harbor/helmrelease.yaml +++ b/services/harbor/helmrelease.yaml @@ -391,6 +391,16 @@ spec: $patch: delete - name: core-writable emptyDir: {} + - target: + kind: Ingress + name: harbor-ingress + patch: |- + - op: replace + path: /spec/rules/0/http/paths/2/backend/service/name + value: harbor-registry + - op: replace + path: /spec/rules/0/http/paths/2/backend/service/port/number + value: 5000 - target: kind: Deployment name: harbor-jobservice diff --git a/services/harbor/secretproviderclass.yaml b/services/harbor/secretproviderclass.yaml index 03fef95..636f6fa 100644 --- a/services/harbor/secretproviderclass.yaml +++ b/services/harbor/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "harbor" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/harbor" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/health/portal-rbac.yaml b/services/health/portal-rbac.yaml index cd9acd1..feb7441 100644 --- a/services/health/portal-rbac.yaml +++ b/services/health/portal-rbac.yaml @@ -8,7 +8,7 @@ rules: - apiGroups: ["batch"] resources: ["cronjobs"] verbs: ["get"] - resourceNames: ["wger-user-sync"] + resourceNames: ["wger-user-sync", "wger-admin-ensure"] - apiGroups: ["batch"] resources: ["jobs"] verbs: ["create", "get", "list", "watch"] @@ -29,3 +29,17 @@ subjects: - kind: ServiceAccount name: bstein-dev-home namespace: bstein-dev-home +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: ariadne-wger-user-sync + namespace: health +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: bstein-dev-home-wger-user-sync +subjects: + - kind: ServiceAccount + name: ariadne + namespace: maintenance diff --git a/services/health/wger-admin-ensure-cronjob.yaml b/services/health/wger-admin-ensure-cronjob.yaml index db178a3..a1063dd 100644 --- a/services/health/wger-admin-ensure-cronjob.yaml +++ b/services/health/wger-admin-ensure-cronjob.yaml @@ -8,6 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "15 3 * * *" + suspend: true concurrencyPolicy: Forbid 
successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 diff --git a/services/jenkins/cache-pvc.yaml b/services/jenkins/cache-pvc.yaml new file mode 100644 index 0000000..a9ed319 --- /dev/null +++ b/services/jenkins/cache-pvc.yaml @@ -0,0 +1,13 @@ +# services/jenkins/cache-pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: jenkins-cache-v2 + namespace: jenkins +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 20Gi + storageClassName: astreae diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index ac26350..c2144fa 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -18,7 +18,7 @@ data: logoutFromOpenIdProvider: true postLogoutRedirectUrl: "https://ci.bstein.dev" sendScopesInTokenRequest: true - rootURLFromRequest: true + rootURLFromRequest: false userNameField: "preferred_username" fullNameFieldName: "name" emailFieldName: "email" @@ -49,8 +49,15 @@ data: jobs: - script: | pipelineJob('harbor-arm-build') { - triggers { - scm('H/5 * * * *') + properties { + pipelineTriggers { + triggers { + scmTrigger { + scmpoll_spec('H/5 * * * *') + ignorePostCommitHooks(false) + } + } + } } definition { cpsScm { @@ -83,8 +90,15 @@ data: } } pipelineJob('ci-demo') { - triggers { - scm('H/1 * * * *') + properties { + pipelineTriggers { + triggers { + scmTrigger { + scmpoll_spec('H/1 * * * *') + ignorePostCommitHooks(false) + } + } + } } definition { cpsScm { @@ -102,8 +116,15 @@ data: } } pipelineJob('bstein-dev-home') { - triggers { - scm('H/2 * * * *') + properties { + pipelineTriggers { + triggers { + scmTrigger { + scmpoll_spec('H/2 * * * *') + ignorePostCommitHooks(false) + } + } + } } definition { cpsScm { @@ -120,9 +141,42 @@ data: } } } + pipelineJob('ariadne') { + properties { + pipelineTriggers { + triggers { + scmTrigger { + scmpoll_spec('H/2 * * * *') + ignorePostCommitHooks(false) + } + } + } + } + definition { + cpsScm { + scm { + git { + remote { + url('https://scm.bstein.dev/bstein/ariadne.git') + credentials('gitea-pat') + } + branches('*/master') + } + } + scriptPath('Jenkinsfile') + } + } + } pipelineJob('data-prepper') { - triggers { - scm('H/5 * * * *') + properties { + pipelineTriggers { + triggers { + scmTrigger { + scmpoll_spec('H/5 * * * *') + ignorePostCommitHooks(false) + } + } + } } definition { cpsScm { @@ -139,24 +193,39 @@ data: } } } - pipelineJob('titan-iac-quality-gate') { - triggers { - scm('H/5 * * * *') - } - definition { - cpsScm { - scm { + multibranchPipelineJob('titan-iac-quality-gate') { + branchSources { + branchSource { + source { git { - remote { - url('https://scm.bstein.dev/bstein/titan-iac.git') - credentials('gitea-pat') - } - branches('*/feature/vault-consumption') + id('titan-iac-quality-gate') + remote('https://scm.bstein.dev/bstein/titan-iac.git') + credentialsId('gitea-pat') } } + } + } + factory { + workflowBranchProjectFactory { scriptPath('ci/Jenkinsfile.titan-iac') } } + orphanedItemStrategy { + discardOldItems { + numToKeep(30) + } + } + triggers { + periodicFolderTrigger { + interval('12h') + } + } + configure { node -> + def webhookToken = System.getenv('TITAN_IAC_WEBHOOK_TOKEN') ?: '' + def triggers = node / 'triggers' + def webhook = triggers.appendNode('com.igalg.jenkins.plugins.mswt.trigger.ComputedFolderWebHookTrigger') + webhook.appendNode('token', webhookToken) + } } base.yaml: | jenkins: @@ -189,6 +258,11 @@ data: templates: - name: "default" namespace: "jenkins" + workspaceVolume: + dynamicPVC: + 
accessModes: "ReadWriteOnce" + requestsSize: "20Gi" + storageClassName: "astreae" containers: - name: "jnlp" args: "^${computer.jnlpmac} ^${computer.name}" @@ -217,3 +291,6 @@ data: crumbIssuer: standard: excludeClientIPFromCrumb: true + unclassified: + location: + url: "https://ci.bstein.dev/" diff --git a/services/jenkins/configmap-plugins.yaml b/services/jenkins/configmap-plugins.yaml index eabea13..1c43cfb 100644 --- a/services/jenkins/configmap-plugins.yaml +++ b/services/jenkins/configmap-plugins.yaml @@ -6,12 +6,17 @@ metadata: namespace: jenkins data: plugins.txt: | - kubernetes - workflow-aggregator - git - pipeline-utility-steps - configuration-as-code - configuration-as-code-support - oic-auth - job-dsl - simple-theme-plugin + kubernetes:4416.v2ea_b_5372da_a_e + workflow-aggregator:608.v67378e9d3db_1 + git:5.8.1 + pipeline-utility-steps:2.20.0 + configuration-as-code:2031.veb_a_fdda_b_3ffd + oic-auth:4.609.v9de140f63d01 + job-dsl:1.93 + simple-theme-plugin:230.v8b_fd91b_b_800c + workflow-multibranch:821.vc3b_4ea_780798 + branch-api:2.1268.v044a_87612da_8 + scm-api:724.v7d839074eb_5c + gitea:268.v75e47974c01d + gitea-checks:603.621.vc708da_fb_371d + multibranch-scan-webhook-trigger:1.0.11 diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml index e846a8e..63f722b 100644 --- a/services/jenkins/deployment.yaml +++ b/services/jenkins/deployment.yaml @@ -22,23 +22,33 @@ spec: vault.hashicorp.com/role: "jenkins" vault.hashicorp.com/agent-inject-secret-jenkins-env: "kv/data/atlas/jenkins/jenkins-oidc" vault.hashicorp.com/agent-inject-template-jenkins-env: | - {{- with secret "kv/data/atlas/jenkins/jenkins-oidc" -}} + {{ with secret "kv/data/atlas/jenkins/jenkins-oidc" }} OIDC_CLIENT_ID={{ .Data.data.clientId }} OIDC_CLIENT_SECRET={{ .Data.data.clientSecret }} OIDC_AUTH_URL={{ .Data.data.authorizationUrl }} OIDC_TOKEN_URL={{ .Data.data.tokenUrl }} OIDC_USERINFO_URL={{ .Data.data.userInfoUrl }} OIDC_LOGOUT_URL={{ .Data.data.logoutUrl }} - {{- end }} - {{- with secret "kv/data/atlas/jenkins/harbor-robot-creds" -}} + {{ end }} + {{ with secret "kv/data/atlas/jenkins/harbor-robot-creds" }} + HARBOR_ROBOT_USERNAME={{ .Data.data.username }} + HARBOR_ROBOT_PASSWORD={{ .Data.data.password }} + {{ end }} + {{ with secret "kv/data/atlas/shared/harbor-pull" }} + {{- if and .Data.data.username .Data.data.password }} HARBOR_ROBOT_USERNAME={{ .Data.data.username }} HARBOR_ROBOT_PASSWORD={{ .Data.data.password }} {{- end }} - {{- with secret "kv/data/atlas/jenkins/gitea-pat" -}} + {{ end }} + {{ with secret "kv/data/atlas/jenkins/gitea-pat" }} GITEA_PAT_USERNAME={{ .Data.data.username }} GITEA_PAT_TOKEN={{ .Data.data.token }} - {{- end -}} - bstein.dev/restarted-at: "2026-01-19T00:25:00Z" + {{ end }} + {{ with secret "kv/data/atlas/jenkins/webhook-tokens" }} + TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }} + GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME={{ .Data.data.git_notify_bstein_dev_home }} + {{ end }} + bstein.dev/restarted-at: "2026-01-20T14:52:41Z" spec: serviceAccountName: jenkins nodeSelector: @@ -98,7 +108,9 @@ spec: containerPort: 50000 env: - name: JAVA_OPTS - value: "-Xms512m -Xmx2048m" + value: "-Xms512m -Xmx2048m -Duser.timezone=America/Chicago" + - name: TZ + value: "America/Chicago" - name: JENKINS_OPTS value: "--webroot=/var/jenkins_cache/war" - name: JENKINS_SLAVE_AGENT_PORT @@ -148,6 +160,8 @@ spec: mountPath: /config/jcasc - name: init-scripts mountPath: /usr/share/jenkins/ref/init.groovy.d + - name: init-scripts + mountPath: 
/var/jenkins_home/init.groovy.d - name: plugin-dir mountPath: /usr/share/jenkins/ref/plugins - name: tmp @@ -157,9 +171,11 @@ spec: persistentVolumeClaim: claimName: jenkins - name: jenkins-cache - emptyDir: {} + persistentVolumeClaim: + claimName: jenkins-cache-v2 - name: plugin-dir - emptyDir: {} + persistentVolumeClaim: + claimName: jenkins-plugins-v2 - name: plugins configMap: name: jenkins-plugins @@ -170,4 +186,5 @@ spec: configMap: name: jenkins-init-scripts - name: tmp - emptyDir: {} + emptyDir: + medium: Memory diff --git a/services/jenkins/kustomization.yaml b/services/jenkins/kustomization.yaml index acb6fb4..df51968 100644 --- a/services/jenkins/kustomization.yaml +++ b/services/jenkins/kustomization.yaml @@ -5,9 +5,14 @@ namespace: jenkins resources: - namespace.yaml - serviceaccount.yaml + - vault-serviceaccount.yaml - pvc.yaml + - cache-pvc.yaml + - plugins-pvc.yaml - configmap-jcasc.yaml - configmap-plugins.yaml + - secretproviderclass.yaml + - vault-sync-deployment.yaml - deployment.yaml - service.yaml - ingress.yaml @@ -16,6 +21,7 @@ configMapGenerator: - name: jenkins-init-scripts namespace: jenkins files: + - git-notify-token.groovy=scripts/git-notify-token.groovy - theme.groovy=scripts/theme.groovy options: disableNameSuffixHash: true diff --git a/services/jenkins/plugins-pvc.yaml b/services/jenkins/plugins-pvc.yaml new file mode 100644 index 0000000..06715eb --- /dev/null +++ b/services/jenkins/plugins-pvc.yaml @@ -0,0 +1,13 @@ +# services/jenkins/plugins-pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: jenkins-plugins-v2 + namespace: jenkins +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi + storageClassName: astreae diff --git a/services/jenkins/scripts/git-notify-token.groovy b/services/jenkins/scripts/git-notify-token.groovy new file mode 100644 index 0000000..336c918 --- /dev/null +++ b/services/jenkins/scripts/git-notify-token.groovy @@ -0,0 +1,41 @@ +import hudson.plugins.git.ApiTokenPropertyConfiguration +import hudson.Util +import java.nio.charset.StandardCharsets +import java.security.MessageDigest + + +def entries = [ + [env: 'GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME', name: 'gitea-bstein-dev-home'], +] + +entries.each { entry -> + def token = System.getenv(entry.env) + if (!token || token.trim().isEmpty()) { + println("Git notifyCommit token ${entry.env} missing; skipping") + return + } + + try { + def config = ApiTokenPropertyConfiguration.get() + if (config.hasMatchingApiToken(token)) { + println("Git notifyCommit token ${entry.name} already configured") + return + } + + def digest = MessageDigest.getInstance("SHA-256") + def hash = Util.toHexString(digest.digest(token.getBytes(StandardCharsets.US_ASCII))) + + def field = ApiTokenPropertyConfiguration.class.getDeclaredField("apiTokens") + field.setAccessible(true) + def tokens = field.get(config) + + def ctor = ApiTokenPropertyConfiguration.HashedApiToken.class.getDeclaredConstructor(String.class, String.class) + ctor.setAccessible(true) + tokens.add(ctor.newInstance(entry.name, hash)) + config.save() + + println("Added git notifyCommit access token ${entry.name}") + } catch (Throwable e) { + println("Failed to configure git notifyCommit token ${entry.name}: ${e.class.simpleName}: ${e.message}") + } +} diff --git a/services/jenkins/scripts/theme.groovy b/services/jenkins/scripts/theme.groovy index cf171f7..58755c0 100644 --- a/services/jenkins/scripts/theme.groovy +++ b/services/jenkins/scripts/theme.groovy @@ -1,15 +1,137 @@ import jenkins.model.Jenkins 
import org.codefirst.SimpleThemeDecorator +import org.jenkinsci.plugins.simpletheme.CssTextThemeElement def instance = Jenkins.get() def decorators = instance.getExtensionList(SimpleThemeDecorator.class) if (decorators?.size() > 0) { def theme = decorators[0] - theme.setCssUrl("https://jenkins-contrib-themes.github.io/jenkins-material-theme/dist/material-ocean.css") + def cssRules = """ +:root, +.app-theme-picker__picker[data-theme=none] { + --background: #0f1216 !important; + --header-background: #141922 !important; + --header-border: #2b313b !important; + --white: #141922 !important; + --black: #e6e9ef !important; + --very-light-grey: #171b21 !important; + --light-grey: #202734 !important; + --medium-grey: #2b313b !important; + --dark-grey: #0b0f14 !important; + --text-color: #e6e9ef !important; + --text-color-secondary: #a6adba !important; + --card-background: #171b21 !important; + --card-border-color: #2b313b !important; + --pane-header-bg: #1f252d !important; + --pane-header-border-color: #2b313b !important; + --pane-border-color: #2b313b !important; + --pane-text-color: #e6e9ef !important; + --pane-header-text-color: #e6e9ef !important; + --link-color: #8fb7ff !important; + --link-color--hover: #b0ccff !important; + --link-dark-color: #e6e9ef !important; + --link-dark-color--hover: #b0ccff !important; + --input-color: #151a20 !important; + --input-border: #2b313b !important; + --input-border-hover: #3a424d !important; + --button-background: #232a33 !important; + --button-background--hover: #2b313b !important; + --button-background--active: #323b46 !important; + --item-background--hover: #232a33 !important; + --item-background--active: #2b313b !important; + --accent-color: #8fb7ff !important; +} + +body, +#page-body, +#page-header, +#header, +#main-panel, +#main-panel-content, +#side-panel, +.top-sticker-inner, +.bottom-sticker-inner, +#breadcrumbBar, +#breadcrumbs { + background-color: var(--background) !important; + color: var(--text-color) !important; +} + +.jenkins-card, +.jenkins-section, +.jenkins-section__item, +#main-panel .jenkins-card, +#main-panel .jenkins-section { + background-color: var(--card-background) !important; + color: var(--text-color) !important; + border-color: var(--card-border-color) !important; +} + +table.pane, +table.pane td, +table.pane th, +#projectstatus td, +#projectstatus th { + background-color: var(--card-background) !important; + color: var(--text-color) !important; +} + +table.pane tr:nth-child(even) td, +#projectstatus tr:hover td { + background-color: #1f252d !important; +} + +input, +select, +textarea, +#search-box { + background-color: #151a20 !important; + color: var(--text-color) !important; + border-color: var(--input-border) !important; +} + +a, +a:visited, +a:link { + color: var(--link-color) !important; +} + +a:hover { + opacity: 0.85; +} + +#side-panel .task-link, +#breadcrumbs a, +#breadcrumbs, +#projectstatus th a { + color: var(--text-color-secondary) !important; +} + +.console-output, +.console-output pre, +pre, +code, +.CodeMirror { + background-color: #0c0f14 !important; + color: #d9dee7 !important; +} + +#footer { + background-color: var(--background) !important; + color: var(--text-color-secondary) !important; +} + +.jenkins_ver:after { + content: "atlas dark"; +} +""".stripIndent().trim() + + theme.setElements([new CssTextThemeElement(cssRules)]) + theme.setCssUrl("") + theme.setCssRules(cssRules) theme.setJsUrl("") - theme.setTheme("") - instance.save() + theme.save() println("Applied simple-theme-plugin dark theme") } else { 
println("simple-theme-plugin not installed; skipping theme configuration") diff --git a/services/jenkins/secretproviderclass.yaml b/services/jenkins/secretproviderclass.yaml new file mode 100644 index 0000000..a9d9dd5 --- /dev/null +++ b/services/jenkins/secretproviderclass.yaml @@ -0,0 +1,21 @@ +# services/jenkins/secretproviderclass.yaml +apiVersion: secrets-store.csi.x-k8s.io/v1 +kind: SecretProviderClass +metadata: + name: jenkins-vault + namespace: jenkins +spec: + provider: vault + parameters: + vaultAddress: "http://vault.vault.svc.cluster.local:8200" + roleName: "jenkins" + objects: | + - objectName: "harbor-pull__dockerconfigjson" + secretPath: "kv/data/atlas/shared/harbor-pull" + secretKey: "dockerconfigjson" + secretObjects: + - secretName: harbor-bstein-robot + type: kubernetes.io/dockerconfigjson + data: + - objectName: harbor-pull__dockerconfigjson + key: .dockerconfigjson diff --git a/services/jenkins/vault-serviceaccount.yaml b/services/jenkins/vault-serviceaccount.yaml new file mode 100644 index 0000000..8d31400 --- /dev/null +++ b/services/jenkins/vault-serviceaccount.yaml @@ -0,0 +1,6 @@ +# services/jenkins/vault-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: jenkins-vault-sync + namespace: jenkins diff --git a/services/jenkins/vault-sync-deployment.yaml b/services/jenkins/vault-sync-deployment.yaml new file mode 100644 index 0000000..6abcace --- /dev/null +++ b/services/jenkins/vault-sync-deployment.yaml @@ -0,0 +1,37 @@ +# services/jenkins/vault-sync-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: jenkins-vault-sync + namespace: jenkins +spec: + replicas: 1 + selector: + matchLabels: + app: jenkins-vault-sync + template: + metadata: + labels: + app: jenkins-vault-sync + spec: + serviceAccountName: jenkins-vault-sync + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" + containers: + - name: sync + image: alpine:3.20 + command: ["/bin/sh", "-c"] + args: + - "sleep infinity" + volumeMounts: + - name: vault-secrets + mountPath: /vault/secrets + readOnly: true + volumes: + - name: vault-secrets + csi: + driver: secrets-store.csi.k8s.io + readOnly: true + volumeAttributes: + secretProviderClass: jenkins-vault diff --git a/services/keycloak/deployment.yaml b/services/keycloak/deployment.yaml index 3d241c9..131169d 100644 --- a/services/keycloak/deployment.yaml +++ b/services/keycloak/deployment.yaml @@ -126,7 +126,7 @@ spec: - name: KC_EVENTS_LISTENERS value: jboss-logging,mailu-http - name: KC_SPI_EVENTS_LISTENER_MAILU-HTTP_ENDPOINT - value: http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events + value: http://ariadne.maintenance.svc.cluster.local/events ports: - containerPort: 8080 name: http diff --git a/services/keycloak/kustomization.yaml b/services/keycloak/kustomization.yaml index 6030a82..6027891 100644 --- a/services/keycloak/kustomization.yaml +++ b/services/keycloak/kustomization.yaml @@ -10,21 +10,21 @@ resources: - secretproviderclass.yaml - vault-sync-deployment.yaml - deployment.yaml - - realm-settings-job.yaml - - portal-admin-client-secret-ensure-job.yaml - - portal-e2e-client-job.yaml - - portal-e2e-target-client-job.yaml - - portal-e2e-token-exchange-permissions-job.yaml - - portal-e2e-token-exchange-test-job.yaml - - portal-e2e-execute-actions-email-test-job.yaml - - ldap-federation-job.yaml - - user-overrides-job.yaml - - mas-secrets-ensure-job.yaml - - synapse-oidc-secret-ensure-job.yaml - - logs-oidc-secret-ensure-job.yaml - - 
harbor-oidc-secret-ensure-job.yaml - - vault-oidc-secret-ensure-job.yaml - - actual-oidc-secret-ensure-job.yaml + - oneoffs/realm-settings-job.yaml + - oneoffs/portal-admin-client-secret-ensure-job.yaml + - oneoffs/portal-e2e-client-job.yaml + - oneoffs/portal-e2e-target-client-job.yaml + - oneoffs/portal-e2e-token-exchange-permissions-job.yaml + - oneoffs/portal-e2e-token-exchange-test-job.yaml + - oneoffs/portal-e2e-execute-actions-email-test-job.yaml + - oneoffs/ldap-federation-job.yaml + - oneoffs/user-overrides-job.yaml + - oneoffs/mas-secrets-ensure-job.yaml + - oneoffs/synapse-oidc-secret-ensure-job.yaml + - oneoffs/logs-oidc-secret-ensure-job.yaml + - oneoffs/harbor-oidc-secret-ensure-job.yaml + - oneoffs/vault-oidc-secret-ensure-job.yaml + - oneoffs/actual-oidc-secret-ensure-job.yaml - service.yaml - ingress.yaml generatorOptions: diff --git a/services/keycloak/actual-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml similarity index 83% rename from services/keycloak/actual-oidc-secret-ensure-job.yaml rename to services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml index 3dadb52..d4da1f1 100644 --- a/services/keycloak/actual-oidc-secret-ensure-job.yaml +++ b/services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/actual-oidc-secret-ensure-job.yaml +# services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml +# One-off job for sso/actual-oidc-secret-ensure-3. +# Purpose: actual oidc secret ensure 3 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: actual-oidc-secret-ensure-3 namespace: sso spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/harbor-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml similarity index 81% rename from services/keycloak/harbor-oidc-secret-ensure-job.yaml rename to services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml index 8eac50d..c368241 100644 --- a/services/keycloak/harbor-oidc-secret-ensure-job.yaml +++ b/services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/harbor-oidc-secret-ensure-job.yaml +# services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml +# One-off job for sso/harbor-oidc-secret-ensure-10. +# Purpose: harbor oidc secret ensure 10 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: - name: harbor-oidc-secret-ensure-9 + name: harbor-oidc-secret-ensure-10 namespace: sso spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/ldap-federation-job.yaml b/services/keycloak/oneoffs/ldap-federation-job.yaml similarity index 86% rename from services/keycloak/ldap-federation-job.yaml rename to services/keycloak/oneoffs/ldap-federation-job.yaml index 303fd9f..9e9a5f9 100644 --- a/services/keycloak/ldap-federation-job.yaml +++ b/services/keycloak/oneoffs/ldap-federation-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/ldap-federation-job.yaml +# services/keycloak/oneoffs/ldap-federation-job.yaml +# One-off job for sso/keycloak-ldap-federation-12. 
+# Purpose: keycloak ldap federation 12 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: - name: keycloak-ldap-federation-11 + name: keycloak-ldap-federation-12 namespace: sso spec: + suspend: true backoffLimit: 2 template: metadata: @@ -325,6 +330,54 @@ spec: if status not in (201, 204): raise SystemExit(f"Unexpected group mapper create status: {status}") + def ensure_user_attr_mapper(name: str, ldap_attr: str, user_attr: str): + mapper = None + for c in components: + if c.get("name") == name and c.get("parentId") == ldap_component_id: + mapper = c + break + + payload = { + "name": name, + "providerId": "user-attribute-ldap-mapper", + "providerType": "org.keycloak.storage.ldap.mappers.LDAPStorageMapper", + "parentId": ldap_component_id, + "config": { + "ldap.attribute": [ldap_attr], + "user.model.attribute": [user_attr], + "read.only": ["false"], + "always.read.value.from.ldap": ["false"], + "is.mandatory.in.ldap": ["false"], + }, + } + + if mapper: + payload["id"] = mapper["id"] + payload["parentId"] = mapper.get("parentId", payload["parentId"]) + print(f"Updating LDAP user mapper: {payload['id']} ({name})") + status, _, _ = http_json( + "PUT", + f"{base_url}/admin/realms/{realm}/components/{payload['id']}", + token, + payload, + ) + if status not in (200, 204): + raise SystemExit(f"Unexpected user mapper update status for {name}: {status}") + else: + print(f"Creating LDAP user mapper: {name}") + status, _, _ = http_json( + "POST", + f"{base_url}/admin/realms/{realm}/components", + token, + payload, + ) + if status not in (201, 204): + raise SystemExit(f"Unexpected user mapper create status for {name}: {status}") + + ensure_user_attr_mapper("openldap-email", "mail", "email") + ensure_user_attr_mapper("openldap-first-name", "givenName", "firstName") + ensure_user_attr_mapper("openldap-last-name", "sn", "lastName") + # Cleanup duplicate LDAP federation providers and their child components (mappers, etc). # Keep only the canonical provider we updated/created above. try: diff --git a/services/keycloak/logs-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml similarity index 94% rename from services/keycloak/logs-oidc-secret-ensure-job.yaml rename to services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml index 14e80df..bce9e5b 100644 --- a/services/keycloak/logs-oidc-secret-ensure-job.yaml +++ b/services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/logs-oidc-secret-ensure-job.yaml +# services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml +# One-off job for sso/logs-oidc-secret-ensure-10. +# Purpose: logs oidc secret ensure 10 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
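A minimal sketch of that resume-and-rerun flow for the one-off named above (the namespace and Job name come from the manifest; the Flux Kustomization name "keycloak" and the exact CLI access are assumptions):
  # flip spec.suspend to false in the manifest, commit, then force a reconcile of the owning Kustomization
  flux reconcile kustomization keycloak --with-source -n flux-system
  # watch the one-off run to completion; Job spec.suspend is mutable, so Flux can start it in place
  kubectl -n sso get job logs-oidc-secret-ensure-10 -w
  # afterwards set spec.suspend back to true in git so the next apply leaves the finished Job alone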
apiVersion: batch/v1 kind: Job metadata: name: logs-oidc-secret-ensure-10 namespace: sso spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/mas-secrets-ensure-job.yaml b/services/keycloak/oneoffs/mas-secrets-ensure-job.yaml similarity index 95% rename from services/keycloak/mas-secrets-ensure-job.yaml rename to services/keycloak/oneoffs/mas-secrets-ensure-job.yaml index 24c9e04..c3bd1be 100644 --- a/services/keycloak/mas-secrets-ensure-job.yaml +++ b/services/keycloak/oneoffs/mas-secrets-ensure-job.yaml @@ -1,4 +1,8 @@ -# services/keycloak/mas-secrets-ensure-job.yaml +# services/keycloak/oneoffs/mas-secrets-ensure-job.yaml +# One-off job for sso/mas-secrets-ensure. +# Purpose: mas secrets ensure (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: v1 kind: ServiceAccount metadata: @@ -13,6 +17,7 @@ metadata: name: mas-secrets-ensure-21 namespace: sso spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/portal-admin-client-secret-ensure-job.yaml b/services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml similarity index 96% rename from services/keycloak/portal-admin-client-secret-ensure-job.yaml rename to services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml index 90dd4b7..1d3e7f3 100644 --- a/services/keycloak/portal-admin-client-secret-ensure-job.yaml +++ b/services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/portal-admin-client-secret-ensure-job.yaml +# services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml +# One-off job for sso/keycloak-portal-admin-secret-ensure-4. +# Purpose: keycloak portal admin secret ensure 4 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-portal-admin-secret-ensure-4 namespace: sso spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/keycloak/portal-e2e-client-job.yaml b/services/keycloak/oneoffs/portal-e2e-client-job.yaml similarity index 97% rename from services/keycloak/portal-e2e-client-job.yaml rename to services/keycloak/oneoffs/portal-e2e-client-job.yaml index 4e0c006..274dd27 100644 --- a/services/keycloak/portal-e2e-client-job.yaml +++ b/services/keycloak/oneoffs/portal-e2e-client-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/portal-e2e-client-job.yaml +# services/keycloak/oneoffs/portal-e2e-client-job.yaml +# One-off job for sso/keycloak-portal-e2e-client-8. +# Purpose: keycloak portal e2e client 8 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
apiVersion: batch/v1 kind: Job metadata: name: keycloak-portal-e2e-client-8 namespace: sso spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/keycloak/portal-e2e-execute-actions-email-test-job.yaml b/services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml similarity index 89% rename from services/keycloak/portal-e2e-execute-actions-email-test-job.yaml rename to services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml index 35f79a6..518d839 100644 --- a/services/keycloak/portal-e2e-execute-actions-email-test-job.yaml +++ b/services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/portal-e2e-execute-actions-email-test-job.yaml +# services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml +# One-off job for sso/keycloak-portal-e2e-execute-actions-email-14. +# Purpose: keycloak portal e2e execute actions email 14 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-portal-e2e-execute-actions-email-14 namespace: sso spec: + suspend: true backoffLimit: 3 template: metadata: diff --git a/services/keycloak/portal-e2e-target-client-job.yaml b/services/keycloak/oneoffs/portal-e2e-target-client-job.yaml similarity index 95% rename from services/keycloak/portal-e2e-target-client-job.yaml rename to services/keycloak/oneoffs/portal-e2e-target-client-job.yaml index 196b48b..900d029 100644 --- a/services/keycloak/portal-e2e-target-client-job.yaml +++ b/services/keycloak/oneoffs/portal-e2e-target-client-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/portal-e2e-target-client-job.yaml +# services/keycloak/oneoffs/portal-e2e-target-client-job.yaml +# One-off job for sso/keycloak-portal-e2e-target-7. +# Purpose: keycloak portal e2e target 7 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-portal-e2e-target-7 namespace: sso spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/keycloak/portal-e2e-token-exchange-permissions-job.yaml b/services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml similarity index 97% rename from services/keycloak/portal-e2e-token-exchange-permissions-job.yaml rename to services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml index 647b8f9..0d41b47 100644 --- a/services/keycloak/portal-e2e-token-exchange-permissions-job.yaml +++ b/services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/portal-e2e-token-exchange-permissions-job.yaml +# services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml +# One-off job for sso/keycloak-portal-e2e-token-exchange-permissions-11. +# Purpose: keycloak portal e2e token exchange permissions 11 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
apiVersion: batch/v1 kind: Job metadata: name: keycloak-portal-e2e-token-exchange-permissions-11 namespace: sso spec: + suspend: true backoffLimit: 6 template: metadata: diff --git a/services/keycloak/portal-e2e-token-exchange-test-job.yaml b/services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml similarity index 89% rename from services/keycloak/portal-e2e-token-exchange-test-job.yaml rename to services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml index edd7555..eb05e09 100644 --- a/services/keycloak/portal-e2e-token-exchange-test-job.yaml +++ b/services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/portal-e2e-token-exchange-test-job.yaml +# services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml +# One-off job for sso/keycloak-portal-e2e-token-exchange-test-7. +# Purpose: keycloak portal e2e token exchange test 7 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-portal-e2e-token-exchange-test-7 namespace: sso spec: + suspend: true backoffLimit: 6 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/oneoffs/realm-settings-job.yaml similarity index 78% rename from services/keycloak/realm-settings-job.yaml rename to services/keycloak/oneoffs/realm-settings-job.yaml index f680200..ea88d83 100644 --- a/services/keycloak/realm-settings-job.yaml +++ b/services/keycloak/oneoffs/realm-settings-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/realm-settings-job.yaml +# services/keycloak/oneoffs/realm-settings-job.yaml +# One-off job for sso/keycloak-realm-settings-36. +# Purpose: keycloak realm settings 36 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: - name: keycloak-realm-settings-32 + name: keycloak-realm-settings-36 namespace: sso spec: + suspend: true backoffLimit: 0 template: metadata: @@ -331,6 +336,9 @@ spec: # Ensure basic realm groups exist for provisioning. ensure_group("dev") ensure_group("admin") + ensure_group("demo") + ensure_group("test") + ensure_group("vaultwarden_grandfathered") planka_group = ensure_group("planka-users") if planka_group and planka_group.get("id"): @@ -467,6 +475,126 @@ spec: if status not in (201, 204): raise SystemExit(f"Unexpected protocol mapper create response: {status}") + # Ensure mailu_email overrides email claim for service clients. 
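+ # (the set below lists Keycloak's built-in realm clients; they are skipped so their email claim is never overridden)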
+ excluded_email_clients = { + "account", + "account-console", + "admin-cli", + "security-admin-console", + "realm-management", + "broker", + } + status, clients = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/clients", + access_token, + ) + if status == 200 and isinstance(clients, list): + for client in clients: + if not isinstance(client, dict): + continue + if client.get("protocol") != "openid-connect": + continue + client_name = client.get("clientId") if isinstance(client.get("clientId"), str) else "" + if not client_name or client_name in excluded_email_clients: + continue + client_id = client.get("id") + if not client_id: + continue + email_mapper = { + "name": "mailu-email", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": False, + "config": { + "user.attribute": "mailu_email", + "claim.name": "email", + "jsonType.label": "String", + "id.token.claim": "true", + "access.token.claim": "true", + "userinfo.token.claim": "true", + "multivalued": "false", + "aggregate.attrs": "false", + }, + } + status, mappers = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models", + access_token, + ) + existing = None + if status == 200 and isinstance(mappers, list): + for item in mappers: + if isinstance(item, dict) and item.get("name") == email_mapper["name"]: + existing = item + break + if existing and existing.get("id"): + email_mapper["id"] = existing["id"] + status, _ = http_json( + "PUT", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models/{existing['id']}", + access_token, + email_mapper, + ) + if status not in (200, 204): + raise SystemExit(f"Unexpected mailu email mapper update response: {status}") + else: + status, _ = http_json( + "POST", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models", + access_token, + email_mapper, + ) + if status not in (201, 204): + raise SystemExit(f"Unexpected mailu email mapper create response: {status}") + + mailu_claim_mapper = { + "name": "mailu-email-claim", + "protocol": "openid-connect", + "protocolMapper": "oidc-usermodel-attribute-mapper", + "consentRequired": False, + "config": { + "user.attribute": "mailu_email", + "claim.name": "mailu_email", + "jsonType.label": "String", + "id.token.claim": "true", + "access.token.claim": "true", + "userinfo.token.claim": "true", + "multivalued": "false", + "aggregate.attrs": "false", + }, + } + status, mappers = http_json( + "GET", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models", + access_token, + ) + existing_claim = None + if status == 200 and isinstance(mappers, list): + for item in mappers: + if isinstance(item, dict) and item.get("name") == mailu_claim_mapper["name"]: + existing_claim = item + break + if existing_claim and existing_claim.get("id"): + mailu_claim_mapper["id"] = existing_claim["id"] + status, _ = http_json( + "PUT", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models/{existing_claim['id']}", + access_token, + mailu_claim_mapper, + ) + if status not in (200, 204): + raise SystemExit(f"Unexpected mailu email claim mapper update response: {status}") + else: + status, _ = http_json( + "POST", + f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models", + access_token, + mailu_claim_mapper, + ) + if status not in (201, 204): + raise SystemExit(f"Unexpected mailu email claim mapper create response: {status}") + # Ensure MFA is on by default for 
newly-created users. status, required_actions = http_json( "GET", diff --git a/services/keycloak/synapse-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml similarity index 92% rename from services/keycloak/synapse-oidc-secret-ensure-job.yaml rename to services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml index e808e7e..15b7a31 100644 --- a/services/keycloak/synapse-oidc-secret-ensure-job.yaml +++ b/services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/synapse-oidc-secret-ensure-job.yaml +# services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml +# One-off job for sso/synapse-oidc-secret-ensure-10. +# Purpose: synapse oidc secret ensure 10 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: synapse-oidc-secret-ensure-10 namespace: sso spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/user-overrides-job.yaml b/services/keycloak/oneoffs/user-overrides-job.yaml similarity index 96% rename from services/keycloak/user-overrides-job.yaml rename to services/keycloak/oneoffs/user-overrides-job.yaml index 7623c84..0d52d6d 100644 --- a/services/keycloak/user-overrides-job.yaml +++ b/services/keycloak/oneoffs/user-overrides-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/user-overrides-job.yaml +# services/keycloak/oneoffs/user-overrides-job.yaml +# One-off job for sso/keycloak-user-overrides-9. +# Purpose: keycloak user overrides 9 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: keycloak-user-overrides-9 namespace: sso spec: + suspend: true backoffLimit: 0 template: metadata: diff --git a/services/keycloak/vault-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml similarity index 83% rename from services/keycloak/vault-oidc-secret-ensure-job.yaml rename to services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml index 3aa3ca5..a76c52e 100644 --- a/services/keycloak/vault-oidc-secret-ensure-job.yaml +++ b/services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml @@ -1,10 +1,15 @@ -# services/keycloak/vault-oidc-secret-ensure-job.yaml +# services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml +# One-off job for sso/vault-oidc-secret-ensure-8. +# Purpose: vault oidc secret ensure 8 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
apiVersion: batch/v1 kind: Job metadata: name: vault-oidc-secret-ensure-8 namespace: sso spec: + suspend: true backoffLimit: 0 ttlSecondsAfterFinished: 3600 template: diff --git a/services/keycloak/scripts/harbor_oidc_secret_ensure.sh b/services/keycloak/scripts/harbor_oidc_secret_ensure.sh index 7187d34..c70caa2 100755 --- a/services/keycloak/scripts/harbor_oidc_secret_ensure.sh +++ b/services/keycloak/scripts/harbor_oidc_secret_ensure.sh @@ -29,7 +29,7 @@ CLIENT_QUERY="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \ CLIENT_ID="$(echo "$CLIENT_QUERY" | jq -r '.[0].id' 2>/dev/null || true)" if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then - create_payload='{"clientId":"harbor","enabled":true,"protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":false,"serviceAccountsEnabled":false,"redirectUris":["https://registry.bstein.dev/c/oidc/callback"],"webOrigins":["https://registry.bstein.dev"],"rootUrl":"https://registry.bstein.dev","baseUrl":"/"}' + create_payload='{"clientId":"harbor","enabled":true,"protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":true,"serviceAccountsEnabled":false,"redirectUris":["https://registry.bstein.dev/c/oidc/callback"],"webOrigins":["https://registry.bstein.dev"],"rootUrl":"https://registry.bstein.dev","baseUrl":"/"}' status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \ -H "Authorization: Bearer ${ACCESS_TOKEN}" \ -H 'Content-Type: application/json' \ @@ -49,6 +49,21 @@ if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then exit 1 fi +CLIENT_CONFIG="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}" || true)" +if [ -n "$CLIENT_CONFIG" ]; then + updated_config="$(echo "$CLIENT_CONFIG" | jq '.directAccessGrantsEnabled=true')" + status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \ + -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + -H 'Content-Type: application/json' \ + -d "${updated_config}" \ + "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}")" + if [ "$status" != "200" ] && [ "$status" != "204" ]; then + echo "Keycloak client update failed (status ${status})" >&2 + exit 1 + fi +fi + SCOPE_ID="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \ "$KC_URL/admin/realms/atlas/client-scopes?search=groups" | jq -r '.[] | select(.name=="groups") | .id' 2>/dev/null | head -n1 || true)" if [ -z "$SCOPE_ID" ] || [ "$SCOPE_ID" = "null" ]; then @@ -77,6 +92,26 @@ if ! echo "$DEFAULT_SCOPES" | jq -e '.[] | select(.name=="groups")' >/dev/null 2 fi fi +OFFLINE_SCOPE_ID="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + "$KC_URL/admin/realms/atlas/client-scopes?search=offline_access" | jq -r '.[] | select(.name=="offline_access") | .id' 2>/dev/null | head -n1 || true)" +if [ -n "$OFFLINE_SCOPE_ID" ] && [ "$OFFLINE_SCOPE_ID" != "null" ]; then + if ! echo "$DEFAULT_SCOPES" | jq -e '.[] | select(.name=="offline_access")' >/dev/null 2>&1 \ + && ! 
echo "$OPTIONAL_SCOPES" | jq -e '.[] | select(.name=="offline_access")' >/dev/null 2>&1; then + status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \ + -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes/${OFFLINE_SCOPE_ID}")" + if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then + status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \ + -H "Authorization: Bearer ${ACCESS_TOKEN}" \ + "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes/${OFFLINE_SCOPE_ID}")" + if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then + echo "Failed to attach offline_access scope to harbor (status ${status})" >&2 + exit 1 + fi + fi + fi +fi + CLIENT_SECRET="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \ "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/client-secret" | jq -r '.value' 2>/dev/null || true)" if [ -z "$CLIENT_SECRET" ] || [ "$CLIENT_SECRET" = "null" ]; then diff --git a/services/keycloak/secretproviderclass.yaml b/services/keycloak/secretproviderclass.yaml index 86cebd2..d4c094f 100644 --- a/services/keycloak/secretproviderclass.yaml +++ b/services/keycloak/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "sso" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/sso" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/logging/kustomization.yaml b/services/logging/kustomization.yaml index 08c73a8..dc48715 100644 --- a/services/logging/kustomization.yaml +++ b/services/logging/kustomization.yaml @@ -15,9 +15,9 @@ resources: - opensearch-dashboards-helmrelease.yaml - data-prepper-helmrelease.yaml - otel-collector-helmrelease.yaml - - opensearch-ism-job.yaml - - opensearch-dashboards-setup-job.yaml - - opensearch-observability-setup-job.yaml + - oneoffs/opensearch-ism-job.yaml + - oneoffs/opensearch-dashboards-setup-job.yaml + - oneoffs/opensearch-observability-setup-job.yaml - opensearch-prune-cronjob.yaml - fluent-bit-helmrelease.yaml - node-log-rotation-daemonset.yaml diff --git a/services/logging/opensearch-dashboards-setup-job.yaml b/services/logging/oneoffs/opensearch-dashboards-setup-job.yaml similarity index 88% rename from services/logging/opensearch-dashboards-setup-job.yaml rename to services/logging/oneoffs/opensearch-dashboards-setup-job.yaml index 06149d7..1d1a9b6 100644 --- a/services/logging/opensearch-dashboards-setup-job.yaml +++ b/services/logging/oneoffs/opensearch-dashboards-setup-job.yaml @@ -1,10 +1,15 @@ -# services/logging/opensearch-dashboards-setup-job.yaml +# services/logging/oneoffs/opensearch-dashboards-setup-job.yaml +# One-off job for logging/opensearch-dashboards-setup-4. +# Purpose: opensearch dashboards setup 4 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
apiVersion: batch/v1 kind: Job metadata: name: opensearch-dashboards-setup-4 namespace: logging spec: + suspend: true backoffLimit: 3 ttlSecondsAfterFinished: 3600 template: diff --git a/services/logging/opensearch-ism-job.yaml b/services/logging/oneoffs/opensearch-ism-job.yaml similarity index 91% rename from services/logging/opensearch-ism-job.yaml rename to services/logging/oneoffs/opensearch-ism-job.yaml index 3313571..476bca7 100644 --- a/services/logging/opensearch-ism-job.yaml +++ b/services/logging/oneoffs/opensearch-ism-job.yaml @@ -1,10 +1,15 @@ -# services/logging/opensearch-ism-job.yaml +# services/logging/oneoffs/opensearch-ism-job.yaml +# One-off job for logging/opensearch-ism-setup-5. +# Purpose: opensearch ism setup 5 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: opensearch-ism-setup-5 namespace: logging spec: + suspend: true backoffLimit: 3 ttlSecondsAfterFinished: 3600 template: diff --git a/services/logging/opensearch-observability-setup-job.yaml b/services/logging/oneoffs/opensearch-observability-setup-job.yaml similarity index 76% rename from services/logging/opensearch-observability-setup-job.yaml rename to services/logging/oneoffs/opensearch-observability-setup-job.yaml index e4590fb..6caa076 100644 --- a/services/logging/opensearch-observability-setup-job.yaml +++ b/services/logging/oneoffs/opensearch-observability-setup-job.yaml @@ -1,10 +1,15 @@ -# services/logging/opensearch-observability-setup-job.yaml +# services/logging/oneoffs/opensearch-observability-setup-job.yaml +# One-off job for logging/opensearch-observability-setup-2. +# Purpose: opensearch observability setup 2 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. 
apiVersion: batch/v1 kind: Job metadata: name: opensearch-observability-setup-2 namespace: logging spec: + suspend: true backoffLimit: 3 ttlSecondsAfterFinished: 3600 template: diff --git a/services/logging/opensearch-prune-cronjob.yaml b/services/logging/opensearch-prune-cronjob.yaml index 75e72db..dc0dffb 100644 --- a/services/logging/opensearch-prune-cronjob.yaml +++ b/services/logging/opensearch-prune-cronjob.yaml @@ -6,6 +6,7 @@ metadata: namespace: logging spec: schedule: "23 3 * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 diff --git a/services/logging/secretproviderclass.yaml b/services/logging/secretproviderclass.yaml index f5db15e..6ff642d 100644 --- a/services/logging/secretproviderclass.yaml +++ b/services/logging/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "logging" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/logging" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/mailu/helmrelease.yaml b/services/mailu/helmrelease.yaml index 7342141..2a7e6f5 100644 --- a/services/mailu/helmrelease.yaml +++ b/services/mailu/helmrelease.yaml @@ -219,6 +219,8 @@ spec: overrides: postfix.cf: | mynetworks = 127.0.0.0/8 [::1]/128 10.42.0.0/16 10.43.0.0/16 192.168.22.0/24 + recipient_canonical_maps = regexp:/overrides/recipient_canonical, socketmap:unix:/tmp/podop.socket:recipientmap + recipient_canonical_classes = envelope_recipient,header_recipient smtpd_delay_reject = yes smtpd_helo_required = yes smtpd_helo_restrictions = reject_invalid_helo_hostname, reject_non_fqdn_helo_hostname, reject_unknown_helo_hostname @@ -238,8 +240,10 @@ spec: smtpd_client_message_rate_limit = 100 smtpd_client_recipient_rate_limit = 200 smtpd_recipient_limit = 100 + recipient_canonical: | + /^double-bounce@mail\.bstein\.dev$/ double-bounce@bstein.dev podAnnotations: - bstein.dev/restarted-at: "2026-01-06T00:00:00Z" + bstein.dev/restarted-at: "2026-01-20T04:35:00Z" redis: enabled: true architecture: standalone @@ -335,8 +339,15 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} + {{- $access := index .Data.data "accesskey" -}} + {{- $secret := index .Data.data "secretkey" -}} + {{- if and $access $secret }} + export RELAYUSER="{{ $access }}" + export RELAYPASSWORD="{{ $secret }}" + {{- else }} export RELAYUSER="{{ index .Data.data "apikey" }}" export RELAYPASSWORD="{{ index .Data.data "apikey" }}" + {{- end }} {{ end }} spec: serviceAccountName: mailu-vault-sync @@ -397,8 +408,15 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} + {{- $access := index .Data.data "accesskey" -}} + {{- $secret := index .Data.data "secretkey" -}} + {{- if and $access $secret }} + export RELAYUSER="{{ $access }}" + export RELAYPASSWORD="{{ $secret }}" + {{- else }} export RELAYUSER="{{ index .Data.data "apikey" }}" export RELAYPASSWORD="{{ index .Data.data "apikey" }}" + {{- end }} {{ end }} spec: serviceAccountName: mailu-vault-sync @@ -441,6 +459,8 @@ spec: metadata: name: mailu-postfix spec: + strategy: + type: Recreate template: metadata: annotations: @@ -459,8 +479,15 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} + {{- $access := index .Data.data "accesskey" -}} + {{- $secret := index 
.Data.data "secretkey" -}} + {{- if and $access $secret }} + export RELAYUSER="{{ $access }}" + export RELAYPASSWORD="{{ $secret }}" + {{- else }} export RELAYUSER="{{ index .Data.data "apikey" }}" export RELAYPASSWORD="{{ index .Data.data "apikey" }}" + {{- end }} {{ end }} spec: serviceAccountName: mailu-vault-sync @@ -521,8 +548,15 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} + {{- $access := index .Data.data "accesskey" -}} + {{- $secret := index .Data.data "secretkey" -}} + {{- if and $access $secret }} + export RELAYUSER="{{ $access }}" + export RELAYPASSWORD="{{ $secret }}" + {{- else }} export RELAYUSER="{{ index .Data.data "apikey" }}" export RELAYPASSWORD="{{ index .Data.data "apikey" }}" + {{- end }} {{ end }} spec: serviceAccountName: mailu-vault-sync @@ -583,8 +617,15 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} + {{- $access := index .Data.data "accesskey" -}} + {{- $secret := index .Data.data "secretkey" -}} + {{- if and $access $secret }} + export RELAYUSER="{{ $access }}" + export RELAYPASSWORD="{{ $secret }}" + {{- else }} export RELAYUSER="{{ index .Data.data "apikey" }}" export RELAYPASSWORD="{{ index .Data.data "apikey" }}" + {{- end }} {{ end }} spec: serviceAccountName: mailu-vault-sync @@ -645,8 +686,15 @@ spec: export INITIAL_ADMIN_PW="{{ .Data.data.password }}" {{ end }} {{ with secret "kv/data/atlas/shared/postmark-relay" }} + {{- $access := index .Data.data "accesskey" -}} + {{- $secret := index .Data.data "secretkey" -}} + {{- if and $access $secret }} + export RELAYUSER="{{ $access }}" + export RELAYPASSWORD="{{ $secret }}" + {{- else }} export RELAYUSER="{{ index .Data.data "apikey" }}" export RELAYPASSWORD="{{ index .Data.data "apikey" }}" + {{- end }} {{ end }} spec: serviceAccountName: mailu-vault-sync diff --git a/services/mailu/kustomization.yaml b/services/mailu/kustomization.yaml index 5c111eb..3e0494e 100644 --- a/services/mailu/kustomization.yaml +++ b/services/mailu/kustomization.yaml @@ -13,9 +13,8 @@ resources: - unbound-configmap.yaml - serverstransport.yaml - ingressroute.yaml - - mailu-sync-job.yaml + - oneoffs/mailu-sync-job.yaml - mailu-sync-cronjob.yaml - - mailu-sync-listener.yaml - front-lb.yaml configMapGenerator: @@ -31,10 +30,6 @@ configMapGenerator: - sync.py=scripts/mailu_sync.py options: disableNameSuffixHash: true - - name: mailu-sync-listener - namespace: mailu-mailserver - files: - - listener.py=scripts/mailu_sync_listener.py - name: mailu-vault-entrypoint namespace: mailu-mailserver files: diff --git a/services/mailu/mailu-sync-cronjob.yaml b/services/mailu/mailu-sync-cronjob.yaml index 1da1981..bbe9909 100644 --- a/services/mailu/mailu-sync-cronjob.yaml +++ b/services/mailu/mailu-sync-cronjob.yaml @@ -8,6 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "30 4 * * *" + suspend: true concurrencyPolicy: Forbid jobTemplate: spec: @@ -37,6 +38,9 @@ spec: {{- with secret "kv/data/atlas/mailu/mailu-initial-account-secret" -}}{{ .Data.data.password }}{{- end -}} spec: restartPolicy: OnFailure + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" serviceAccountName: mailu-vault-sync containers: - name: mailu-sync diff --git a/services/mailu/mailu-sync-listener.yaml b/services/mailu/mailu-sync-listener.yaml index cc98107..0644c5b 100644 --- a/services/mailu/mailu-sync-listener.yaml +++ b/services/mailu/mailu-sync-listener.yaml @@ 
-30,7 +30,7 @@ spec: app: mailu-sync-listener annotations: vault.hashicorp.com/agent-inject: "true" - atlas.bstein.dev/mailu-sync-rev: "2" + atlas.bstein.dev/mailu-sync-rev: "4" vault.hashicorp.com/role: "mailu-mailserver" vault.hashicorp.com/agent-inject-secret-mailu-db-secret__database: "kv/data/atlas/mailu/mailu-db-secret" vault.hashicorp.com/agent-inject-template-mailu-db-secret__database: | @@ -52,6 +52,9 @@ spec: {{- with secret "kv/data/atlas/mailu/mailu-initial-account-secret" -}}{{ .Data.data.password }}{{- end -}} spec: restartPolicy: Always + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" serviceAccountName: mailu-vault-sync containers: - name: listener diff --git a/services/mailu/mailu-sync-job.yaml b/services/mailu/oneoffs/mailu-sync-job.yaml similarity index 93% rename from services/mailu/mailu-sync-job.yaml rename to services/mailu/oneoffs/mailu-sync-job.yaml index 8589e9e..38648ac 100644 --- a/services/mailu/mailu-sync-job.yaml +++ b/services/mailu/oneoffs/mailu-sync-job.yaml @@ -1,10 +1,15 @@ -# services/mailu/mailu-sync-job.yaml +# services/mailu/oneoffs/mailu-sync-job.yaml +# One-off job for mailu-mailserver/mailu-sync-9. +# Purpose: mailu sync 9 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: mailu-sync-9 namespace: mailu-mailserver spec: + suspend: true template: metadata: annotations: diff --git a/services/mailu/scripts/mailu_sync.py b/services/mailu/scripts/mailu_sync.py index 001917a..71b0f5a 100644 --- a/services/mailu/scripts/mailu_sync.py +++ b/services/mailu/scripts/mailu_sync.py @@ -130,7 +130,9 @@ def kc_update_attributes(token, user, attributes): if not isinstance(current_attrs, dict): current_attrs = {} current_attrs.update(attributes) - resp = SESSION.put(user_url, headers=headers, json={"attributes": current_attrs}, timeout=20) + payload = _safe_update_payload(current_payload) + payload["attributes"] = current_attrs + resp = SESSION.put(user_url, headers=headers, json=payload, timeout=20) resp.raise_for_status() verify = SESSION.get( user_url, @@ -144,6 +146,34 @@ def kc_update_attributes(token, user, attributes): raise Exception(f"attribute not persisted for {user.get('email') or user['username']}") +def _safe_update_payload(user_payload: dict) -> dict: + payload: dict = {} + username = user_payload.get("username") + if isinstance(username, str): + payload["username"] = username + enabled = user_payload.get("enabled") + if isinstance(enabled, bool): + payload["enabled"] = enabled + email = user_payload.get("email") + if isinstance(email, str): + payload["email"] = email + email_verified = user_payload.get("emailVerified") + if isinstance(email_verified, bool): + payload["emailVerified"] = email_verified + first_name = user_payload.get("firstName") + if isinstance(first_name, str): + payload["firstName"] = first_name + last_name = user_payload.get("lastName") + if isinstance(last_name, str): + payload["lastName"] = last_name + actions = user_payload.get("requiredActions") + if isinstance(actions, list): + payload["requiredActions"] = [a for a in actions if isinstance(a, str)] + attrs = user_payload.get("attributes") + payload["attributes"] = attrs if isinstance(attrs, dict) else {} + return payload + + def random_password(): alphabet = string.ascii_letters + string.digits return "".join(secrets.choice(alphabet) for _ in range(24)) diff 
--git a/services/mailu/scripts/mailu_sync_listener.py b/services/mailu/scripts/mailu_sync_listener.py index 6ac0da7..4e31c81 100644 --- a/services/mailu/scripts/mailu_sync_listener.py +++ b/services/mailu/scripts/mailu_sync_listener.py @@ -39,12 +39,12 @@ def _run_sync_blocking() -> int: sync_done.set() -def _trigger_sync_async() -> bool: +def _trigger_sync_async(force: bool = False) -> bool: with lock: now = time() if sync_running: return False - if now - last_run < MIN_INTERVAL_SECONDS: + if not force and now - last_run < MIN_INTERVAL_SECONDS: return False thread = threading.Thread(target=_run_sync_blocking, daemon=True) @@ -64,15 +64,17 @@ class Handler(http.server.BaseHTTPRequestHandler): return wait = False + force = False if isinstance(payload, dict): wait = bool(payload.get("wait")) + force = bool(payload.get("force")) if wait: with lock: already_running = sync_running if not already_running: - _trigger_sync_async() + _trigger_sync_async(force=force) sync_done.wait(timeout=WAIT_TIMEOUT_SECONDS) with lock: @@ -87,7 +89,7 @@ class Handler(http.server.BaseHTTPRequestHandler): self.end_headers() return - _trigger_sync_async() + _trigger_sync_async(force=force) self.send_response(202) self.end_headers() diff --git a/services/mailu/secretproviderclass.yaml b/services/mailu/secretproviderclass.yaml index f58c69b..f9e281e 100644 --- a/services/mailu/secretproviderclass.yaml +++ b/services/mailu/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "mailu-mailserver" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/mailu-mailserver" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml new file mode 100644 index 0000000..fce1ded --- /dev/null +++ b/services/maintenance/ariadne-deployment.yaml @@ -0,0 +1,359 @@ +# services/maintenance/ariadne-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ariadne + namespace: maintenance +spec: + replicas: 1 + revisionHistoryLimit: 3 + selector: + matchLabels: + app: ariadne + template: + metadata: + labels: + app: ariadne + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + vault.hashicorp.com/agent-inject: "true" + vault.hashicorp.com/role: "maintenance" + vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db" + vault.hashicorp.com/agent-inject-template-ariadne-env.sh: | + {{ with secret "kv/data/atlas/maintenance/ariadne-db" }} + export ARIADNE_DATABASE_URL="{{ .Data.data.database_url }}" + {{ end }} + {{ with secret "kv/data/atlas/portal/atlas-portal-db" }} + export PORTAL_DATABASE_URL="{{ .Data.data.PORTAL_DATABASE_URL }}" + {{ end }} + {{ with secret "kv/data/atlas/portal/bstein-dev-home-keycloak-admin" }} + export KEYCLOAK_ADMIN_CLIENT_SECRET="{{ .Data.data.client_secret }}" + {{ end }} + {{ with secret "kv/data/atlas/nextcloud/nextcloud-db" }} + export NEXTCLOUD_DB_NAME="{{ .Data.data.database }}" + export NEXTCLOUD_DB_USER="{{ index .Data.data "db-username" }}" + export NEXTCLOUD_DB_PASSWORD="{{ index .Data.data "db-password" }}" + {{ end }} + {{ with secret "kv/data/atlas/nextcloud/nextcloud-admin" }} + export NEXTCLOUD_ADMIN_USER="{{ index .Data.data "admin-user" }}" + export NEXTCLOUD_ADMIN_PASSWORD="{{ index .Data.data "admin-password" }}" + {{ end }} + {{ with secret "kv/data/atlas/health/wger-admin" 
}} + export WGER_ADMIN_USERNAME="{{ .Data.data.username }}" + export WGER_ADMIN_PASSWORD="{{ .Data.data.password }}" + {{ end }} + {{ with secret "kv/data/atlas/finance/firefly-secrets" }} + export FIREFLY_CRON_TOKEN="{{ .Data.data.STATIC_CRON_TOKEN }}" + {{ end }} + {{ with secret "kv/data/atlas/mailu/mailu-db-secret" }} + export MAILU_DB_NAME="{{ .Data.data.database }}" + export MAILU_DB_USER="{{ .Data.data.username }}" + export MAILU_DB_PASSWORD="{{ .Data.data.password }}" + {{ end }} + {{ with secret "kv/data/atlas/mailu/mailu-initial-account-secret" }} + export SMTP_HOST="mailu-front.mailu-mailserver.svc.cluster.local" + export SMTP_PORT="587" + export SMTP_STARTTLS="true" + export SMTP_USE_TLS="false" + export SMTP_USERNAME="no-reply-portal@bstein.dev" + export SMTP_PASSWORD="{{ .Data.data.password }}" + export SMTP_FROM="no-reply-portal@bstein.dev" + export MAILU_SYSTEM_PASSWORD="{{ .Data.data.password }}" + {{ end }} + {{ with secret "kv/data/atlas/comms/mas-admin-client-runtime" }} + export COMMS_MAS_ADMIN_CLIENT_SECRET="{{ .Data.data.client_secret }}" + {{ end }} + {{ with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" }} + export COMMS_BOT_PASSWORD="{{ index .Data.data "bot-password" }}" + export COMMS_SEEDER_PASSWORD="{{ index .Data.data "seeder-password" }}" + {{ end }} + {{ with secret "kv/data/atlas/comms/synapse-admin" }} + export COMMS_SYNAPSE_ADMIN_TOKEN="{{ .Data.data.access_token }}" + {{ end }} + {{ with secret "kv/data/atlas/comms/synapse-db" }} + export COMMS_SYNAPSE_DB_PASSWORD="{{ .Data.data.POSTGRES_PASSWORD }}" + {{ end }} + {{ with secret "kv/data/atlas/vault/vault-oidc-config" }} + export VAULT_OIDC_DISCOVERY_URL="{{ .Data.data.discovery_url }}" + export VAULT_OIDC_CLIENT_ID="{{ .Data.data.client_id }}" + export VAULT_OIDC_CLIENT_SECRET="{{ .Data.data.client_secret }}" + export VAULT_OIDC_DEFAULT_ROLE="{{ .Data.data.default_role }}" + export VAULT_OIDC_SCOPES="{{ .Data.data.scopes }}" + export VAULT_OIDC_USER_CLAIM="{{ .Data.data.user_claim }}" + export VAULT_OIDC_GROUPS_CLAIM="{{ .Data.data.groups_claim }}" + export VAULT_OIDC_TOKEN_POLICIES="{{ .Data.data.token_policies }}" + export VAULT_OIDC_ADMIN_GROUP="{{ .Data.data.admin_group }}" + export VAULT_OIDC_ADMIN_POLICIES="{{ .Data.data.admin_policies }}" + export VAULT_OIDC_DEV_GROUP="{{ .Data.data.dev_group }}" + export VAULT_OIDC_DEV_POLICIES="{{ .Data.data.dev_policies }}" + export VAULT_OIDC_USER_GROUP="{{ .Data.data.user_group }}" + export VAULT_OIDC_USER_POLICIES="{{ .Data.data.user_policies }}" + export VAULT_OIDC_REDIRECT_URIS="{{ .Data.data.redirect_uris }}" + export VAULT_OIDC_BOUND_AUDIENCES="{{ .Data.data.bound_audiences }}" + {{- if .Data.data.bound_claims_type }} + export VAULT_OIDC_BOUND_CLAIMS_TYPE="{{ .Data.data.bound_claims_type }}" + {{- else }} + export VAULT_OIDC_BOUND_CLAIMS_TYPE="string" + {{- end }} + {{ end }} + spec: + serviceAccountName: ariadne + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" + containers: + - name: ariadne + image: registry.bstein.dev/bstein/ariadne:0.1.0-0 + imagePullPolicy: Always + command: ["/bin/sh", "-c"] + args: + - >- + . 
/vault/secrets/ariadne-env.sh + && exec uvicorn ariadne.app:app --host 0.0.0.0 --port 8080 + ports: + - name: http + containerPort: 8080 + env: + - name: KEYCLOAK_URL + value: https://sso.bstein.dev + - name: KEYCLOAK_REALM + value: atlas + - name: KEYCLOAK_CLIENT_ID + value: bstein-dev-home + - name: KEYCLOAK_ISSUER + value: https://sso.bstein.dev/realms/atlas + - name: KEYCLOAK_JWKS_URL + value: http://keycloak.sso.svc.cluster.local/realms/atlas/protocol/openid-connect/certs + - name: KEYCLOAK_ADMIN_URL + value: http://keycloak.sso.svc.cluster.local + - name: KEYCLOAK_ADMIN_REALM + value: atlas + - name: KEYCLOAK_ADMIN_CLIENT_ID + value: bstein-dev-home-admin + - name: PORTAL_PUBLIC_BASE_URL + value: https://bstein.dev + - name: ARIADNE_LOG_LEVEL + value: INFO + - name: ARIADNE_DB_POOL_MIN + value: "0" + - name: ARIADNE_DB_POOL_MAX + value: "5" + - name: ARIADNE_DB_CONNECT_TIMEOUT_SEC + value: "5" + - name: ARIADNE_DB_LOCK_TIMEOUT_SEC + value: "5" + - name: ARIADNE_DB_STATEMENT_TIMEOUT_SEC + value: "30" + - name: ARIADNE_DB_IDLE_IN_TX_TIMEOUT_SEC + value: "10" + - name: ARIADNE_RUN_MIGRATIONS + value: "false" + - name: PORTAL_ADMIN_USERS + value: bstein + - name: PORTAL_ADMIN_GROUPS + value: admin + - name: ACCOUNT_ALLOWED_GROUPS + value: dev,admin + - name: ALLOWED_FLAG_GROUPS + value: demo,test,vaultwarden_grandfathered + - name: DEFAULT_USER_GROUPS + value: dev + - name: MAILU_DOMAIN + value: bstein.dev + - name: MAILU_HOST + value: mail.bstein.dev + - name: MAILU_SYNC_URL + value: http://ariadne.maintenance.svc.cluster.local/events + - name: MAILU_EVENT_MIN_INTERVAL_SEC + value: "10" + - name: MAILU_SYSTEM_USERS + value: no-reply-portal@bstein.dev,no-reply-vaultwarden@bstein.dev + - name: MAILU_MAILBOX_WAIT_TIMEOUT_SEC + value: "180" + - name: MAILU_DB_HOST + value: postgres-service.postgres.svc.cluster.local + - name: MAILU_DB_PORT + value: "5432" + - name: NEXTCLOUD_NAMESPACE + value: nextcloud + - name: NEXTCLOUD_POD_LABEL + value: app=nextcloud + - name: NEXTCLOUD_CONTAINER + value: nextcloud + - name: NEXTCLOUD_EXEC_TIMEOUT_SEC + value: "120" + - name: NEXTCLOUD_URL + value: https://cloud.bstein.dev + - name: NEXTCLOUD_DB_HOST + value: postgres-service.postgres.svc.cluster.local + - name: NEXTCLOUD_DB_PORT + value: "5432" + - name: WGER_NAMESPACE + value: health + - name: WGER_USER_SYNC_WAIT_TIMEOUT_SEC + value: "90" + - name: WGER_POD_LABEL + value: app=wger + - name: WGER_CONTAINER + value: wger + - name: WGER_ADMIN_EMAIL + value: brad@bstein.dev + - name: FIREFLY_NAMESPACE + value: finance + - name: FIREFLY_USER_SYNC_WAIT_TIMEOUT_SEC + value: "90" + - name: FIREFLY_POD_LABEL + value: app=firefly + - name: FIREFLY_CONTAINER + value: firefly + - name: FIREFLY_CRON_BASE_URL + value: http://firefly.finance.svc.cluster.local/api/v1/cron + - name: FIREFLY_CRON_TIMEOUT_SEC + value: "30" + - name: VAULT_NAMESPACE + value: vault + - name: VAULT_ADDR + value: http://vault.vault.svc.cluster.local:8200 + - name: VAULT_K8S_ROLE + value: vault-admin + - name: VAULT_K8S_ROLE_TTL + value: 1h + - name: COMMS_NAMESPACE + value: comms + - name: COMMS_SYNAPSE_BASE + value: http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008 + - name: COMMS_AUTH_BASE + value: http://matrix-authentication-service.comms.svc.cluster.local:8080 + - name: COMMS_MAS_ADMIN_API_BASE + value: http://matrix-authentication-service.comms.svc.cluster.local:8081/api/admin/v1 + - name: COMMS_MAS_TOKEN_URL + value: http://matrix-authentication-service.comms.svc.cluster.local:8080/oauth2/token + - name: 
COMMS_MAS_ADMIN_CLIENT_ID + value: 01KDXMVQBQ5JNY6SEJPZW6Z8BM + - name: COMMS_SERVER_NAME + value: live.bstein.dev + - name: COMMS_ROOM_ALIAS + value: "#othrys:live.bstein.dev" + - name: COMMS_ROOM_NAME + value: Othrys + - name: COMMS_PIN_MESSAGE + value: "Invite guests: share https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join and choose 'Continue' -> 'Join as guest'." + - name: COMMS_SEEDER_USER + value: othrys-seeder + - name: COMMS_BOT_USER + value: atlasbot + - name: COMMS_SYNAPSE_DB_HOST + value: postgres-service.postgres.svc.cluster.local + - name: COMMS_SYNAPSE_DB_PORT + value: "5432" + - name: COMMS_SYNAPSE_DB_NAME + value: synapse + - name: COMMS_SYNAPSE_DB_USER + value: synapse + - name: COMMS_TIMEOUT_SEC + value: "30" + - name: COMMS_GUEST_STALE_DAYS + value: "14" + - name: VAULTWARDEN_NAMESPACE + value: vaultwarden + - name: VAULTWARDEN_POD_LABEL + value: app=vaultwarden + - name: VAULTWARDEN_POD_PORT + value: "80" + - name: VAULTWARDEN_SERVICE_HOST + value: vaultwarden-service.vaultwarden.svc.cluster.local + - name: VAULTWARDEN_ADMIN_SECRET_NAME + value: vaultwarden-admin + - name: VAULTWARDEN_ADMIN_SECRET_KEY + value: ADMIN_TOKEN + - name: VAULTWARDEN_ADMIN_SESSION_TTL_SEC + value: "900" + - name: VAULTWARDEN_ADMIN_RATE_LIMIT_BACKOFF_SEC + value: "600" + - name: VAULTWARDEN_RETRY_COOLDOWN_SEC + value: "1800" + - name: VAULTWARDEN_FAILURE_BAILOUT + value: "2" + - name: ARIADNE_PROVISION_POLL_INTERVAL_SEC + value: "5" + - name: ARIADNE_PROVISION_RETRY_COOLDOWN_SEC + value: "30" + - name: ARIADNE_SCHEDULE_TICK_SEC + value: "5" + - name: ARIADNE_SCHEDULE_MAILU_SYNC + value: "30 4 * * *" + - name: ARIADNE_SCHEDULE_NEXTCLOUD_SYNC + value: "0 5 * * *" + - name: ARIADNE_SCHEDULE_NEXTCLOUD_CRON + value: "*/5 * * * *" + - name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE + value: "30 4 * * *" + - name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC + value: "0 * * * *" + - name: ARIADNE_SCHEDULE_WGER_USER_SYNC + value: "0 5 * * *" + - name: ARIADNE_SCHEDULE_WGER_ADMIN + value: "15 3 * * *" + - name: ARIADNE_SCHEDULE_FIREFLY_USER_SYNC + value: "0 6 * * *" + - name: ARIADNE_SCHEDULE_FIREFLY_CRON + value: "0 3 * * *" + - name: ARIADNE_SCHEDULE_POD_CLEANER + value: "0 * * * *" + - name: ARIADNE_SCHEDULE_OPENSEARCH_PRUNE + value: "23 3 * * *" + - name: ARIADNE_SCHEDULE_IMAGE_SWEEPER + value: "30 4 * * 0" + - name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH + value: "0 * * * *" + - name: ARIADNE_SCHEDULE_VAULT_OIDC + value: "0 * * * *" + - name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME + value: "*/5 * * * *" + - name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE + value: "0 0 1 * *" + - name: ARIADNE_SCHEDULE_COMMS_RESET_ROOM + value: "0 0 1 1 *" + - name: ARIADNE_SCHEDULE_COMMS_SEED_ROOM + value: "*/10 * * * *" + - name: ARIADNE_SCHEDULE_CLUSTER_STATE + value: "*/15 * * * *" + - name: ARIADNE_CLUSTER_STATE_KEEP + value: "168" + - name: WELCOME_EMAIL_ENABLED + value: "true" + - name: K8S_API_TIMEOUT_SEC + value: "5" + - name: ARIADNE_VM_URL + value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428 + - name: ARIADNE_CLUSTER_STATE_VM_TIMEOUT_SEC + value: "5" + - name: OPENSEARCH_URL + value: http://opensearch-master.logging.svc.cluster.local:9200 + - name: OPENSEARCH_LIMIT_BYTES + value: "1099511627776" + - name: OPENSEARCH_INDEX_PATTERNS + value: kube-*,journald-*,trace-analytics-* + - name: METRICS_PATH + value: "/metrics" + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi + livenessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 10 + 
periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 5 + periodSeconds: 10 diff --git a/services/maintenance/ariadne-rbac.yaml b/services/maintenance/ariadne-rbac.yaml new file mode 100644 index 0000000..33620d0 --- /dev/null +++ b/services/maintenance/ariadne-rbac.yaml @@ -0,0 +1,58 @@ +# services/maintenance/ariadne-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: ariadne-job-spawner +rules: + - apiGroups: ["batch"] + resources: + - jobs + - cronjobs + verbs: + - get + - list + - watch + - create + - apiGroups: [""] + resources: + - pods + verbs: + - get + - list + - watch + - delete + - apiGroups: [""] + resources: + - nodes + - namespaces + verbs: + - get + - list + - watch + - apiGroups: [""] + resources: + - pods/exec + verbs: + - get + - create + - apiGroups: ["kustomize.toolkit.fluxcd.io"] + resources: + - kustomizations + verbs: + - get + - list + - watch + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: ariadne-job-spawner +subjects: + - kind: ServiceAccount + name: ariadne + namespace: maintenance +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: ariadne-job-spawner diff --git a/services/maintenance/ariadne-service.yaml b/services/maintenance/ariadne-service.yaml new file mode 100644 index 0000000..9c93e1d --- /dev/null +++ b/services/maintenance/ariadne-service.yaml @@ -0,0 +1,13 @@ +# services/maintenance/ariadne-service.yaml +apiVersion: v1 +kind: Service +metadata: + name: ariadne + namespace: maintenance +spec: + selector: + app: ariadne + ports: + - name: http + port: 80 + targetPort: http diff --git a/services/maintenance/ariadne-serviceaccount.yaml b/services/maintenance/ariadne-serviceaccount.yaml new file mode 100644 index 0000000..9adcef7 --- /dev/null +++ b/services/maintenance/ariadne-serviceaccount.yaml @@ -0,0 +1,8 @@ +# services/maintenance/ariadne-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: ariadne + namespace: maintenance +imagePullSecrets: + - name: harbor-regcred diff --git a/services/maintenance/image-sweeper-cronjob.yaml b/services/maintenance/image-sweeper-cronjob.yaml index c94fcca..0039206 100644 --- a/services/maintenance/image-sweeper-cronjob.yaml +++ b/services/maintenance/image-sweeper-cronjob.yaml @@ -6,6 +6,7 @@ metadata: namespace: maintenance spec: schedule: "30 4 * * 0" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 2 failedJobsHistoryLimit: 2 diff --git a/services/maintenance/image.yaml b/services/maintenance/image.yaml new file mode 100644 index 0000000..fd28d90 --- /dev/null +++ b/services/maintenance/image.yaml @@ -0,0 +1,23 @@ +# services/maintenance/image.yaml +apiVersion: image.toolkit.fluxcd.io/v1beta2 +kind: ImageRepository +metadata: + name: ariadne + namespace: maintenance +spec: + image: registry.bstein.dev/bstein/ariadne + interval: 1m0s + secretRef: + name: harbor-regcred +--- +apiVersion: image.toolkit.fluxcd.io/v1beta2 +kind: ImagePolicy +metadata: + name: ariadne + namespace: maintenance +spec: + imageRepositoryRef: + name: ariadne + policy: + semver: + range: ">=0.1.0-0" diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index e53ed3c..19b2ba9 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -3,19 +3,30 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - namespace.yaml + - image.yaml + - 
secretproviderclass.yaml + - vault-serviceaccount.yaml + - vault-sync-deployment.yaml + - ariadne-serviceaccount.yaml + - ariadne-rbac.yaml - disable-k3s-traefik-serviceaccount.yaml - k3s-traefik-cleanup-rbac.yaml - node-nofile-serviceaccount.yaml - pod-cleaner-rbac.yaml + - ariadne-deployment.yaml + - oneoffs/ariadne-migrate-job.yaml + - ariadne-service.yaml - disable-k3s-traefik-daemonset.yaml - - k3s-traefik-cleanup-job.yaml + - oneoffs/k3s-traefik-cleanup-job.yaml - node-nofile-daemonset.yaml - k3s-agent-restart-daemonset.yaml - pod-cleaner-cronjob.yaml - node-image-sweeper-serviceaccount.yaml - node-image-sweeper-daemonset.yaml - image-sweeper-cronjob.yaml - +images: + - name: registry.bstein.dev/bstein/ariadne + newTag: 0.1.0-59 # {"$imagepolicy": "maintenance:ariadne:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance diff --git a/services/maintenance/oneoffs/ariadne-migrate-job.yaml b/services/maintenance/oneoffs/ariadne-migrate-job.yaml new file mode 100644 index 0000000..ecac68d --- /dev/null +++ b/services/maintenance/oneoffs/ariadne-migrate-job.yaml @@ -0,0 +1,50 @@ +# services/maintenance/oneoffs/ariadne-migrate-job.yaml +# One-off job for maintenance/ariadne-migrate-2. +# Purpose: ariadne migrate 2 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. +apiVersion: batch/v1 +kind: Job +metadata: + name: ariadne-migrate-2 + namespace: maintenance + annotations: + kustomize.toolkit.fluxcd.io/force: "true" +spec: + suspend: true + backoffLimit: 1 + ttlSecondsAfterFinished: 3600 + template: + metadata: + labels: + app: ariadne-migrate + annotations: + vault.hashicorp.com/agent-inject: "true" + vault.hashicorp.com/agent-pre-populate-only: "true" + vault.hashicorp.com/role: "maintenance" + vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db" + vault.hashicorp.com/agent-inject-template-ariadne-env.sh: | + {{ with secret "kv/data/atlas/maintenance/ariadne-db" }} + export ARIADNE_DATABASE_URL="{{ .Data.data.database_url }}" + {{ end }} + {{ with secret "kv/data/atlas/portal/atlas-portal-db" }} + export PORTAL_DATABASE_URL="{{ .Data.data.PORTAL_DATABASE_URL }}" + {{ end }} + spec: + serviceAccountName: ariadne + restartPolicy: Never + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/worker: "true" + containers: + - name: migrate + image: registry.bstein.dev/bstein/ariadne:0.1.0-0 + imagePullPolicy: Always + command: ["/bin/sh", "-c"] + args: + - >- + . /vault/secrets/ariadne-env.sh + && exec python -m ariadne.migrate + env: + - name: ARIADNE_RUN_MIGRATIONS + value: "true" diff --git a/services/maintenance/k3s-traefik-cleanup-job.yaml b/services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml similarity index 77% rename from services/maintenance/k3s-traefik-cleanup-job.yaml rename to services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml index d5d12a6..2c365a9 100644 --- a/services/maintenance/k3s-traefik-cleanup-job.yaml +++ b/services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml @@ -1,10 +1,15 @@ -# services/maintenance/k3s-traefik-cleanup-job.yaml +# services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml +# One-off job for maintenance/k3s-traefik-cleanup-2. +# Purpose: k3s traefik cleanup 2 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. 
+# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: k3s-traefik-cleanup-2 namespace: maintenance spec: + suspend: true backoffLimit: 1 template: spec: diff --git a/services/maintenance/pod-cleaner-cronjob.yaml b/services/maintenance/pod-cleaner-cronjob.yaml index e083c85..99d13f6 100644 --- a/services/maintenance/pod-cleaner-cronjob.yaml +++ b/services/maintenance/pod-cleaner-cronjob.yaml @@ -6,6 +6,7 @@ metadata: namespace: maintenance spec: schedule: "0 * * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 diff --git a/services/maintenance/secretproviderclass.yaml b/services/maintenance/secretproviderclass.yaml new file mode 100644 index 0000000..85df2af --- /dev/null +++ b/services/maintenance/secretproviderclass.yaml @@ -0,0 +1,21 @@ +# services/maintenance/secretproviderclass.yaml +apiVersion: secrets-store.csi.x-k8s.io/v1 +kind: SecretProviderClass +metadata: + name: maintenance-vault + namespace: maintenance +spec: + provider: vault + parameters: + vaultAddress: "http://vault.vault.svc.cluster.local:8200" + roleName: "maintenance" + objects: | + - objectName: "harbor-pull__dockerconfigjson" + secretPath: "kv/data/atlas/shared/harbor-pull" + secretKey: "dockerconfigjson" + secretObjects: + - secretName: harbor-regcred + type: kubernetes.io/dockerconfigjson + data: + - objectName: harbor-pull__dockerconfigjson + key: .dockerconfigjson diff --git a/services/maintenance/vault-serviceaccount.yaml b/services/maintenance/vault-serviceaccount.yaml new file mode 100644 index 0000000..f60b43e --- /dev/null +++ b/services/maintenance/vault-serviceaccount.yaml @@ -0,0 +1,6 @@ +# services/maintenance/vault-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: maintenance-vault-sync + namespace: maintenance diff --git a/services/maintenance/vault-sync-deployment.yaml b/services/maintenance/vault-sync-deployment.yaml new file mode 100644 index 0000000..edc0456 --- /dev/null +++ b/services/maintenance/vault-sync-deployment.yaml @@ -0,0 +1,34 @@ +# services/maintenance/vault-sync-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: maintenance-vault-sync + namespace: maintenance +spec: + replicas: 1 + selector: + matchLabels: + app: maintenance-vault-sync + template: + metadata: + labels: + app: maintenance-vault-sync + spec: + serviceAccountName: maintenance-vault-sync + containers: + - name: sync + image: alpine:3.20 + command: ["/bin/sh", "-c"] + args: + - "sleep infinity" + volumeMounts: + - name: vault-secrets + mountPath: /vault/secrets + readOnly: true + volumes: + - name: vault-secrets + csi: + driver: secrets-store.csi.k8s.io + readOnly: true + volumeAttributes: + secretProviderClass: maintenance-vault diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index af8a1c5..6f993d9 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))", + "expr": "(100 * (sum 
by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -89,7 +89,7 @@ }, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)", + "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) 
(kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -126,7 +126,7 @@ }, "targets": [ { - "expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})", + "expr": "label_replace(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{Hostname}}" } diff --git a/services/monitoring/dashboards/atlas-jobs.json b/services/monitoring/dashboards/atlas-jobs.json new file mode 100644 index 0000000..37b888d --- /dev/null +++ b/services/monitoring/dashboards/atlas-jobs.json @@ -0,0 +1,1253 @@ +{ + "uid": "atlas-jobs", + "title": "Atlas Jobs", + "folderUid": "atlas-internal", + "editable": true, + "panels": [ + { + "id": 1, + "type": "bargauge", + "title": "Ariadne Task Errors (range)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 2, + "type": "timeseries", + "title": "Ariadne Attempts / Failures", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 0 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", + "refId": "A", + "legendFormat": "Attempts" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", + "refId": "B", + "legendFormat": "Failures" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Attempts" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "green" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + 
} + } + }, + { + "id": 3, + "type": "bargauge", + "title": "One-off Job Pods (age hours)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 0 + }, + "targets": [ + { + "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", + "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + }, + { + "id": "limit", + "options": { + "limit": 12 + } + } + ] + }, + { + "id": 4, + "type": "stat", + "title": "Glue Jobs Stale (>36h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 7 + }, + "targets": [ + { + "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "stat", + "title": "Glue Jobs Missing Success", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 7 + }, + "targets": [ + { + "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 6, + "type": "stat", + "title": "Glue Jobs Suspended", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 7 + }, + "targets": [ + { + "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 7, + "type": "stat", + "title": "Ariadne Task Errors (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 8, + "type": "stat", + "title": "Ariadne Task Errors (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 9, + "type": "stat", + "title": "Ariadne Task Runs (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + 
"mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 10, + "type": "bargauge", + "title": "Ariadne Schedule Last Error (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 17 + }, + "targets": [ + { + "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 1 + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "green", + "value": 24 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 11, + "type": "bargauge", + "title": "Ariadne Schedule Last Success (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 17 + }, + "targets": [ + { + "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 12, + "type": "bargauge", + "title": "Glue Jobs Last Success (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 23 + }, + "targets": [ + { + "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", + "refId": "A", + "legendFormat": "{{namespace}}/{{cronjob}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": 
{ + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 13, + "type": "bargauge", + "title": "Glue Jobs Last Schedule (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 23 + }, + "targets": [ + { + "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", + "refId": "A", + "legendFormat": "{{namespace}}/{{cronjob}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 14, + "type": "bargauge", + "title": "Ariadne Task Errors (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 29 + }, + "targets": [ + { + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 15, + "type": "bargauge", + "title": "Ariadne Task Errors (30d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 29 + }, + "targets": [ + { + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 16, + "type": "bargauge", + 
"title": "Ariadne Access Requests", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 11 + }, + "targets": [ + { + "expr": "sort_desc(ariadne_access_requests_total)", + "refId": "A", + "legendFormat": "{{status}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 17, + "type": "stat", + "title": "Ariadne CI Coverage (%)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 8, + "y": 11 + }, + "targets": [ + { + "expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}", + "refId": "A", + "legendFormat": "{{branch}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 18, + "type": "table", + "title": "Ariadne CI Tests (latest)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 11 + }, + "targets": [ + { + "expr": "ariadne_ci_tests_total{repo=\"ariadne\"}", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + } + ], + "time": { + "from": "now-7d", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "jobs", + "glue" + ] +} diff --git a/services/monitoring/dashboards/atlas-nodes.json b/services/monitoring/dashboards/atlas-nodes.json index 2d60042..ea59579 100644 --- a/services/monitoring/dashboards/atlas-nodes.json +++ b/services/monitoring/dashboards/atlas-nodes.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})", + "expr": 
"sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], @@ -46,7 +46,7 @@ "unit": "none", "custom": { "displayMode": "auto", - "valueSuffix": "/19" + "valueSuffix": "/20" } }, "overrides": [] diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index c5f30d1..1f8635b 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -449,14 +449,14 @@ }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "min": 0, - "max": 19, + "max": 20, "thresholds": { "mode": "absolute", "steps": [ @@ -466,15 +466,15 @@ }, { "color": "orange", - "value": 17 - }, - { - "color": "yellow", "value": 18 }, { - "color": "green", + "color": "yellow", "value": 19 + }, + { + "color": "green", + "value": 20 } ] } @@ -795,8 +795,8 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 2, - "w": 5, + "h": 3, + "w": 4, "x": 0, "y": 8 }, @@ -862,9 +862,9 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 2, - "w": 5, - "x": 10, + "h": 3, + "w": 4, + "x": 8, "y": 8 }, "targets": [ @@ -967,9 +967,9 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 2, - "w": 5, - "x": 5, + "h": 3, + "w": 4, + "x": 4, "y": 8 }, "targets": [ @@ -1043,9 +1043,9 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 2, - "w": 5, - "x": 15, + "h": 3, + "w": 4, + "x": 12, "y": 8 }, "targets": [ @@ -1110,6 +1110,132 @@ } ] }, + { + "id": 34, + "type": "stat", + "title": "Postgres Connections Used", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 16, + "y": 8 + }, + "targets": [ + { + "expr": "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")", + "refId": "A", + "legendFormat": "{{conn}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "name_and_value" + } + }, + { + "id": 35, + "type": "stat", + "title": "Postgres Hottest Connections", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 20, + "y": 8 + }, + "targets": [ + { + "expr": "topk(1, sum by (datname) (pg_stat_activity_count))", + "refId": "A", + "legendFormat": 
"{{datname}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "name_and_value" + } + }, { "id": 23, "type": "stat", @@ -1119,10 +1245,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 3, "w": 6, "x": 0, - "y": 10 + "y": 11 }, "targets": [ { @@ -1194,10 +1320,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 3, "w": 6, "x": 6, - "y": 10 + "y": 11 }, "targets": [ { @@ -1269,10 +1395,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 3, "w": 6, "x": 12, - "y": 10 + "y": 11 }, "targets": [ { @@ -1336,10 +1462,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 3, "w": 6, "x": 18, - "y": 10 + "y": 11 }, "targets": [ { @@ -1394,6 +1520,302 @@ } ] }, + { + "id": 40, + "type": "bargauge", + "title": "One-off Job Pods (age hours)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 14 + }, + "targets": [ + { + "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", + "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + }, + { + "id": "limit", + "options": { + "limit": 8 + } + } + ] + }, + { + "id": 41, + "type": "timeseries", + "title": "Ariadne Attempts / Failures", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 14 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", + "refId": "A", + "legendFormat": "Attempts" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", + "refId": "B", + "legendFormat": "Failures" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Attempts" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "green" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } 
+ ] + } + ] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 42, + "type": "timeseries", + "title": "Ariadne Test Success Rate", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 14 + }, + "targets": [ + { + "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "max": 100 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 43, + "type": "bargauge", + "title": "Tests with Failures (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 14 + }, + "targets": [ + { + "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))", + "refId": "A", + "legendFormat": "{{result}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "error" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "yellow" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "failed" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, { "id": 11, "type": "piechart", @@ -1406,7 +1828,7 @@ "h": 9, "w": 8, "x": 0, - "y": 16 + "y": 20 }, "targets": [ { @@ -1475,11 +1897,11 @@ "h": 9, "w": 8, "x": 8, - "y": 16 + "y": 20 }, "targets": [ { - "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))", + "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) 
group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1544,7 +1966,7 @@ "h": 9, "w": 8, "x": 16, - "y": 16 + "y": 20 }, "targets": [ { @@ -1613,11 +2035,11 @@ "h": 12, "w": 12, "x": 0, - "y": 32 + "y": 36 }, "targets": [ { - "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", 
\"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1660,11 +2082,11 @@ "h": 12, "w": 12, "x": 12, - "y": 32 + "y": 36 }, "targets": [ { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1707,7 +2129,7 @@ "h": 10, "w": 12, "x": 0, - "y": 44 + "y": 48 }, "targets": [ { @@ -1744,7 +2166,7 @@ "h": 10, "w": 12, "x": 12, - "y": 44 + "y": 48 }, "targets": [ { @@ -1781,7 +2203,7 @@ "h": 10, "w": 12, "x": 0, - "y": 54 + "y": 58 }, "targets": [ { @@ -1832,11 +2254,11 @@ "h": 10, "w": 12, "x": 12, - "y": 54 + "y": 58 }, "targets": [ { - "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", + "expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -1913,7 +2335,7 @@ "h": 7, "w": 8, "x": 0, - "y": 25 + "y": 29 }, "targets": [ { @@ -1957,7 +2379,7 @@ "h": 7, "w": 8, "x": 8, - "y": 25 + "y": 29 }, "targets": [ { @@ -2001,7 +2423,7 @@ "h": 7, "w": 8, "x": 16, - "y": 25 + "y": 29 }, "targets": [ { @@ -2045,7 +2467,7 @@ "h": 16, "w": 12, "x": 0, - "y": 64 + "y": 68 }, "targets": [ { @@ -2093,11 +2515,11 @@ "h": 16, "w": 12, "x": 12, - "y": 64 + "y": 68 }, "targets": [ { - "expr": "topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))", "refId": "A", "legendFormat": "{{node}}" } diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json index adab84b..0c8104c 100644 --- a/services/monitoring/dashboards/atlas-pods.json +++ b/services/monitoring/dashboards/atlas-pods.json @@ -439,7 +439,7 @@ }, "targets": [ { - "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", + "expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))", "refId": "A", "legendFormat": "{{node}}", "instant": true 
@@ -520,7 +520,7 @@ }, "targets": [ { - "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 
0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)))))", + "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) 
(kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))", "refId": "A", "instant": true, "format": "table" diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-testing.json deleted file mode 100644 index 25cf3f8..0000000 --- a/services/monitoring/dashboards/atlas-testing.json +++ /dev/null @@ -1,339 +0,0 @@ -{ - "uid": "atlas-testing", - "title": "Atlas Testing", - "folderUid": "atlas-internal", - "editable": true, - "panels": [ - { - "id": 1, - "type": "stat", - "title": "Glue Jobs Stale (>36h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 0, - "y": 0 - }, - "targets": [ - { - "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 2 - }, - { - "color": "red", - "value": 3 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 2, - "type": "table", - "title": "Glue Jobs Missing Success", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 6, - "y": 0 - }, - "targets": [ - { - "expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - 
"showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 3, - "type": "table", - "title": "Glue Jobs Suspended", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 12, - "y": 0 - }, - "targets": [ - { - "expr": "(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 4, - "type": "table", - "title": "Glue Jobs Active Runs", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 0 - }, - "targets": [ - { - "expr": "(kube_cronjob_status_active and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 5, - "type": "table", - "title": "Glue Jobs Last Success (hours ago)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 4 - }, - "targets": [ - { - "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 6, - "type": "table", - "title": "Glue Jobs Last Schedule (hours ago)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 4 - }, - "targets": [ - { - "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - } - ], - "time": { - "from": "now-7d", - "to": "now" - }, - "annotations": { - "list": [] - }, - "schemaVersion": 39, - "style": "dark", - "tags": [ - "atlas", - "testing" - ] -} diff --git a/services/monitoring/dcgm-exporter.yaml 
b/services/monitoring/dcgm-exporter.yaml index 8760c9f..ff5aed5 100644 --- a/services/monitoring/dcgm-exporter.yaml +++ b/services/monitoring/dcgm-exporter.yaml @@ -50,6 +50,10 @@ spec: env: - name: DCGM_EXPORTER_KUBERNETES value: "true" + - name: KUBERNETES_VIRTUAL_GPUS + value: "true" + - name: NVIDIA_RESOURCE_NAMES + value: nvidia.com/gpu.shared securityContext: privileged: true resources: diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml index daa1e29..33ac739 100644 --- a/services/monitoring/grafana-alerting-config.yaml +++ b/services/monitoring/grafana-alerting-config.yaml @@ -145,7 +145,7 @@ data: model: intervalMs: 60000 maxDataPoints: 43200 - expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m] + expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\") legendFormat: '{{instance}}' datasource: type: prometheus @@ -175,11 +175,64 @@ data: type: last type: query noDataState: NoData - execErrState: Error + execErrState: OK annotations: - summary: "{{ $labels.instance }} CPU >90% for 10m" + summary: "{{ $labels.node }} CPU >90% for 10m" labels: severity: warning + - orgId: 1 + name: atlas-metrics + folder: Alerts + interval: 1m + rules: + - uid: victoria-metrics-down + title: "VictoriaMetrics unavailable (>30m)" + condition: C + for: "30m" + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: atlas-vm + model: + intervalMs: 60000 + maxDataPoints: 43200 + expr: sum(up{job="victoriametrics"}) + legendFormat: victoriametrics + datasource: + type: prometheus + uid: atlas-vm + - refId: B + datasourceUid: __expr__ + model: + expression: A + intervalMs: 60000 + maxDataPoints: 43200 + reducer: last + type: reduce + - refId: C + datasourceUid: __expr__ + model: + expression: B + intervalMs: 60000 + maxDataPoints: 43200 + type: threshold + conditions: + - evaluator: + params: [1] + type: lt + operator: + type: and + reducer: + type: last + type: query + noDataState: Alerting + execErrState: Alerting + annotations: + summary: "VictoriaMetrics is unavailable for >30m" + labels: + severity: critical - orgId: 1 name: maintenance folder: Alerts @@ -244,7 +297,7 @@ data: to: 0 datasourceUid: atlas-vm model: - expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) + expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) and on(cronjob) (kube_cronjob_spec_suspend{namespace="maintenance",cronjob="image-sweeper"} == 0) intervalMs: 60000 maxDataPoints: 43200 legendFormat: '{{cronjob}}' diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index d7950f2..3407963 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by 
(namespace)) or on() vector(0)) == 0))", + "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -98,7 +98,7 @@ data: }, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)", + "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) 
(kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -135,7 +135,7 @@ data: }, "targets": [ { - "expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})", + "expr": "label_replace(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{Hostname}}" } diff --git a/services/monitoring/grafana-dashboard-jobs.yaml b/services/monitoring/grafana-dashboard-jobs.yaml new file mode 100644 index 0000000..b16c9cb --- /dev/null +++ b/services/monitoring/grafana-dashboard-jobs.yaml @@ -0,0 +1,1262 @@ +# services/monitoring/grafana-dashboard-jobs.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-jobs + labels: + grafana_dashboard: "1" +data: + atlas-jobs.json: | + { + "uid": "atlas-jobs", + "title": "Atlas Jobs", + "folderUid": "atlas-internal", + "editable": true, + "panels": [ + { + "id": 1, + "type": "bargauge", + "title": "Ariadne Task Errors (range)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 2, + "type": "timeseries", + "title": "Ariadne Attempts / Failures", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 0 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", + "refId": "A", + "legendFormat": "Attempts" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", + "refId": "B", + "legendFormat": "Failures" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Attempts" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "green" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": 
"color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 3, + "type": "bargauge", + "title": "One-off Job Pods (age hours)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 0 + }, + "targets": [ + { + "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", + "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + }, + { + "id": "limit", + "options": { + "limit": 12 + } + } + ] + }, + { + "id": 4, + "type": "stat", + "title": "Glue Jobs Stale (>36h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 7 + }, + "targets": [ + { + "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "stat", + "title": "Glue Jobs Missing Success", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 7 + }, + "targets": [ + { + "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) 
(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 6, + "type": "stat", + "title": "Glue Jobs Suspended", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 7 + }, + "targets": [ + { + "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 7, + "type": "stat", + "title": "Ariadne Task Errors (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 8, + "type": "stat", + "title": "Ariadne Task Errors (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 9, + "type": "stat", + "title": "Ariadne Task Runs (1h)", + "datasource": { + "type": "prometheus", + "uid": 
"atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 7 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 10, + "type": "bargauge", + "title": "Ariadne Schedule Last Error (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 17 + }, + "targets": [ + { + "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 1 + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "green", + "value": 24 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 11, + "type": "bargauge", + "title": "Ariadne Schedule Last Success (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 17 + }, + "targets": [ + { + "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 12, + "type": "bargauge", + "title": "Glue Jobs Last Success (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 23 + }, + "targets": [ + { + "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", + "refId": "A", + "legendFormat": "{{namespace}}/{{cronjob}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + 
"color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 13, + "type": "bargauge", + "title": "Glue Jobs Last Schedule (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 23 + }, + "targets": [ + { + "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", + "refId": "A", + "legendFormat": "{{namespace}}/{{cronjob}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 14, + "type": "bargauge", + "title": "Ariadne Task Errors (1h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 29 + }, + "targets": [ + { + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 15, + "type": "bargauge", + "title": "Ariadne Task Errors (30d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 29 + }, + "targets": [ + { + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))", + "refId": "A", + "legendFormat": "{{task}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + 
"calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 16, + "type": "bargauge", + "title": "Ariadne Access Requests", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 11 + }, + "targets": [ + { + "expr": "sort_desc(ariadne_access_requests_total)", + "refId": "A", + "legendFormat": "{{status}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 17, + "type": "stat", + "title": "Ariadne CI Coverage (%)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 8, + "y": 11 + }, + "targets": [ + { + "expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}", + "refId": "A", + "legendFormat": "{{branch}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 18, + "type": "table", + "title": "Ariadne CI Tests (latest)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 11 + }, + "targets": [ + { + "expr": "ariadne_ci_tests_total{repo=\"ariadne\"}", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + } + ], + "time": { + "from": "now-7d", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "jobs", + "glue" + ] + } diff --git a/services/monitoring/grafana-dashboard-nodes.yaml b/services/monitoring/grafana-dashboard-nodes.yaml index f0f1982..98123b9 100644 --- a/services/monitoring/grafana-dashboard-nodes.yaml +++ b/services/monitoring/grafana-dashboard-nodes.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})", + 
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], @@ -55,7 +55,7 @@ data: "unit": "none", "custom": { "displayMode": "auto", - "valueSuffix": "/19" + "valueSuffix": "/20" } }, "overrides": [] diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 8ad7523..fdfe1a7 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -458,14 +458,14 @@ data: }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "min": 0, - "max": 19, + "max": 20, "thresholds": { "mode": "absolute", "steps": [ @@ -475,15 +475,15 @@ data: }, { "color": "orange", - "value": 17 - }, - { - "color": "yellow", "value": 18 }, { - "color": "green", + "color": "yellow", "value": 19 + }, + { + "color": "green", + "value": 20 } ] } @@ -804,8 +804,8 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 2, - "w": 5, + "h": 3, + "w": 4, "x": 0, "y": 8 }, @@ -871,9 +871,9 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 2, - "w": 5, - "x": 10, + "h": 3, + "w": 4, + "x": 8, "y": 8 }, "targets": [ @@ -976,9 +976,9 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 2, - "w": 5, - "x": 5, + "h": 3, + "w": 4, + "x": 4, "y": 8 }, "targets": [ @@ -1052,9 +1052,9 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 2, - "w": 5, - "x": 15, + "h": 3, + "w": 4, + "x": 12, "y": 8 }, "targets": [ @@ -1119,6 +1119,132 @@ data: } ] }, + { + "id": 34, + "type": "stat", + "title": "Postgres Connections Used", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 16, + "y": 8 + }, + "targets": [ + { + "expr": "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")", + "refId": "A", + "legendFormat": "{{conn}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "name_and_value" + } + }, + { + "id": 35, + "type": "stat", + "title": "Postgres Hottest Connections", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 20, + "y": 8 + }, + "targets": [ + { + "expr": "topk(1, sum by (datname) 
(pg_stat_activity_count))", + "refId": "A", + "legendFormat": "{{datname}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "name_and_value" + } + }, { "id": 23, "type": "stat", @@ -1128,10 +1254,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 3, "w": 6, "x": 0, - "y": 10 + "y": 11 }, "targets": [ { @@ -1203,10 +1329,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 3, "w": 6, "x": 6, - "y": 10 + "y": 11 }, "targets": [ { @@ -1278,10 +1404,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 3, "w": 6, "x": 12, - "y": 10 + "y": 11 }, "targets": [ { @@ -1345,10 +1471,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 3, "w": 6, "x": 18, - "y": 10 + "y": 11 }, "targets": [ { @@ -1403,6 +1529,302 @@ data: } ] }, + { + "id": 40, + "type": "bargauge", + "title": "One-off Job Pods (age hours)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 14 + }, + "targets": [ + { + "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", + "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + }, + { + "id": "limit", + "options": { + "limit": 8 + } + } + ] + }, + { + "id": 41, + "type": "timeseries", + "title": "Ariadne Attempts / Failures", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 14 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", + "refId": "A", + "legendFormat": "Attempts" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", + "refId": "B", + "legendFormat": "Failures" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Attempts" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "green" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + 
"properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 42, + "type": "timeseries", + "title": "Ariadne Test Success Rate", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 14 + }, + "targets": [ + { + "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "max": 100 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 43, + "type": "bargauge", + "title": "Tests with Failures (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 14 + }, + "targets": [ + { + "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))", + "refId": "A", + "legendFormat": "{{result}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 5 + }, + { + "color": "red", + "value": 10 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "error" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "yellow" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "failed" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, { "id": 11, "type": "piechart", @@ -1415,7 +1837,7 @@ data: "h": 9, "w": 8, "x": 0, - "y": 16 + "y": 20 }, "targets": [ { @@ -1484,11 +1906,11 @@ data: "h": 9, "w": 8, "x": 8, - "y": 16 + "y": 20 }, "targets": [ { - "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))", + "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() 
(kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1553,7 +1975,7 @@ data: "h": 9, "w": 8, "x": 16, - "y": 16 + "y": 20 }, "targets": [ { @@ -1622,11 +2044,11 @@ data: "h": 12, "w": 12, "x": 0, - "y": 32 + "y": 36 }, "targets": [ { - "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() 
label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1669,11 +2091,11 @@ data: "h": 12, "w": 12, "x": 12, - "y": 32 + "y": 36 }, "targets": [ { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1716,7 +2138,7 @@ data: "h": 10, "w": 12, "x": 0, - "y": 44 + "y": 48 }, "targets": [ { @@ -1753,7 +2175,7 @@ data: "h": 10, "w": 12, "x": 12, - "y": 44 + "y": 48 }, "targets": [ { @@ -1790,7 +2212,7 @@ data: "h": 10, "w": 12, "x": 0, - "y": 54 + "y": 58 }, "targets": [ { @@ -1841,11 +2263,11 @@ data: "h": 10, "w": 12, "x": 12, - "y": 54 + "y": 58 }, "targets": [ { - "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", + "expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -1922,7 +2344,7 @@ data: "h": 7, "w": 8, "x": 0, - "y": 25 + "y": 29 }, "targets": [ { @@ -1966,7 +2388,7 @@ data: "h": 7, "w": 8, "x": 8, - "y": 25 + "y": 29 }, "targets": [ { @@ -2010,7 +2432,7 @@ data: "h": 7, "w": 8, "x": 16, - "y": 25 + "y": 29 }, "targets": [ { @@ -2054,7 +2476,7 @@ data: "h": 16, "w": 12, "x": 0, - "y": 64 + "y": 68 }, "targets": [ { @@ -2102,11 +2524,11 @@ data: "h": 16, "w": 12, "x": 12, - "y": 64 + "y": 68 }, "targets": [ { - "expr": "topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))", "refId": "A", "legendFormat": "{{node}}" } diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml index f537d4c..1461eac 100644 --- a/services/monitoring/grafana-dashboard-pods.yaml +++ 
b/services/monitoring/grafana-dashboard-pods.yaml @@ -448,7 +448,7 @@ data: }, "targets": [ { - "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", + "expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -529,7 +529,7 @@ data: }, "targets": [ { - "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by 
(node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)))))", + "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 
+ 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))", "refId": "A", "instant": true, "format": "table" diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-testing.yaml deleted file mode 100644 index 80a7043..0000000 --- a/services/monitoring/grafana-dashboard-testing.yaml +++ /dev/null @@ -1,348 +0,0 @@ -# services/monitoring/grafana-dashboard-testing.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - name: grafana-dashboard-testing - labels: - grafana_dashboard: "1" -data: - atlas-testing.json: | - { - "uid": "atlas-testing", - "title": "Atlas Testing", - "folderUid": "atlas-internal", - "editable": true, - "panels": [ - { - "id": 1, - "type": "stat", - "title": "Glue Jobs Stale (>36h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 0, - "y": 0 - }, - "targets": [ - { - "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 2 - }, - { - "color": "red", - "value": 3 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 2, - "type": "table", - "title": "Glue Jobs Missing Success", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 6, - 
"y": 0 - }, - "targets": [ - { - "expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 3, - "type": "table", - "title": "Glue Jobs Suspended", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 12, - "y": 0 - }, - "targets": [ - { - "expr": "(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 4, - "type": "table", - "title": "Glue Jobs Active Runs", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 0 - }, - "targets": [ - { - "expr": "(kube_cronjob_status_active and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 5, - "type": "table", - "title": "Glue Jobs Last Success (hours ago)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 4 - }, - "targets": [ - { - "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 6, - "type": "table", - "title": "Glue Jobs Last Schedule (hours ago)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 4 - }, - "targets": [ - { - "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "custom": { - "filterable": true - } - }, - "overrides": 
[] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - } - ], - "time": { - "from": "now-7d", - "to": "now" - }, - "annotations": { - "list": [] - }, - "schemaVersion": 39, - "style": "dark", - "tags": [ - "atlas", - "testing" - ] - } diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 304de05..6651738 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -286,6 +286,7 @@ spec: podAnnotations: vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "monitoring" + monitoring.bstein.dev/restart-rev: "1" vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin" vault.hashicorp.com/agent-inject-template-grafana-env.sh: | {{ with secret "kv/data/atlas/monitoring/grafana-admin" }} @@ -339,10 +340,10 @@ spec: GF_AUTH_ANONYMOUS_ORG_NAME: "Overview" GF_AUTH_ANONYMOUS_ORG_ROLE: "Viewer" GF_SMTP_ENABLED: "true" - GF_SMTP_HOST: "mail.bstein.dev:587" - GF_SMTP_FROM: "no-reply-grafana@bstein.dev" + GF_SMTP_HOST: "smtp.postmarkapp.com:587" + GF_SMTP_FROM_ADDRESS: "no-reply-grafana@bstein.dev" GF_SMTP_FROM_NAME: "Atlas Grafana" - GRAFANA_ALERT_EMAILS: "alerts@bstein.dev" + GRAFANA_ALERT_EMAILS: "brad@bstein.dev" GF_SECURITY_ALLOW_EMBEDDING: "true" GF_AUTH_GENERIC_OAUTH_ENABLED: "true" GF_AUTH_GENERIC_OAUTH_NAME: "Keycloak" @@ -354,6 +355,8 @@ spec: GF_AUTH_GENERIC_OAUTH_ROLE_ATTRIBUTE_PATH: "contains(groups, 'admin') && 'Admin' || 'Viewer'" GF_AUTH_GENERIC_OAUTH_USE_PKCE: "true" GF_AUTH_GENERIC_OAUTH_TLS_SKIP_VERIFY_INSECURE: "false" + GF_AUTH_GENERIC_OAUTH_ALLOW_INSECURE_EMAIL_LOOKUP: "true" + GF_AUTH_GENERIC_OAUTH_EMAIL_ATTRIBUTE_PATH: "email" GF_AUTH_SIGNOUT_REDIRECT_URL: "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/logout?redirect_uri=https://metrics.bstein.dev/" grafana.ini: server: @@ -469,14 +472,14 @@ spec: editable: true options: path: /var/lib/grafana/dashboards/mail - - name: testing + - name: jobs orgId: 1 folder: Atlas Internal type: file disableDeletion: false editable: true options: - path: /var/lib/grafana/dashboards/testing + path: /var/lib/grafana/dashboards/jobs dashboardsConfigMaps: overview: grafana-dashboard-overview overview-public: grafana-dashboard-overview @@ -486,7 +489,7 @@ spec: gpu: grafana-dashboard-gpu network: grafana-dashboard-network mail: grafana-dashboard-mail - testing: grafana-dashboard-testing + jobs: grafana-dashboard-jobs extraConfigmapMounts: - name: grafana-folders mountPath: /etc/grafana/provisioning/folders diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml index 8788b20..ba25c9f 100644 --- a/services/monitoring/jetson-tegrastats-exporter.yaml +++ b/services/monitoring/jetson-tegrastats-exporter.yaml @@ -17,6 +17,7 @@ spec: annotations: prometheus.io/scrape: "true" prometheus.io/port: "9100" + monitoring.bstein.dev/restart-rev: "7" spec: serviceAccountName: default hostPID: true @@ -44,6 +45,10 @@ spec: env: - name: JETSON_EXPORTER_PORT value: "9100" + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName volumeMounts: - name: script mountPath: /etc/tegrastats-exporter diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index 7d0b01b..23c1595 100644 --- a/services/monitoring/kustomization.yaml +++ 
b/services/monitoring/kustomization.yaml @@ -14,7 +14,7 @@ resources: - grafana-dashboard-network.yaml - grafana-dashboard-gpu.yaml - grafana-dashboard-mail.yaml - - grafana-dashboard-testing.yaml + - grafana-dashboard-jobs.yaml - dcgm-exporter.yaml - jetson-tegrastats-exporter.yaml - postmark-exporter-service.yaml @@ -23,7 +23,8 @@ resources: - grafana-alerting-config.yaml - grafana-folders.yaml - helmrelease.yaml - - grafana-org-bootstrap.yaml + - oneoffs/grafana-org-bootstrap.yaml + - oneoffs/grafana-user-dedupe-job.yaml configMapGenerator: - name: postmark-exporter-script diff --git a/services/monitoring/grafana-org-bootstrap.yaml b/services/monitoring/oneoffs/grafana-org-bootstrap.yaml similarity index 93% rename from services/monitoring/grafana-org-bootstrap.yaml rename to services/monitoring/oneoffs/grafana-org-bootstrap.yaml index f1d4075..6f824cc 100644 --- a/services/monitoring/grafana-org-bootstrap.yaml +++ b/services/monitoring/oneoffs/grafana-org-bootstrap.yaml @@ -1,10 +1,15 @@ -# services/monitoring/grafana-org-bootstrap.yaml +# services/monitoring/oneoffs/grafana-org-bootstrap.yaml +# One-off job for monitoring/grafana-org-bootstrap-3. +# Purpose: grafana org bootstrap 3 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. apiVersion: batch/v1 kind: Job metadata: name: grafana-org-bootstrap-3 namespace: monitoring spec: + suspend: true backoffLimit: 2 template: metadata: diff --git a/services/monitoring/oneoffs/grafana-user-dedupe-job.yaml b/services/monitoring/oneoffs/grafana-user-dedupe-job.yaml new file mode 100644 index 0000000..8194f18 --- /dev/null +++ b/services/monitoring/oneoffs/grafana-user-dedupe-job.yaml @@ -0,0 +1,148 @@ +# services/monitoring/oneoffs/grafana-user-dedupe-job.yaml +# One-off job for monitoring/grafana-user-dedupe-api-v7. +# Purpose: grafana user dedupe api v7 (see container args/env in this file). +# Run by setting spec.suspend to false, reconcile, then set it back to true. +# Safe to delete the finished Job/pod; it should not run continuously. +apiVersion: batch/v1 +kind: Job +metadata: + name: grafana-user-dedupe-api-v7 + namespace: monitoring +spec: + suspend: true + backoffLimit: 1 + template: + metadata: + annotations: + vault.hashicorp.com/agent-inject: "true" + vault.hashicorp.com/agent-pre-populate-only: "true" + vault.hashicorp.com/role: "monitoring" + vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin" + vault.hashicorp.com/agent-inject-template-grafana-env.sh: | + {{ with secret "kv/data/atlas/monitoring/grafana-admin" }} + export GRAFANA_USER="{{ index .Data.data "admin-user" }}" + export GRAFANA_PASSWORD="{{ index .Data.data "admin-password" }}" + {{ end }} + spec: + serviceAccountName: monitoring-vault-sync + automountServiceAccountToken: true + restartPolicy: Never + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/worker + operator: Exists + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: kubernetes.io/arch + operator: In + values: ["arm64"] + containers: + - name: dedupe + image: python:3.12-slim + command: + - /bin/sh + - -c + args: + - | + set -euo pipefail + for _ in $(seq 1 30); do + if [ -f /vault/secrets/grafana-env.sh ]; then + break + fi + sleep 1 + done + if [ ! 
-f /vault/secrets/grafana-env.sh ]; then + echo "Vault secret not available" + exit 1 + fi + . /vault/secrets/grafana-env.sh + grafana_url="${GRAFANA_URL}" + if [ -z "${grafana_url}" ]; then + echo "GRAFANA_URL is required" + exit 1 + fi + if [ -z "${GRAFANA_USER}" ] || [ -z "${GRAFANA_PASSWORD}" ]; then + echo "Grafana admin credentials missing" + exit 1 + fi + if [ -z "${GRAFANA_DEDUPE_EMAILS}" ]; then + echo "GRAFANA_DEDUPE_EMAILS is required" + exit 1 + fi + python - <<'PY' + import base64 + import json + import os + import urllib.parse + import urllib.error + import urllib.request + + grafana_url = os.environ["GRAFANA_URL"].rstrip("/") + user = os.environ["GRAFANA_USER"] + password = os.environ["GRAFANA_PASSWORD"] + lookups = [e.strip() for e in os.environ["GRAFANA_DEDUPE_EMAILS"].split(",") if e.strip()] + + token = base64.b64encode(f"{user}:{password}".encode("utf-8")).decode("utf-8") + headers = {"Authorization": f"Basic {token}"} + + def request(method: str, url: str): + req = urllib.request.Request(url, headers=headers, method=method) + try: + with urllib.request.urlopen(req, timeout=10) as resp: + return resp.status, resp.read() + except urllib.error.HTTPError as err: + body = err.read() + return err.code, body + + for _ in range(60): + status, _ = request("GET", f"{grafana_url}/api/health") + if status == 200: + break + else: + raise SystemExit("Grafana API did not become ready in time") + + for lookup in lookups: + search_url = f"{grafana_url}/api/users/search?query={urllib.parse.quote(lookup)}" + status, body = request("GET", search_url) + if status != 200: + print(f"search failed for {lookup}: status={status} body={body.decode('utf-8', errors='ignore')}") + continue + payload = json.loads(body) + users = payload.get("users", []) + matches = [ + user + for user in users + if user.get("email", "").lower() == lookup.lower() + or user.get("login", "").lower() == lookup.lower() + ] + if not matches: + print(f"no grafana user found for {lookup}") + continue + for user in matches: + user_id = user.get("id") + if not user_id: + continue + print(f"deleting grafana user {user_id} ({user.get('email')})") + delete_url = f"{grafana_url}/api/admin/users/{user_id}" + del_status, del_body = request("DELETE", delete_url) + if del_status not in (200, 202, 204): + print( + "delete failed for", + user_id, + "status", + del_status, + "body", + del_body.decode("utf-8", errors="ignore"), + ) + PY + echo "done" + env: + - name: GRAFANA_URL + value: http://grafana.monitoring.svc.cluster.local + - name: GRAFANA_DEDUPE_EMAILS + value: brad.stein@gmail.com,brad@bstein.dev diff --git a/services/monitoring/postmark-exporter-deployment.yaml b/services/monitoring/postmark-exporter-deployment.yaml index 6406224..98791d9 100644 --- a/services/monitoring/postmark-exporter-deployment.yaml +++ b/services/monitoring/postmark-exporter-deployment.yaml @@ -18,9 +18,9 @@ spec: prometheus.io/path: "/metrics" vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "monitoring" - vault.hashicorp.com/agent-inject-secret-postmark-env: "kv/data/atlas/monitoring/postmark-exporter" + vault.hashicorp.com/agent-inject-secret-postmark-env: "kv/data/atlas/shared/postmark-relay" vault.hashicorp.com/agent-inject-template-postmark-env: | - {{- with secret "kv/data/atlas/monitoring/postmark-exporter" -}} + {{- with secret "kv/data/atlas/shared/postmark-relay" -}} export POSTMARK_SERVER_TOKEN="{{ index .Data.data "apikey" }}" export POSTMARK_SERVER_TOKEN_FALLBACK="{{ index .Data.data "apikey" }}" {{- if index .Data.data 
"sending-limit" }} diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py index cd557e7..8b36111 100644 --- a/services/monitoring/scripts/jetson_tegrastats_exporter.py +++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py @@ -3,53 +3,59 @@ import os import re import socketserver import subprocess -import threading from time import time PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100")) -METRICS = { +NODE_NAME = os.environ.get("NODE_NAME") or os.uname().nodename +BASE_METRICS = { "gr3d_freq_percent": 0.0, "gpu_temp_c": 0.0, "cpu_temp_c": 0.0, "ram_used_mb": 0.0, "ram_total_mb": 0.0, "power_5v_in_mw": 0.0, + "log_line_len": 0.0, "last_scrape_ts": 0.0, } -LOCK = threading.Lock() -def parse_line(line: str): +def parse_line(line: str) -> dict: + line = line.strip() updates = {} - m = re.search(r"GR3D_FREQ\\s+(\\d+)%", line) + m = re.search(r"GR3D_FREQ\s+(\d+)%", line) if m: updates["gr3d_freq_percent"] = float(m.group(1)) - m = re.search(r"GPU@(\\d+(?:\\.\\d+)?)C", line) + m = re.search(r"GPU@(\d+(?:\.\d+)?)C", line) if m: updates["gpu_temp_c"] = float(m.group(1)) - m = re.search(r"CPU@(\\d+(?:\\.\\d+)?)C", line) + m = re.search(r"CPU@(\d+(?:\.\d+)?)C", line) if m: updates["cpu_temp_c"] = float(m.group(1)) - m = re.search(r"RAM\\s+(\\d+)/(\\d+)MB", line) + m = re.search(r"RAM\s+(\d+)/(\d+)MB", line) if m: updates["ram_used_mb"] = float(m.group(1)) updates["ram_total_mb"] = float(m.group(2)) - m = re.search(r"POM_5V_IN\\s+(\\d+)/(\\d+)", line) + m = re.search(r"(?:POM_5V_IN|VDD_IN)\s+(\d+)(?:mW)?/(\d+)(?:mW)?", line) if m: updates["power_5v_in_mw"] = float(m.group(1)) - with LOCK: - METRICS.update(updates) - METRICS["last_scrape_ts"] = time() + return updates -def run_tegrastats(): - proc = subprocess.Popen( - ["/host/usr/bin/tegrastats", "--interval", "1000"], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - bufsize=1, - ) - for line in proc.stdout: - parse_line(line) +def read_latest_line() -> str: + try: + proc = subprocess.Popen( + ["/host/usr/bin/tegrastats", "--interval", "1000"], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + line = proc.stdout.readline() + proc.terminate() + try: + proc.wait(timeout=1) + except subprocess.TimeoutExpired: + proc.kill() + return line + except OSError: + return "" class Handler(http.server.BaseHTTPRequestHandler): def do_GET(self): @@ -57,13 +63,18 @@ class Handler(http.server.BaseHTTPRequestHandler): self.send_response(404) self.end_headers() return - with LOCK: - metrics = METRICS.copy() + metrics = BASE_METRICS.copy() + line = read_latest_line() + if line: + metrics.update(parse_line(line)) + metrics["log_line_len"] = float(len(line)) + metrics["last_scrape_ts"] = time() out = [] + label = f'{{node="{NODE_NAME}"}}' for k, v in metrics.items(): out.append(f"# TYPE jetson_{k} gauge") - out.append(f"jetson_{k} {v}") - body = "\\n".join(out) + "\\n" + out.append(f"jetson_{k}{label} {v}") + body = "\n".join(out) + "\n" self.send_response(200) self.send_header("Content-Type", "text/plain; version=0.0.4") self.send_header("Content-Length", str(len(body))) @@ -74,7 +85,5 @@ class Handler(http.server.BaseHTTPRequestHandler): return if __name__ == "__main__": - t = threading.Thread(target=run_tegrastats, daemon=True) - t.start() with socketserver.TCPServer(("", PORT), Handler) as httpd: httpd.serve_forever() diff --git a/services/monitoring/secretproviderclass.yaml b/services/monitoring/secretproviderclass.yaml index 
8a6c5fb..350d6aa 100644 --- a/services/monitoring/secretproviderclass.yaml +++ b/services/monitoring/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "monitoring" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/monitoring" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git a/services/nextcloud-mail-sync/cronjob.yaml b/services/nextcloud-mail-sync/cronjob.yaml index 2073d76..6913b60 100644 --- a/services/nextcloud-mail-sync/cronjob.yaml +++ b/services/nextcloud-mail-sync/cronjob.yaml @@ -8,6 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "0 5 * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 3 failedJobsHistoryLimit: 1 diff --git a/services/nextcloud-mail-sync/portal-rbac.yaml b/services/nextcloud-mail-sync/portal-rbac.yaml index dc9a4e4..009b2e0 100644 --- a/services/nextcloud-mail-sync/portal-rbac.yaml +++ b/services/nextcloud-mail-sync/portal-rbac.yaml @@ -27,3 +27,16 @@ subjects: - kind: ServiceAccount name: bstein-dev-home namespace: bstein-dev-home +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: ariadne-nextcloud-mail-sync +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: bstein-dev-home-nextcloud-mail-sync +subjects: + - kind: ServiceAccount + name: ariadne + namespace: maintenance diff --git a/services/nextcloud/cronjob.yaml b/services/nextcloud/cronjob.yaml index cc0091b..58d8aa1 100644 --- a/services/nextcloud/cronjob.yaml +++ b/services/nextcloud/cronjob.yaml @@ -6,6 +6,7 @@ metadata: namespace: nextcloud spec: schedule: "*/5 * * * *" + suspend: true concurrencyPolicy: Forbid jobTemplate: spec: diff --git a/services/nextcloud/maintenance-cronjob.yaml b/services/nextcloud/maintenance-cronjob.yaml index d4008c7..177cc02 100644 --- a/services/nextcloud/maintenance-cronjob.yaml +++ b/services/nextcloud/maintenance-cronjob.yaml @@ -6,6 +6,7 @@ metadata: namespace: nextcloud spec: schedule: "30 4 * * *" + suspend: true concurrencyPolicy: Forbid jobTemplate: spec: diff --git a/services/pegasus/deployment.yaml b/services/pegasus/deployment.yaml index bc3db70..b6a1639 100644 --- a/services/pegasus/deployment.yaml +++ b/services/pegasus/deployment.yaml @@ -72,7 +72,7 @@ spec: containers: - name: pegasus - image: registry.bstein.dev/streaming/pegasus-vault:1.2.32 # {"$imagepolicy": "jellyfin:pegasus"} + image: registry.bstein.dev/streaming/pegasus-vault:1.2.32 # {"$imagepolicy": "jellyfin:pegasus:tag"} imagePullPolicy: Always env: - name: PEGASUS_MEDIA_ROOT diff --git a/services/pegasus/kustomization.yaml b/services/pegasus/kustomization.yaml index bef2b40..05c3baa 100644 --- a/services/pegasus/kustomization.yaml +++ b/services/pegasus/kustomization.yaml @@ -3,6 +3,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - configmap.yaml + - image.yaml - vault-serviceaccount.yaml - secretproviderclass.yaml - service.yaml diff --git a/services/pegasus/secretproviderclass.yaml b/services/pegasus/secretproviderclass.yaml index b4621a5..b8d1df9 100644 --- a/services/pegasus/secretproviderclass.yaml +++ b/services/pegasus/secretproviderclass.yaml @@ -11,7 +11,7 @@ spec: roleName: "pegasus" objects: | - objectName: "harbor-pull__dockerconfigjson" - secretPath: "kv/data/atlas/harbor-pull/jellyfin" + secretPath: "kv/data/atlas/shared/harbor-pull" secretKey: "dockerconfigjson" secretObjects: - secretName: harbor-regcred diff --git 
a/services/vault/k8s-auth-config-cronjob.yaml b/services/vault/k8s-auth-config-cronjob.yaml index 29e8e80..5a2d682 100644 --- a/services/vault/k8s-auth-config-cronjob.yaml +++ b/services/vault/k8s-auth-config-cronjob.yaml @@ -8,6 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "*/15 * * * *" + suspend: false concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 @@ -33,6 +34,11 @@ spec: value: http://10.43.57.249:8200 - name: VAULT_K8S_ROLE value: vault-admin + - name: VAULT_TOKEN + valueFrom: + secretKeyRef: + name: vault-init + key: root_token - name: VAULT_K8S_TOKEN_REVIEWER_JWT_FILE value: /var/run/secrets/vault-token-reviewer/token - name: VAULT_K8S_ROLE_TTL diff --git a/services/vault/oidc-config-cronjob.yaml b/services/vault/oidc-config-cronjob.yaml index 013c9f3..4d317c5 100644 --- a/services/vault/oidc-config-cronjob.yaml +++ b/services/vault/oidc-config-cronjob.yaml @@ -8,6 +8,7 @@ metadata: atlas.bstein.dev/glue: "true" spec: schedule: "*/15 * * * *" + suspend: true concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 3 diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh index 202879f..0212180 100644 --- a/services/vault/scripts/vault_k8s_auth_configure.sh +++ b/services/vault/scripts/vault_k8s_auth_configure.sh @@ -193,8 +193,8 @@ path "kv/data/atlas/shared/*" { write_raw_policy "dev-kv" "${dev_kv_policy}" log "writing role vault-admin" vault_cmd write "auth/kubernetes/role/vault-admin" \ - bound_service_account_names="vault-admin" \ - bound_service_account_namespaces="vault" \ + bound_service_account_names="vault-admin,ariadne" \ + bound_service_account_namespaces="vault,maintenance" \ policies="vault-admin" \ ttl="${role_ttl}" @@ -203,40 +203,42 @@ write_policy_and_role "outline" "outline" "outline-vault" \ write_policy_and_role "planka" "planka" "planka-vault" \ "planka/* shared/postmark-relay" "" write_policy_and_role "bstein-dev-home" "bstein-dev-home" "bstein-dev-home,bstein-dev-home-vault-sync" \ - "portal/* shared/chat-ai-keys-runtime shared/portal-e2e-client shared/postmark-relay mailu/mailu-initial-account-secret harbor-pull/bstein-dev-home" "" + "portal/* shared/chat-ai-keys-runtime shared/portal-e2e-client shared/postmark-relay mailu/mailu-initial-account-secret shared/harbor-pull" "" write_policy_and_role "gitea" "gitea" "gitea-vault" \ "gitea/*" "" write_policy_and_role "vaultwarden" "vaultwarden" "vaultwarden-vault" \ "vaultwarden/* mailu/mailu-initial-account-secret" "" write_policy_and_role "sso" "sso" "sso-vault,sso-vault-sync,mas-secrets-ensure" \ - "sso/* portal/bstein-dev-home-keycloak-admin shared/keycloak-admin shared/portal-e2e-client shared/postmark-relay harbor-pull/sso" "" + "sso/* portal/bstein-dev-home-keycloak-admin shared/keycloak-admin shared/portal-e2e-client shared/postmark-relay shared/harbor-pull" "" write_policy_and_role "mailu-mailserver" "mailu-mailserver" "mailu-vault-sync" \ - "mailu/* shared/postmark-relay harbor-pull/mailu-mailserver" "" + "mailu/* shared/postmark-relay shared/harbor-pull" "" write_policy_and_role "harbor" "harbor" "harbor-vault-sync" \ - "harbor/* harbor-pull/harbor" "" + "harbor/* shared/harbor-pull" "" write_policy_and_role "nextcloud" "nextcloud" "nextcloud-vault" \ "nextcloud/* shared/keycloak-admin shared/postmark-relay" "" write_policy_and_role "comms" "comms" "comms-vault,atlasbot" \ - "comms/* shared/chat-ai-keys-runtime harbor-pull/comms" "" -write_policy_and_role "jenkins" 
"jenkins" "jenkins" \ - "jenkins/*" "" + "comms/* shared/chat-ai-keys-runtime shared/harbor-pull" "" +write_policy_and_role "jenkins" "jenkins" "jenkins,jenkins-vault-sync" \ + "jenkins/* shared/harbor-pull" "" write_policy_and_role "monitoring" "monitoring" "monitoring-vault-sync" \ - "monitoring/* shared/postmark-relay harbor-pull/monitoring" "" + "monitoring/* shared/postmark-relay shared/harbor-pull" "" write_policy_and_role "logging" "logging" "logging-vault-sync" \ - "logging/* harbor-pull/logging" "" + "logging/* shared/harbor-pull" "" write_policy_and_role "pegasus" "jellyfin" "pegasus-vault-sync" \ - "pegasus/* harbor-pull/jellyfin" "" + "pegasus/* shared/harbor-pull" "" write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \ - "crypto/* harbor-pull/crypto" "" + "crypto/* shared/harbor-pull" "" write_policy_and_role "health" "health" "health-vault-sync" \ "health/*" "" +write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \ + "maintenance/ariadne-db portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin vault/vault-oidc-config shared/harbor-pull" "" write_policy_and_role "finance" "finance" "finance-vault" \ "finance/* shared/postmark-relay" "" write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \ "" \ "finance/*" write_policy_and_role "longhorn" "longhorn-system" "longhorn-vault,longhorn-vault-sync" \ - "longhorn/* harbor-pull/longhorn" "" + "longhorn/* shared/harbor-pull" "" write_policy_and_role "postgres" "postgres" "postgres-vault" \ "postgres/postgres-db" "" write_policy_and_role "vault" "vault" "vault" \ @@ -251,4 +253,4 @@ write_policy_and_role "crypto-secrets" "crypto" "crypto-secrets-ensure" \ write_policy_and_role "comms-secrets" "comms" \ "comms-secrets-ensure,mas-db-ensure,mas-admin-client-secret-writer,othrys-synapse-signingkey-job" \ "" \ - "comms/turn-shared-secret comms/livekit-api comms/synapse-redis comms/synapse-macaroon comms/atlasbot-credentials-runtime comms/synapse-db comms/mas-db comms/mas-admin-client-runtime comms/mas-secrets-runtime comms/othrys-synapse-signingkey" + "comms/turn-shared-secret comms/livekit-api comms/synapse-redis comms/synapse-macaroon comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin comms/synapse-registration comms/mas-db comms/mas-admin-client-runtime comms/mas-secrets-runtime comms/othrys-synapse-signingkey" diff --git a/services/vaultwarden/ariadne-rbac.yaml b/services/vaultwarden/ariadne-rbac.yaml new file mode 100644 index 0000000..ee903ca --- /dev/null +++ b/services/vaultwarden/ariadne-rbac.yaml @@ -0,0 +1,28 @@ +# services/vaultwarden/ariadne-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: ariadne-vaultwarden-admin-reader + namespace: vaultwarden +rules: + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get"] + resourceNames: ["vaultwarden-admin"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: ariadne-vaultwarden-admin-reader + namespace: vaultwarden +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: ariadne-vaultwarden-admin-reader +subjects: + - kind: ServiceAccount + name: ariadne + namespace: maintenance diff --git 
a/services/vaultwarden/deployment.yaml b/services/vaultwarden/deployment.yaml index 2893a92..e1d888a 100644 --- a/services/vaultwarden/deployment.yaml +++ b/services/vaultwarden/deployment.yaml @@ -39,7 +39,7 @@ spec: node-role.kubernetes.io/worker: "true" containers: - name: vaultwarden - image: vaultwarden/server:1.33.2 + image: vaultwarden/server:1.35.2 command: ["/bin/sh", "-c"] args: - >- diff --git a/services/vaultwarden/kustomization.yaml b/services/vaultwarden/kustomization.yaml index c53cb1c..ca5ef26 100644 --- a/services/vaultwarden/kustomization.yaml +++ b/services/vaultwarden/kustomization.yaml @@ -5,6 +5,7 @@ namespace: vaultwarden resources: - namespace.yaml - serviceaccount.yaml + - ariadne-rbac.yaml - pvc.yaml - deployment.yaml - service.yaml
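
Editor's note on the pods dashboard expression in the grafana-dashboard-pods.yaml hunk above: the long chain of `(sum by (node) (kube_node_info{node="titan-XX"}) * 0 + 0.0NN)` terms is a per-node tie-break. Each node contributes a distinct epsilon, so the `== bool ... max by (namespace)` comparison picks exactly one dominant node per namespace even when two nodes carry the same pod share, and values such as 0.009000000000000001 or 0.018000000000000002 are the float artifacts you would get from computing those epsilons as i * 0.001. The sketch below shows one plausible way to generate that fragment; the node list is copied from the diff, but the generator itself is an assumption, not tooling that exists in this repository.

```python
# Sketch (assumption): generate the per-node epsilon tie-break fragment used in the
# pods dashboard query. Each node gets a distinct offset of i * 0.001; the float
# artifacts in the committed expression (e.g. 0.009000000000000001) are consistent
# with exactly this kind of arithmetic.
NODES = [
    "titan-0a", "titan-0b", "titan-0c", "titan-db", "titan-jh",
    "titan-04", "titan-05", "titan-06", "titan-07", "titan-08",
    "titan-09", "titan-10", "titan-11", "titan-20", "titan-21",
    "titan-12", "titan-13", "titan-14", "titan-15", "titan-16",
    "titan-17", "titan-18", "titan-19", "titan-22", "titan-24",
]


def node_epsilon_expr(nodes: list[str]) -> str:
    """Return the 'or'-chained PromQL fragment assigning a unique epsilon per node."""
    parts = [
        f'(sum by (node) (kube_node_info{{node="{node}"}}) * 0 + {i * 0.001})'
        for i, node in enumerate(nodes, start=1)
    ]
    return " or ".join(parts)


if __name__ == "__main__":
    print(node_epsilon_expr(NODES))
```

Regenerating the fragment from a node list like this keeps the dashboard JSON consistent when the cluster changes, which is presumably why titan-16 appears in both the regex-based node filters and the epsilon chain in this change.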
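
One detail in the jetson_tegrastats_exporter.py hunk that is easy to miss: the old patterns used `\\s` and `\\d` inside raw strings, i.e. a literal backslash followed by `s` or `d`, so they could not match real tegrastats output; the single-backslash patterns (and the `"\n".join` fix) are the substance of that change. A small, self-contained check of the corrected patterns against an illustrative tegrastats-style line is sketched below; the sample line is made up to exercise the regexes, not captured from a device, and field order and extra fields vary between Jetson models (POM_5V_IN on older boards, VDD_IN on Orin-class boards).

```python
# Sketch: exercise the corrected tegrastats regexes from the exporter diff against a
# synthetic sample line. This mirrors the parse_line logic in the patch for testing only.
import re

SAMPLE = "RAM 3067/7620MB GR3D_FREQ 42% CPU@51.5C GPU@49.0C VDD_IN 4952mW/4952mW"


def parse_line(line: str) -> dict:
    line = line.strip()
    updates = {}
    if m := re.search(r"GR3D_FREQ\s+(\d+)%", line):
        updates["gr3d_freq_percent"] = float(m.group(1))
    if m := re.search(r"GPU@(\d+(?:\.\d+)?)C", line):
        updates["gpu_temp_c"] = float(m.group(1))
    if m := re.search(r"CPU@(\d+(?:\.\d+)?)C", line):
        updates["cpu_temp_c"] = float(m.group(1))
    if m := re.search(r"RAM\s+(\d+)/(\d+)MB", line):
        updates["ram_used_mb"] = float(m.group(1))
        updates["ram_total_mb"] = float(m.group(2))
    if m := re.search(r"(?:POM_5V_IN|VDD_IN)\s+(\d+)(?:mW)?/(\d+)(?:mW)?", line):
        updates["power_5v_in_mw"] = float(m.group(1))
    return updates


if __name__ == "__main__":
    parsed = parse_line(SAMPLE)
    assert parsed["gr3d_freq_percent"] == 42.0
    assert parsed["ram_total_mb"] == 7620.0
    assert parsed["power_5v_in_mw"] == 4952.0
    print(parsed)
```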