From 11a06e7683c07f43e2dd797de55d505648883e87 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 19 Jan 2026 16:58:02 -0300
Subject: [PATCH 001/416] feat: add Ariadne service and glue scheduling

---
 scripts/dashboards_render_atlas.py            |  37 ++++
 .../vaultwarden-cred-sync-cronjob.yaml        |   1 +
 services/finance/portal-rbac.yaml             |  14 ++
 services/health/portal-rbac.yaml              |  16 +-
 .../health/wger-admin-ensure-cronjob.yaml     |   1 +
 services/keycloak/realm-settings-job.yaml     |   2 +
 services/mailu/mailu-sync-cronjob.yaml        |   1 +
 services/maintenance/ariadne-deployment.yaml  | 181 ++++++++++++++++++
 services/maintenance/ariadne-service.yaml     |  13 ++
 .../maintenance/ariadne-serviceaccount.yaml   |   8 +
 services/maintenance/kustomization.yaml       |   6 +
 services/maintenance/secretproviderclass.yaml |  21 ++
 .../maintenance/vault-serviceaccount.yaml     |   6 +
 .../maintenance/vault-sync-deployment.yaml    |  34 ++++
 .../monitoring/dashboards/atlas-testing.json  | 150 +++++++++++++++
 .../monitoring/grafana-dashboard-testing.yaml | 150 +++++++++++++++
 services/nextcloud-mail-sync/cronjob.yaml     |   1 +
 services/nextcloud-mail-sync/portal-rbac.yaml |  13 ++
 .../vault/scripts/vault_k8s_auth_configure.sh |   2 +
 services/vaultwarden/ariadne-rbac.yaml        |  28 +++
 services/vaultwarden/kustomization.yaml       |   1 +
 21 files changed, 685 insertions(+), 1 deletion(-)
 create mode 100644 services/maintenance/ariadne-deployment.yaml
 create mode 100644 services/maintenance/ariadne-service.yaml
 create mode 100644 services/maintenance/ariadne-serviceaccount.yaml
 create mode 100644 services/maintenance/secretproviderclass.yaml
 create mode 100644 services/maintenance/vault-serviceaccount.yaml
 create mode 100644 services/maintenance/vault-sync-deployment.yaml
 create mode 100644 services/vaultwarden/ariadne-rbac.yaml

diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index 0931b48b..116bf218 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -336,6 +336,10 @@ GLUE_MISSING_ACTIVE = f"({GLUE_MISSING} unless on(namespace,cronjob) {GLUE_SUSPE
 GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE}))"
 GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE})"
 GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})"
+ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))'
+ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))'
+ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600"
+ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
 GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
 GPU_NODE_REGEX = "|".join(GPU_NODES)
 TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
@@ -2230,6 +2234,39 @@ def build_testing_dashboard():
             instant=True,
         )
     )
+    panels.append(
+        table_panel(
+            7,
+            "Ariadne Task Errors (24h)",
+            ARIADNE_TASK_ERRORS_24H,
+            {"h": 6, "w": 12, "x": 0, "y": 12},
+            unit="none",
+            transformations=sort_desc,
+            instant=True,
+        )
+    )
+    panels.append(
+        table_panel(
+            8,
+            "Ariadne Schedule Last Success (hours ago)",
+            ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS,
+            {"h": 6, "w": 12, "x": 12, "y": 12},
+            unit="h",
+            transformations=sort_desc,
+            instant=True,
+        )
+    )
+    panels.append(
+        table_panel(
+            9,
+            "Ariadne Access Requests",
+            ARIADNE_ACCESS_REQUESTS,
+            {"h": 4, "w": 24, "x": 0, "y": 18},
+            unit="none",
+            transformations=sort_desc,
+            instant=True,
+        )
+    )
 
     return {
         "uid": "atlas-testing",
diff --git a/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml b/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml
index 29141fe4..acd851b1 100644
--- a/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml
+++ b/services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml
@@ -8,6 +8,7 @@ metadata:
     atlas.bstein.dev/glue: "true"
 spec:
   schedule: "*/15 * * * *"
+  suspend: true
   concurrencyPolicy: Forbid
   successfulJobsHistoryLimit: 1
   failedJobsHistoryLimit: 3
diff --git a/services/finance/portal-rbac.yaml b/services/finance/portal-rbac.yaml
index 2fb7eded..66eafea9 100644
--- a/services/finance/portal-rbac.yaml
+++ b/services/finance/portal-rbac.yaml
@@ -29,3 +29,17 @@ subjects:
   - kind: ServiceAccount
     name: bstein-dev-home
     namespace: bstein-dev-home
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: ariadne-firefly-user-sync
+  namespace: finance
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: bstein-dev-home-firefly-user-sync
+subjects:
+  - kind: ServiceAccount
+    name: ariadne
+    namespace: maintenance
diff --git a/services/health/portal-rbac.yaml b/services/health/portal-rbac.yaml
index cd9acd19..feb74414 100644
--- a/services/health/portal-rbac.yaml
+++ b/services/health/portal-rbac.yaml
@@ -8,7 +8,7 @@ rules:
   - apiGroups: ["batch"]
     resources: ["cronjobs"]
     verbs: ["get"]
-    resourceNames: ["wger-user-sync"]
+    resourceNames: ["wger-user-sync", "wger-admin-ensure"]
   - apiGroups: ["batch"]
     resources: ["jobs"]
     verbs: ["create", "get", "list", "watch"]
@@ -29,3 +29,17 @@ subjects:
   - kind: ServiceAccount
     name: bstein-dev-home
     namespace: bstein-dev-home
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: ariadne-wger-user-sync
+  namespace: health
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: bstein-dev-home-wger-user-sync
+subjects:
+  - kind: ServiceAccount
+    name: ariadne
+    namespace: maintenance
diff --git a/services/health/wger-admin-ensure-cronjob.yaml b/services/health/wger-admin-ensure-cronjob.yaml
index db178a30..a1063dd9 100644
--- a/services/health/wger-admin-ensure-cronjob.yaml
+++ b/services/health/wger-admin-ensure-cronjob.yaml
@@ -8,6 +8,7 @@ metadata:
     atlas.bstein.dev/glue: "true"
 spec:
   schedule: "15 3 * * *"
+  suspend: true
   concurrencyPolicy: Forbid
   successfulJobsHistoryLimit: 1
   failedJobsHistoryLimit: 3
diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/realm-settings-job.yaml
index f6802005..a0b36ec5 100644
--- a/services/keycloak/realm-settings-job.yaml
+++ b/services/keycloak/realm-settings-job.yaml
@@ -331,6 +331,8 @@ spec:
               # Ensure basic realm groups exist for provisioning.
               ensure_group("dev")
               ensure_group("admin")
+              ensure_group("demo")
+              ensure_group("test")
               planka_group = ensure_group("planka-users")
 
               if planka_group and planka_group.get("id"):
diff --git a/services/mailu/mailu-sync-cronjob.yaml b/services/mailu/mailu-sync-cronjob.yaml
index 1da19810..671439d5 100644
--- a/services/mailu/mailu-sync-cronjob.yaml
+++ b/services/mailu/mailu-sync-cronjob.yaml
@@ -8,6 +8,7 @@ metadata:
     atlas.bstein.dev/glue: "true"
 spec:
   schedule: "30 4 * * *"
+  suspend: true
   concurrencyPolicy: Forbid
   jobTemplate:
     spec:
diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml
new file mode 100644
index 00000000..fd2fb797
--- /dev/null
+++ b/services/maintenance/ariadne-deployment.yaml
@@ -0,0 +1,181 @@
+# services/maintenance/ariadne-deployment.yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: ariadne
+  namespace: maintenance
+spec:
+  replicas: 1
+  revisionHistoryLimit: 3
+  selector:
+    matchLabels:
+      app: ariadne
+  template:
+    metadata:
+      labels:
+        app: ariadne
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "8080"
+        prometheus.io/path: "/metrics"
+        vault.hashicorp.com/agent-inject: "true"
+        vault.hashicorp.com/role: "maintenance"
+        vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/portal/atlas-portal-db"
+        vault.hashicorp.com/agent-inject-template-ariadne-env.sh: |
+          {{ with secret "kv/data/atlas/portal/atlas-portal-db" }}
+          export PORTAL_DATABASE_URL="{{ .Data.data.PORTAL_DATABASE_URL }}"
+          {{ end }}
+          {{ with secret "kv/data/atlas/portal/bstein-dev-home-keycloak-admin" }}
+          export KEYCLOAK_ADMIN_CLIENT_SECRET="{{ .Data.data.client_secret }}"
+          {{ end }}
+          {{ with secret "kv/data/atlas/mailu/mailu-db-secret" }}
+          export MAILU_DB_NAME="{{ .Data.data.database }}"
+          export MAILU_DB_USER="{{ .Data.data.username }}"
+          export MAILU_DB_PASSWORD="{{ .Data.data.password }}"
+          {{ end }}
+          {{ with secret "kv/data/atlas/mailu/mailu-initial-account-secret" }}
+          export SMTP_HOST="mailu-front.mailu-mailserver.svc.cluster.local"
+          export SMTP_PORT="587"
+          export SMTP_STARTTLS="true"
+          export SMTP_USE_TLS="false"
+          export SMTP_USERNAME="no-reply-portal@bstein.dev"
+          export SMTP_PASSWORD="{{ .Data.data.password }}"
+          export SMTP_FROM="no-reply-portal@bstein.dev"
+          {{ end }}
+    spec:
+      serviceAccountName: ariadne
+      nodeSelector:
+        kubernetes.io/arch: arm64
+        node-role.kubernetes.io/worker: "true"
+      containers:
+        - name: ariadne
+          image: registry.bstein.dev/bstein/ariadne:0.1.0
+          imagePullPolicy: Always
+          command: ["/bin/sh", "-c"]
+          args:
+            - >-
+              . /vault/secrets/ariadne-env.sh
+              && exec uvicorn ariadne.app:app --host 0.0.0.0 --port 8080
+          ports:
+            - name: http
+              containerPort: 8080
+          env:
+            - name: KEYCLOAK_URL
+              value: https://sso.bstein.dev
+            - name: KEYCLOAK_REALM
+              value: atlas
+            - name: KEYCLOAK_CLIENT_ID
+              value: bstein-dev-home
+            - name: KEYCLOAK_ISSUER
+              value: https://sso.bstein.dev/realms/atlas
+            - name: KEYCLOAK_JWKS_URL
+              value: http://keycloak.sso.svc.cluster.local/realms/atlas/protocol/openid-connect/certs
+            - name: KEYCLOAK_ADMIN_URL
+              value: http://keycloak.sso.svc.cluster.local
+            - name: KEYCLOAK_ADMIN_REALM
+              value: atlas
+            - name: KEYCLOAK_ADMIN_CLIENT_ID
+              value: bstein-dev-home-admin
+            - name: PORTAL_PUBLIC_BASE_URL
+              value: https://bstein.dev
+            - name: PORTAL_ADMIN_USERS
+              value: bstein
+            - name: PORTAL_ADMIN_GROUPS
+              value: admin
+            - name: ACCOUNT_ALLOWED_GROUPS
+              value: dev,admin
+            - name: ALLOWED_FLAG_GROUPS
+              value: demo,test
+            - name: DEFAULT_USER_GROUPS
+              value: dev
+            - name: MAILU_DOMAIN
+              value: bstein.dev
+            - name: MAILU_SYNC_URL
+              value: http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events
+            - name: MAILU_MAILBOX_WAIT_TIMEOUT_SEC
+              value: "60"
+            - name: MAILU_DB_HOST
+              value: postgres-service.postgres.svc.cluster.local
+            - name: MAILU_DB_PORT
+              value: "5432"
+            - name: NEXTCLOUD_NAMESPACE
+              value: nextcloud
+            - name: NEXTCLOUD_MAIL_SYNC_CRONJOB
+              value: nextcloud-mail-sync
+            - name: NEXTCLOUD_MAIL_SYNC_WAIT_TIMEOUT_SEC
+              value: "90"
+            - name: NEXTCLOUD_MAIL_SYNC_JOB_TTL_SEC
+              value: "3600"
+            - name: WGER_NAMESPACE
+              value: health
+            - name: WGER_USER_SYNC_CRONJOB
+              value: wger-user-sync
+            - name: WGER_ADMIN_CRONJOB
+              value: wger-admin-ensure
+            - name: WGER_USER_SYNC_WAIT_TIMEOUT_SEC
+              value: "90"
+            - name: FIREFLY_NAMESPACE
+              value: finance
+            - name: FIREFLY_USER_SYNC_CRONJOB
+              value: firefly-user-sync
+            - name: FIREFLY_USER_SYNC_WAIT_TIMEOUT_SEC
+              value: "90"
+            - name: VAULTWARDEN_NAMESPACE
+              value: vaultwarden
+            - name: VAULTWARDEN_POD_LABEL
+              value: app=vaultwarden
+            - name: VAULTWARDEN_POD_PORT
+              value: "80"
+            - name: VAULTWARDEN_SERVICE_HOST
+              value: vaultwarden-service.vaultwarden.svc.cluster.local
+            - name: VAULTWARDEN_ADMIN_SECRET_NAME
+              value: vaultwarden-admin
+            - name: VAULTWARDEN_ADMIN_SECRET_KEY
+              value: ADMIN_TOKEN
+            - name: VAULTWARDEN_ADMIN_SESSION_TTL_SEC
+              value: "900"
+            - name: VAULTWARDEN_ADMIN_RATE_LIMIT_BACKOFF_SEC
+              value: "600"
+            - name: VAULTWARDEN_RETRY_COOLDOWN_SEC
+              value: "1800"
+            - name: VAULTWARDEN_FAILURE_BAILOUT
+              value: "2"
+            - name: ARIADNE_PROVISION_POLL_INTERVAL_SEC
+              value: "5"
+            - name: ARIADNE_PROVISION_RETRY_COOLDOWN_SEC
+              value: "30"
+            - name: ARIADNE_SCHEDULE_TICK_SEC
+              value: "5"
+            - name: ARIADNE_SCHEDULE_MAILU_SYNC
+              value: "30 4 * * *"
+            - name: ARIADNE_SCHEDULE_NEXTCLOUD_SYNC
+              value: "0 5 * * *"
+            - name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC
+              value: "*/15 * * * *"
+            - name: ARIADNE_SCHEDULE_WGER_ADMIN
+              value: "15 3 * * *"
+            - name: WELCOME_EMAIL_ENABLED
+              value: "true"
+            - name: K8S_API_TIMEOUT_SEC
+              value: "5"
+            - name: METRICS_PATH
+              value: "/metrics"
+          resources:
+            requests:
+              cpu: 100m
+              memory: 128Mi
+            limits:
+              cpu: 500m
+              memory: 512Mi
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 10
+            periodSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 10
diff --git a/services/maintenance/ariadne-service.yaml b/services/maintenance/ariadne-service.yaml
new file mode 100644
index 00000000..9c93e1df
--- /dev/null
+++ b/services/maintenance/ariadne-service.yaml
@@ -0,0 +1,13 @@
+# services/maintenance/ariadne-service.yaml
+apiVersion: v1
+kind: Service
+metadata:
+  name: ariadne
+  namespace: maintenance
+spec:
+  selector:
+    app: ariadne
+  ports:
+    - name: http
+      port: 80
+      targetPort: http
diff --git a/services/maintenance/ariadne-serviceaccount.yaml b/services/maintenance/ariadne-serviceaccount.yaml
new file mode 100644
index 00000000..9adcef7e
--- /dev/null
+++ b/services/maintenance/ariadne-serviceaccount.yaml
@@ -0,0 +1,8 @@
+# services/maintenance/ariadne-serviceaccount.yaml
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: ariadne
+  namespace: maintenance
+imagePullSecrets:
+  - name: harbor-regcred
diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index e53ed3c7..f0f3de52 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -3,10 +3,16 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
   - namespace.yaml
+  - secretproviderclass.yaml
+  - vault-serviceaccount.yaml
+  - vault-sync-deployment.yaml
+  - ariadne-serviceaccount.yaml
   - disable-k3s-traefik-serviceaccount.yaml
   - k3s-traefik-cleanup-rbac.yaml
   - node-nofile-serviceaccount.yaml
   - pod-cleaner-rbac.yaml
+  - ariadne-deployment.yaml
+  - ariadne-service.yaml
   - disable-k3s-traefik-daemonset.yaml
   - k3s-traefik-cleanup-job.yaml
   - node-nofile-daemonset.yaml
diff --git a/services/maintenance/secretproviderclass.yaml b/services/maintenance/secretproviderclass.yaml
new file mode 100644
index 00000000..dd959480
--- /dev/null
+++ b/services/maintenance/secretproviderclass.yaml
@@ -0,0 +1,21 @@
+# services/maintenance/secretproviderclass.yaml
+apiVersion: secrets-store.csi.x-k8s.io/v1
+kind: SecretProviderClass
+metadata:
+  name: maintenance-vault
+  namespace: maintenance
+spec:
+  provider: vault
+  parameters:
+    vaultAddress: "http://vault.vault.svc.cluster.local:8200"
+    roleName: "maintenance"
+    objects: |
+      - objectName: "harbor-pull__dockerconfigjson"
+        secretPath: "kv/data/atlas/harbor-pull/maintenance"
+        secretKey: "dockerconfigjson"
+  secretObjects:
+    - secretName: harbor-regcred
+      type: kubernetes.io/dockerconfigjson
+      data:
+        - objectName: harbor-pull__dockerconfigjson
+          key: .dockerconfigjson
diff --git a/services/maintenance/vault-serviceaccount.yaml b/services/maintenance/vault-serviceaccount.yaml
new file mode 100644
index 00000000..f60b43ec
--- /dev/null
+++ b/services/maintenance/vault-serviceaccount.yaml
@@ -0,0 +1,6 @@
+# services/maintenance/vault-serviceaccount.yaml
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: maintenance-vault-sync
+  namespace: maintenance
diff --git a/services/maintenance/vault-sync-deployment.yaml b/services/maintenance/vault-sync-deployment.yaml
new file mode 100644
index 00000000..edc04561
--- /dev/null
+++ b/services/maintenance/vault-sync-deployment.yaml
@@ -0,0 +1,34 @@
+# services/maintenance/vault-sync-deployment.yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: maintenance-vault-sync
+  namespace: maintenance
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: maintenance-vault-sync
+  template:
+    metadata:
+      labels:
+        app: maintenance-vault-sync
+    spec:
+      serviceAccountName: maintenance-vault-sync
+      containers:
+        - name: sync
+          image: alpine:3.20
+          command: ["/bin/sh", "-c"]
+          args:
+            - "sleep infinity"
+          volumeMounts:
+            - name: vault-secrets
+              mountPath: /vault/secrets
+              readOnly: true
+      volumes:
+        - name: vault-secrets
+          csi:
+            driver: secrets-store.csi.k8s.io
+            readOnly: true
+            volumeAttributes:
+              secretProviderClass: maintenance-vault
diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-testing.json
index 25cf3f83..c9c0c9ab 100644
--- a/services/monitoring/dashboards/atlas-testing.json
+++ b/services/monitoring/dashboards/atlas-testing.json
@@ -321,6 +321,156 @@
           }
         }
       ]
+    },
+    {
+      "id": 7,
+      "type": "table",
+      "title": "Ariadne Task Errors (24h)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 12
+      },
+      "targets": [
+        {
+          "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[24h]))",
+          "refId": "A",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "none",
+          "custom": {
+            "filterable": true
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "showHeader": true,
+        "columnFilters": false
+      },
+      "transformations": [
+        {
+          "id": "labelsToFields",
+          "options": {}
+        },
+        {
+          "id": "sortBy",
+          "options": {
+            "fields": [
+              "Value"
+            ],
+            "order": "desc"
+          }
+        }
+      ]
+    },
+    {
+      "id": 8,
+      "type": "table",
+      "title": "Ariadne Schedule Last Success (hours ago)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 12,
+        "y": 12
+      },
+      "targets": [
+        {
+          "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600",
+          "refId": "A",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "h",
+          "custom": {
+            "filterable": true
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "showHeader": true,
+        "columnFilters": false
+      },
+      "transformations": [
+        {
+          "id": "labelsToFields",
+          "options": {}
+        },
+        {
+          "id": "sortBy",
+          "options": {
+            "fields": [
+              "Value"
+            ],
+            "order": "desc"
+          }
+        }
+      ]
+    },
+    {
+      "id": 9,
+      "type": "table",
+      "title": "Ariadne Access Requests",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 24,
+        "x": 0,
+        "y": 18
+      },
+      "targets": [
+        {
+          "expr": "ariadne_access_requests_total",
+          "refId": "A",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "none",
+          "custom": {
+            "filterable": true
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "showHeader": true,
+        "columnFilters": false
+      },
+      "transformations": [
+        {
+          "id": "labelsToFields",
+          "options": {}
+        },
+        {
+          "id": "sortBy",
+          "options": {
+            "fields": [
+              "Value"
+            ],
+            "order": "desc"
+          }
+        }
+      ]
     }
   ],
   "time": {
diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-testing.yaml
index 80a70438..7746f165 100644
--- a/services/monitoring/grafana-dashboard-testing.yaml
+++ b/services/monitoring/grafana-dashboard-testing.yaml
@@ -330,6 +330,156 @@ data:
               }
             }
           ]
+        },
+        {
+          "id": 7,
+          "type": "table",
+          "title": "Ariadne Task Errors (24h)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 6,
+            "w": 12,
+            "x": 0,
+            "y": 12
+          },
+          "targets": [
+            {
+              "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[24h]))",
+              "refId": "A",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none",
+              "custom": {
+                "filterable": true
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "showHeader": true,
+            "columnFilters": false
+          },
+          "transformations": [
+            {
+              "id": "labelsToFields",
+              "options": {}
+            },
+            {
+              "id": "sortBy",
+              "options": {
+                "fields": [
+                  "Value"
+                ],
+                "order": "desc"
+              }
+            }
+          ]
+        },
+        {
+          "id": 8,
+          "type": "table",
+          "title": "Ariadne Schedule Last Success (hours ago)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 6,
+            "w": 12,
+            "x": 12,
+            "y": 12
+          },
+          "targets": [
+            {
+              "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600",
+              "refId": "A",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "h",
+              "custom": {
+                "filterable": true
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "showHeader": true,
+            "columnFilters": false
+          },
+          "transformations": [
+            {
+              "id": "labelsToFields",
+              "options": {}
+            },
+            {
+              "id": "sortBy",
+              "options": {
+                "fields": [
+                  "Value"
+                ],
+                "order": "desc"
+              }
+            }
+          ]
+        },
+        {
+          "id": 9,
+          "type": "table",
+          "title": "Ariadne Access Requests",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 4,
+            "w": 24,
+            "x": 0,
+            "y": 18
+          },
+          "targets": [
+            {
+              "expr": "ariadne_access_requests_total",
+              "refId": "A",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none",
+              "custom": {
+                "filterable": true
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "showHeader": true,
+            "columnFilters": false
+          },
+          "transformations": [
+            {
+              "id": "labelsToFields",
+              "options": {}
+            },
+            {
+              "id": "sortBy",
+              "options": {
+                "fields": [
+                  "Value"
+                ],
+                "order": "desc"
+              }
+            }
+          ]
         }
       ],
       "time": {
diff --git a/services/nextcloud-mail-sync/cronjob.yaml b/services/nextcloud-mail-sync/cronjob.yaml
index 2073d76e..6913b603 100644
--- a/services/nextcloud-mail-sync/cronjob.yaml
+++ b/services/nextcloud-mail-sync/cronjob.yaml
@@ -8,6 +8,7 @@ metadata:
     atlas.bstein.dev/glue: "true"
 spec:
   schedule: "0 5 * * *"
+  suspend: true
   concurrencyPolicy: Forbid
   successfulJobsHistoryLimit: 3
   failedJobsHistoryLimit: 1
diff --git a/services/nextcloud-mail-sync/portal-rbac.yaml b/services/nextcloud-mail-sync/portal-rbac.yaml
index dc9a4e4b..009b2e08 100644
--- a/services/nextcloud-mail-sync/portal-rbac.yaml
+++ b/services/nextcloud-mail-sync/portal-rbac.yaml
@@ -27,3 +27,16 @@ subjects:
   - kind: ServiceAccount
     name: bstein-dev-home
     namespace: bstein-dev-home
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: ariadne-nextcloud-mail-sync
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: bstein-dev-home-nextcloud-mail-sync
+subjects:
+  - kind: ServiceAccount
+    name: ariadne
+    namespace: maintenance
diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh
index 202879f4..ca94ac66 100644
--- a/services/vault/scripts/vault_k8s_auth_configure.sh
+++ b/services/vault/scripts/vault_k8s_auth_configure.sh
@@ -230,6 +230,8 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \
   "crypto/* harbor-pull/crypto" ""
 write_policy_and_role "health" "health" "health-vault-sync" \
   "health/*" ""
+write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \
+  "portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret harbor-pull/maintenance" ""
 write_policy_and_role "finance" "finance" "finance-vault" \
   "finance/* shared/postmark-relay" ""
 write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \
diff --git a/services/vaultwarden/ariadne-rbac.yaml b/services/vaultwarden/ariadne-rbac.yaml
new file mode 100644
index 00000000..ee903ca8
--- /dev/null
+++ b/services/vaultwarden/ariadne-rbac.yaml
@@ -0,0 +1,28 @@
+# services/vaultwarden/ariadne-rbac.yaml
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: ariadne-vaultwarden-admin-reader
+  namespace: vaultwarden
+rules:
+  - apiGroups: [""]
+    resources: ["secrets"]
+    verbs: ["get"]
+    resourceNames: ["vaultwarden-admin"]
+  - apiGroups: [""]
+    resources: ["pods"]
+    verbs: ["get", "list"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: ariadne-vaultwarden-admin-reader
+  namespace: vaultwarden
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: ariadne-vaultwarden-admin-reader
+subjects:
+  - kind: ServiceAccount
+    name: ariadne
+    namespace: maintenance
diff --git a/services/vaultwarden/kustomization.yaml b/services/vaultwarden/kustomization.yaml
index c53cb1c6..ca5ef269 100644
--- a/services/vaultwarden/kustomization.yaml
+++ b/services/vaultwarden/kustomization.yaml
@@ -5,6 +5,7 @@ namespace: vaultwarden
 resources:
   - namespace.yaml
   - serviceaccount.yaml
+  - ariadne-rbac.yaml
   - pvc.yaml
   - deployment.yaml
   - service.yaml

From f3620aa2a4e04ed5b186769963e1915001372c90 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 19 Jan 2026 19:02:14 -0300
Subject: [PATCH 002/416] chore: centralize harbor pull credentials

---
 .../longhorn/core/secretproviderclass.yaml    |  2 +-
 .../bstein-dev-home/secretproviderclass.yaml  |  2 +-
 services/comms/secretproviderclass.yaml       |  2 +-
 .../crypto/xmr-miner/secretproviderclass.yaml |  2 +-
 services/harbor/secretproviderclass.yaml      |  2 +-
 services/keycloak/secretproviderclass.yaml    |  2 +-
 services/logging/secretproviderclass.yaml     |  2 +-
 services/mailu/secretproviderclass.yaml       |  2 +-
 services/maintenance/ariadne-deployment.yaml  |  2 +-
 services/maintenance/image.yaml               | 21 ++++++++++++++++++
 services/maintenance/kustomization.yaml       |  5 +++++
 services/maintenance/secretproviderclass.yaml |  2 +-
 services/monitoring/secretproviderclass.yaml  |  2 +-
 services/pegasus/secretproviderclass.yaml     |  2 +-
 .../vault/scripts/vault_k8s_auth_configure.sh | 22 +++++++++----------
 15 files changed, 49 insertions(+), 23 deletions(-)
 create mode 100644 services/maintenance/image.yaml

diff --git a/infrastructure/longhorn/core/secretproviderclass.yaml b/infrastructure/longhorn/core/secretproviderclass.yaml
index 031d1d8a..e292b86a 100644
--- a/infrastructure/longhorn/core/secretproviderclass.yaml
+++ b/infrastructure/longhorn/core/secretproviderclass.yaml
@@ -11,7 +11,7 @@ spec:
     roleName: "longhorn"
     objects: |
       - objectName: "harbor-pull__dockerconfigjson"
-        secretPath: "kv/data/atlas/harbor-pull/longhorn"
+        secretPath: "kv/data/atlas/shared/harbor-pull"
         secretKey: "dockerconfigjson"
   secretObjects:
     - secretName: longhorn-registry
diff --git a/services/bstein-dev-home/secretproviderclass.yaml b/services/bstein-dev-home/secretproviderclass.yaml
index f330fe68..2fa714a9 100644
--- a/services/bstein-dev-home/secretproviderclass.yaml
+++ b/services/bstein-dev-home/secretproviderclass.yaml
@@ -11,7 +11,7 @@ spec:
     roleName: "bstein-dev-home"
     objects: |
       - objectName: "harbor-pull__dockerconfigjson"
-        secretPath: "kv/data/atlas/harbor-pull/bstein-dev-home"
+        secretPath: "kv/data/atlas/shared/harbor-pull"
         secretKey: "dockerconfigjson"
   secretObjects:
     - secretName: harbor-regcred
diff --git a/services/comms/secretproviderclass.yaml b/services/comms/secretproviderclass.yaml
index 69d4b2b3..0a895527 100644
--- a/services/comms/secretproviderclass.yaml
+++ b/services/comms/secretproviderclass.yaml
@@ -11,7 +11,7 @@ spec:
     roleName: "comms"
     objects: |
       - objectName: "harbor-pull__dockerconfigjson"
-        secretPath: "kv/data/atlas/harbor-pull/comms"
+        secretPath: "kv/data/atlas/shared/harbor-pull"
         secretKey: "dockerconfigjson"
   secretObjects:
     - secretName: harbor-regcred
diff --git a/services/crypto/xmr-miner/secretproviderclass.yaml b/services/crypto/xmr-miner/secretproviderclass.yaml
index a72097fc..12e4ba19 100644
--- a/services/crypto/xmr-miner/secretproviderclass.yaml
+++ b/services/crypto/xmr-miner/secretproviderclass.yaml
@@ -11,7 +11,7 @@ spec:
     roleName: "crypto"
     objects: |
       - objectName: "harbor-pull__dockerconfigjson"
-        secretPath: "kv/data/atlas/harbor-pull/crypto"
+        secretPath: "kv/data/atlas/shared/harbor-pull"
         secretKey: "dockerconfigjson"
   secretObjects:
     - secretName: harbor-regcred
diff --git a/services/harbor/secretproviderclass.yaml b/services/harbor/secretproviderclass.yaml
index 03fef95a..636f6fa8 100644
--- a/services/harbor/secretproviderclass.yaml
+++ b/services/harbor/secretproviderclass.yaml
@@ -11,7 +11,7 @@ spec:
     roleName: "harbor"
     objects: |
       - objectName: "harbor-pull__dockerconfigjson"
-        secretPath: "kv/data/atlas/harbor-pull/harbor"
+        secretPath: "kv/data/atlas/shared/harbor-pull"
         secretKey: "dockerconfigjson"
   secretObjects:
     - secretName: harbor-regcred
diff --git a/services/keycloak/secretproviderclass.yaml b/services/keycloak/secretproviderclass.yaml
index 86cebd24..d4c094f2 100644
--- a/services/keycloak/secretproviderclass.yaml
+++ b/services/keycloak/secretproviderclass.yaml
@@ -11,7 +11,7 @@ spec:
     roleName: "sso"
     objects: |
       - objectName: "harbor-pull__dockerconfigjson"
-        secretPath: "kv/data/atlas/harbor-pull/sso"
+        secretPath: "kv/data/atlas/shared/harbor-pull"
         secretKey: "dockerconfigjson"
   secretObjects:
     - secretName: harbor-regcred
diff --git a/services/logging/secretproviderclass.yaml b/services/logging/secretproviderclass.yaml
index f5db15ee..6ff642d2 100644
--- a/services/logging/secretproviderclass.yaml
+++ b/services/logging/secretproviderclass.yaml
@@ -11,7 +11,7 @@ spec:
     roleName: "logging"
     objects: |
       - objectName: "harbor-pull__dockerconfigjson"
-        secretPath: "kv/data/atlas/harbor-pull/logging"
+        secretPath: "kv/data/atlas/shared/harbor-pull"
         secretKey: "dockerconfigjson"
   secretObjects:
     - secretName: harbor-regcred
diff --git a/services/mailu/secretproviderclass.yaml b/services/mailu/secretproviderclass.yaml
index f58c69b3..f9e281e5 100644
--- a/services/mailu/secretproviderclass.yaml
+++ b/services/mailu/secretproviderclass.yaml
@@ -11,7 +11,7 @@ spec:
     roleName: "mailu-mailserver"
     objects: |
       - objectName: "harbor-pull__dockerconfigjson"
-        secretPath: "kv/data/atlas/harbor-pull/mailu-mailserver"
+        secretPath: "kv/data/atlas/shared/harbor-pull"
         secretKey: "dockerconfigjson"
   secretObjects:
     - secretName: harbor-regcred
diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml
index fd2fb797..ee4884da 100644
--- a/services/maintenance/ariadne-deployment.yaml
+++ b/services/maintenance/ariadne-deployment.yaml
@@ -49,7 +49,7 @@ spec:
         node-role.kubernetes.io/worker: "true"
       containers:
         - name: ariadne
-          image: registry.bstein.dev/bstein/ariadne:0.1.0
+          image: registry.bstein.dev/bstein/ariadne:0.1.0-0
           imagePullPolicy: Always
           command: ["/bin/sh", "-c"]
           args:
diff --git a/services/maintenance/image.yaml b/services/maintenance/image.yaml
new file mode 100644
index 00000000..95acbd0b
--- /dev/null
+++ b/services/maintenance/image.yaml
@@ -0,0 +1,21 @@
+# services/maintenance/image.yaml
+apiVersion: image.toolkit.fluxcd.io/v1beta2
+kind: ImageRepository
+metadata:
+  name: ariadne
+  namespace: maintenance
+spec:
+  image: registry.bstein.dev/bstein/ariadne
+  interval: 1m0s
+---
+apiVersion: image.toolkit.fluxcd.io/v1beta2
+kind: ImagePolicy
+metadata:
+  name: ariadne
+  namespace: maintenance
+spec:
+  imageRepositoryRef:
+    name: ariadne
+  policy:
+    semver:
+      range: ">=0.1.0-0"
diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index f0f3de52..5e199a98 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -3,6 +3,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
   - namespace.yaml
+  - image.yaml
   - secretproviderclass.yaml
   - vault-serviceaccount.yaml
   - vault-sync-deployment.yaml
@@ -22,6 +23,10 @@ resources:
   - node-image-sweeper-daemonset.yaml
   - image-sweeper-cronjob.yaml
 
+images:
+  - name: registry.bstein.dev/bstein/ariadne
+    newTag: 0.1.0-0 # {"$imagepolicy": "maintenance:ariadne"}
+
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance
diff --git a/services/maintenance/secretproviderclass.yaml b/services/maintenance/secretproviderclass.yaml
index dd959480..85df2af5 100644
--- a/services/maintenance/secretproviderclass.yaml
+++ b/services/maintenance/secretproviderclass.yaml
@@ -11,7 +11,7 @@ spec:
     roleName: "maintenance"
     objects: |
       - objectName: "harbor-pull__dockerconfigjson"
-        secretPath: "kv/data/atlas/harbor-pull/maintenance"
+        secretPath: "kv/data/atlas/shared/harbor-pull"
         secretKey: "dockerconfigjson"
   secretObjects:
     - secretName: harbor-regcred
diff --git a/services/monitoring/secretproviderclass.yaml b/services/monitoring/secretproviderclass.yaml
index 8a6c5fbb..350d6aa3 100644
--- a/services/monitoring/secretproviderclass.yaml
+++ b/services/monitoring/secretproviderclass.yaml
@@ -11,7 +11,7 @@ spec:
     roleName: "monitoring"
     objects: |
       - objectName: "harbor-pull__dockerconfigjson"
-        secretPath: "kv/data/atlas/harbor-pull/monitoring"
+        secretPath: "kv/data/atlas/shared/harbor-pull"
         secretKey: "dockerconfigjson"
   secretObjects:
     - secretName: harbor-regcred
diff --git a/services/pegasus/secretproviderclass.yaml b/services/pegasus/secretproviderclass.yaml
index b4621a57..b8d1df96 100644
--- a/services/pegasus/secretproviderclass.yaml
+++ b/services/pegasus/secretproviderclass.yaml
@@ -11,7 +11,7 @@ spec:
     roleName: "pegasus"
     objects: |
       - objectName: "harbor-pull__dockerconfigjson"
-        secretPath: "kv/data/atlas/harbor-pull/jellyfin"
+        secretPath: "kv/data/atlas/shared/harbor-pull"
         secretKey: "dockerconfigjson"
   secretObjects:
     - secretName: harbor-regcred
diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh
index ca94ac66..c7eaf859 100644
--- a/services/vault/scripts/vault_k8s_auth_configure.sh
+++ b/services/vault/scripts/vault_k8s_auth_configure.sh
@@ -203,42 +203,42 @@ write_policy_and_role "outline" "outline" "outline-vault" \
 write_policy_and_role "planka" "planka" "planka-vault" \
   "planka/* shared/postmark-relay" ""
 write_policy_and_role "bstein-dev-home" "bstein-dev-home" "bstein-dev-home,bstein-dev-home-vault-sync" \
-  "portal/* shared/chat-ai-keys-runtime shared/portal-e2e-client shared/postmark-relay mailu/mailu-initial-account-secret harbor-pull/bstein-dev-home" ""
+  "portal/* shared/chat-ai-keys-runtime shared/portal-e2e-client shared/postmark-relay mailu/mailu-initial-account-secret shared/harbor-pull" ""
 write_policy_and_role "gitea" "gitea" "gitea-vault" \
   "gitea/*" ""
 write_policy_and_role "vaultwarden" "vaultwarden" "vaultwarden-vault" \
   "vaultwarden/* mailu/mailu-initial-account-secret" ""
 write_policy_and_role "sso" "sso" "sso-vault,sso-vault-sync,mas-secrets-ensure" \
-  "sso/* portal/bstein-dev-home-keycloak-admin shared/keycloak-admin shared/portal-e2e-client shared/postmark-relay harbor-pull/sso" ""
+  "sso/* portal/bstein-dev-home-keycloak-admin shared/keycloak-admin shared/portal-e2e-client shared/postmark-relay shared/harbor-pull" ""
 write_policy_and_role "mailu-mailserver" "mailu-mailserver" "mailu-vault-sync" \
-  "mailu/* shared/postmark-relay harbor-pull/mailu-mailserver" ""
+  "mailu/* shared/postmark-relay shared/harbor-pull" ""
 write_policy_and_role "harbor" "harbor" "harbor-vault-sync" \
-  "harbor/* harbor-pull/harbor" ""
+  "harbor/* shared/harbor-pull" ""
 write_policy_and_role "nextcloud" "nextcloud" "nextcloud-vault" \
   "nextcloud/* shared/keycloak-admin shared/postmark-relay" ""
 write_policy_and_role "comms" "comms" "comms-vault,atlasbot" \
-  "comms/* shared/chat-ai-keys-runtime harbor-pull/comms" ""
+  "comms/* shared/chat-ai-keys-runtime shared/harbor-pull" ""
 write_policy_and_role "jenkins" "jenkins" "jenkins" \
   "jenkins/*" ""
 write_policy_and_role "monitoring" "monitoring" "monitoring-vault-sync" \
-  "monitoring/* shared/postmark-relay harbor-pull/monitoring" ""
+  "monitoring/* shared/postmark-relay shared/harbor-pull" ""
 write_policy_and_role "logging" "logging" "logging-vault-sync" \
-  "logging/* harbor-pull/logging" ""
+  "logging/* shared/harbor-pull" ""
 write_policy_and_role "pegasus" "jellyfin" "pegasus-vault-sync" \
-  "pegasus/* harbor-pull/jellyfin" ""
+  "pegasus/* shared/harbor-pull" ""
 write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \
-  "crypto/* harbor-pull/crypto" ""
+  "crypto/* shared/harbor-pull" ""
 write_policy_and_role "health" "health" "health-vault-sync" \
   "health/*" ""
 write_policy_and_role "maintenance" "maintenance" "ariadne" \
-  "portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret harbor-pull/maintenance" ""
+  "portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret shared/harbor-pull" ""
 write_policy_and_role "finance" "finance" "finance-vault" \
   "finance/* shared/postmark-relay" ""
 write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \
   "" \
   "finance/*"
 write_policy_and_role "longhorn" "longhorn-system" "longhorn-vault,longhorn-vault-sync" \
-  "longhorn/* harbor-pull/longhorn" ""
+  "longhorn/* shared/harbor-pull" ""
 write_policy_and_role "postgres" "postgres" "postgres-vault" \
   "postgres/postgres-db" ""
 write_policy_and_role "vault" "vault" "vault" \

From a6b317097e172079ed89099bed139205d4f0b296 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 19 Jan 2026 19:07:00 -0300
Subject: [PATCH 003/416] fix: allow maintenance vault sync role

---
 services/vault/scripts/vault_k8s_auth_configure.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh
index c7eaf859..a5ccb61d 100644
--- a/services/vault/scripts/vault_k8s_auth_configure.sh
+++ b/services/vault/scripts/vault_k8s_auth_configure.sh
@@ -230,7 +230,7 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \
   "crypto/* shared/harbor-pull" ""
 write_policy_and_role "health" "health" "health-vault-sync" \
   "health/*" ""
-write_policy_and_role "maintenance" "maintenance" "ariadne" \
+write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \
   "portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret shared/harbor-pull" ""
 write_policy_and_role "finance" "finance" "finance-vault" \
   "finance/* shared/postmark-relay" ""

From 0e07ca791ee045ec060fae05b62474ee547b2663 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 19 Jan 2026 19:22:53 -0300
Subject: [PATCH 004/416] feat: wire portal to ariadne

---
 services/bstein-dev-home/backend-deployment.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/services/bstein-dev-home/backend-deployment.yaml b/services/bstein-dev-home/backend-deployment.yaml
index 376622c2..f3bca954 100644
--- a/services/bstein-dev-home/backend-deployment.yaml
+++ b/services/bstein-dev-home/backend-deployment.yaml
@@ -91,6 +91,10 @@ spec:
               value: atlas
             - name: KEYCLOAK_ADMIN_CLIENT_ID
               value: bstein-dev-home-admin
+            - name: ARIADNE_URL
+              value: http://ariadne.maintenance.svc.cluster.local:8080
+            - name: ARIADNE_TIMEOUT_SEC
+              value: "10"
             - name: ACCOUNT_ALLOWED_GROUPS
               value: ""
             - name: HTTP_CHECK_TIMEOUT_SEC

From 093d86a4e99aad3990bf8fbcfcc415cbf3d75479 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 19 Jan 2026 22:03:50 -0300
Subject: [PATCH 005/416] chore: add maintenance image automation

---
 .../bstein-dev-home/image-automation.yaml     |  4 +--
 .../flux-system/platform/kustomization.yaml   |  1 +
 .../maintenance/image-automation.yaml         | 26 +++++++++++++++++++
 3 files changed, 29 insertions(+), 2 deletions(-)
 create mode 100644 clusters/atlas/flux-system/platform/maintenance/image-automation.yaml

diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml
index 88dda408..643d4792 100644
--- a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml
+++ b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml
@@ -13,14 +13,14 @@ spec:
   git:
     checkout:
       ref:
-        branch: feature/vault-consumption
+        branch: feature/ariadne
     commit:
       author:
         email: ops@bstein.dev
         name: flux-bot
       messageTemplate: "chore(bstein-dev-home): update images to {{range .Updated.Images}}{{.}}{{end}}"
     push:
-      branch: feature/vault-consumption
+      branch: feature/ariadne
   update:
     strategy: Setters
     path: services/bstein-dev-home
diff --git a/clusters/atlas/flux-system/platform/kustomization.yaml b/clusters/atlas/flux-system/platform/kustomization.yaml
index b689cc04..6e75b040 100644
--- a/clusters/atlas/flux-system/platform/kustomization.yaml
+++ b/clusters/atlas/flux-system/platform/kustomization.yaml
@@ -11,6 +11,7 @@ resources:
   - monitoring/kustomization.yaml
   - logging/kustomization.yaml
   - maintenance/kustomization.yaml
+  - maintenance/image-automation.yaml
   - longhorn-adopt/kustomization.yaml
   - longhorn/kustomization.yaml
   - longhorn-ui/kustomization.yaml
diff --git a/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml
new file mode 100644
index 00000000..867cae48
--- /dev/null
+++ b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml
@@ -0,0 +1,26 @@
+# clusters/atlas/flux-system/platform/maintenance/image-automation.yaml
+apiVersion: image.toolkit.fluxcd.io/v1
+kind: ImageUpdateAutomation
+metadata:
+  name: maintenance
+  namespace: flux-system
+spec:
+  interval: 1m0s
+  sourceRef:
+    kind: GitRepository
+    name: flux-system
+    namespace: flux-system
+  git:
+    checkout:
+      ref:
+        branch: feature/ariadne
+    commit:
+      author:
+        email: ops@bstein.dev
+        name: flux-bot
+      messageTemplate: "chore(maintenance): update images to {{range .Updated.Images}}{{.}}{{end}}"
+    push:
+      branch: feature/ariadne
+  update:
+    strategy: Setters
+    path: services/maintenance

From 88ed989023c8ed31f7f6d3d33ef2e4cd64b98521 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 19 Jan 2026 22:11:38 -0300
Subject: [PATCH 006/416] bstein-dev-home: bump images to 0.1.1-107

---
 services/bstein-dev-home/kustomization.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index f9d3c87f..ec137dc6 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,9 +20,9 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-102 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"}
+    newTag: 0.1.1-106 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-103 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"}
+    newTag: 0.1.1-107 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From c377294d1b92df8c36794b57328e604b11ed7d87 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 19 Jan 2026 22:33:34 -0300
Subject: [PATCH 007/416] fix: unblock keycloak and refresh glue checks

---
 ci/tests/glue/config.yaml                 |  9 +++++++++
 ci/tests/glue/test_glue_metrics.py        | 19 +++++++++++++++++++
 services/jenkins/configmap-jcasc.yaml     |  2 +-
 services/keycloak/realm-settings-job.yaml |  2 +-
 4 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/ci/tests/glue/config.yaml b/ci/tests/glue/config.yaml
index 8adf4ca0..16b656c2 100644
--- a/ci/tests/glue/config.yaml
+++ b/ci/tests/glue/config.yaml
@@ -1,7 +1,16 @@
 max_success_age_hours: 48
 allow_suspended:
+  - bstein-dev-home/vaultwarden-cred-sync
   - comms/othrys-room-reset
   - comms/pin-othrys-invite
   - comms/seed-othrys-room
   - finance/firefly-user-sync
+  - health/wger-admin-ensure
   - health/wger-user-sync
+  - mailu-mailserver/mailu-sync-nightly
+  - nextcloud/nextcloud-mail-sync
+ariadne_schedule_tasks:
+  - schedule.mailu_sync
+  - schedule.nextcloud_sync
+  - schedule.vaultwarden_sync
+  - schedule.wger_admin
diff --git a/ci/tests/glue/test_glue_metrics.py b/ci/tests/glue/test_glue_metrics.py
index 16b01c7c..52ec0bef 100644
--- a/ci/tests/glue/test_glue_metrics.py
+++ b/ci/tests/glue/test_glue_metrics.py
@@ -1,11 +1,19 @@
 from __future__ import annotations
 
 import os
+from pathlib import Path
 
 import requests
+import yaml
 
 
 VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server:8428").rstrip("/")
+CONFIG_PATH = Path(__file__).with_name("config.yaml")
+
+
+def _load_config() -> dict:
+    with CONFIG_PATH.open("r", encoding="utf-8") as handle:
+        return yaml.safe_load(handle) or {}
 
 
 def _query(promql: str) -> list[dict]:
@@ -27,3 +35,14 @@ def test_glue_metrics_success_join():
     )
     series = _query(query)
     assert series, "No glue cronjob last success series found"
+
+
+def test_ariadne_schedule_metrics_present():
+    cfg = _load_config()
+    expected = cfg.get("ariadne_schedule_tasks", [])
+    if not expected:
+        return
+    series = _query("ariadne_schedule_next_run_timestamp_seconds")
+    tasks = {item.get("metric", {}).get("task") for item in series}
+    missing = [task for task in expected if task not in tasks]
+    assert not missing, f"Missing Ariadne schedule metrics for: {', '.join(missing)}"
diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml
index ac26350e..25dd748d 100644
--- a/services/jenkins/configmap-jcasc.yaml
+++ b/services/jenkins/configmap-jcasc.yaml
@@ -151,7 +151,7 @@ data:
                       url('https://scm.bstein.dev/bstein/titan-iac.git')
                       credentials('gitea-pat')
                     }
-                    branches('*/feature/vault-consumption')
+                    branches('*/main')
                   }
                 }
                 scriptPath('ci/Jenkinsfile.titan-iac')
diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/realm-settings-job.yaml
index a0b36ec5..fdee377c 100644
--- a/services/keycloak/realm-settings-job.yaml
+++ b/services/keycloak/realm-settings-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: keycloak-realm-settings-32
+  name: keycloak-realm-settings-33
   namespace: sso
 spec:
   backoffLimit: 0

From d25ca49c4949f61629827a3ef3e81de717ea3870 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 19 Jan 2026 22:35:29 -0300
Subject: [PATCH 008/416] chore: run portal onboarding e2e job

---
 services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
index f22272e0..201e3f57 100644
--- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
+++ b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: portal-onboarding-e2e-test-19
+  name: portal-onboarding-e2e-test-20
   namespace: bstein-dev-home
 spec:
   backoffLimit: 0

From c508d7ade8e83b17b68a92654add1d1d5c96e41c Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 19 Jan 2026 22:38:22 -0300
Subject: [PATCH 009/416] fix: point portal at ariadne service

---
 services/bstein-dev-home/backend-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/backend-deployment.yaml b/services/bstein-dev-home/backend-deployment.yaml
index f3bca954..074a19d0 100644
--- a/services/bstein-dev-home/backend-deployment.yaml
+++ b/services/bstein-dev-home/backend-deployment.yaml
@@ -92,7 +92,7 @@ spec:
             - name: KEYCLOAK_ADMIN_CLIENT_ID
               value: bstein-dev-home-admin
             - name: ARIADNE_URL
-              value: http://ariadne.maintenance.svc.cluster.local:8080
+              value: http://ariadne.maintenance.svc.cluster.local
             - name: ARIADNE_TIMEOUT_SEC
               value: "10"
             - name: ACCOUNT_ALLOWED_GROUPS

From ee5bfea07220491517e375a302d9ba3fd9e13dd7 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 19 Jan 2026 22:42:14 -0300
Subject: [PATCH 010/416] chore: rerun portal onboarding e2e

---
 services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
index 201e3f57..0b650903 100644
--- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
+++ b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: portal-onboarding-e2e-test-20
+  name: portal-onboarding-e2e-test-21
   namespace: bstein-dev-home
 spec:
   backoffLimit: 0

From 9af9f28060c4e758d08287169c311a2a1ea94f24 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 19 Jan 2026 22:49:23 -0300
Subject: [PATCH 011/416] fix: extend mailu mailbox wait for ariadne

---
 services/maintenance/ariadne-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml
index ee4884da..0543f80f 100644
--- a/services/maintenance/ariadne-deployment.yaml
+++ b/services/maintenance/ariadne-deployment.yaml
@@ -93,7 +93,7 @@ spec:
             - name: MAILU_SYNC_URL
               value: http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events
             - name: MAILU_MAILBOX_WAIT_TIMEOUT_SEC
-              value: "60"
+              value: "180"
             - name: MAILU_DB_HOST
               value: postgres-service.postgres.svc.cluster.local
             - name: MAILU_DB_PORT

From 60973d3f3c1a9ea50894bf031e95c71320973197 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 19 Jan 2026 23:04:59 -0300
Subject: [PATCH 012/416] chore(maintenance): bump ariadne image tag

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 5e199a98..e09f6a84 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -25,7 +25,7 @@ resources:
 
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-0 # {"$imagepolicy": "maintenance:ariadne"}
+    newTag: 0.1.0-1 # {"$imagepolicy": "maintenance:ariadne"}
 
 configMapGenerator:
   - name: disable-k3s-traefik-script

From 179023a1cccb238f0ee342a80597f7348276cb7e Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 19 Jan 2026 23:05:46 -0300
Subject: [PATCH 013/416] chore(portal): rerun onboarding e2e

---
 services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
index 0b650903..c9c1c044 100644
--- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
+++ b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: portal-onboarding-e2e-test-21
+  name: portal-onboarding-e2e-test-22
   namespace: bstein-dev-home
 spec:
   backoffLimit: 0

From 564af1c1d436224ec4e9ee7840030c6678ab4014 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 19 Jan 2026 23:28:07 -0300
Subject: [PATCH 014/416] fix(mailu): allow forced sync

---
 services/mailu/mailu-sync-listener.yaml       |  2 +-
 services/mailu/scripts/mailu_sync_listener.py | 10 ++++++----
 services/maintenance/kustomization.yaml       |  2 +-
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/services/mailu/mailu-sync-listener.yaml b/services/mailu/mailu-sync-listener.yaml
index cc98107f..b3d2acce 100644
--- a/services/mailu/mailu-sync-listener.yaml
+++ b/services/mailu/mailu-sync-listener.yaml
@@ -30,7 +30,7 @@ spec:
         app: mailu-sync-listener
       annotations:
         vault.hashicorp.com/agent-inject: "true"
-        atlas.bstein.dev/mailu-sync-rev: "2"
+        atlas.bstein.dev/mailu-sync-rev: "3"
         vault.hashicorp.com/role: "mailu-mailserver"
         vault.hashicorp.com/agent-inject-secret-mailu-db-secret__database: "kv/data/atlas/mailu/mailu-db-secret"
         vault.hashicorp.com/agent-inject-template-mailu-db-secret__database: |
diff --git a/services/mailu/scripts/mailu_sync_listener.py b/services/mailu/scripts/mailu_sync_listener.py
index 6ac0da7c..4e31c811 100644
--- a/services/mailu/scripts/mailu_sync_listener.py
+++ b/services/mailu/scripts/mailu_sync_listener.py
@@ -39,12 +39,12 @@ def _run_sync_blocking() -> int:
             sync_done.set()
 
 
-def _trigger_sync_async() -> bool:
+def _trigger_sync_async(force: bool = False) -> bool:
     with lock:
         now = time()
         if sync_running:
             return False
-        if now - last_run < MIN_INTERVAL_SECONDS:
+        if not force and now - last_run < MIN_INTERVAL_SECONDS:
             return False
 
     thread = threading.Thread(target=_run_sync_blocking, daemon=True)
@@ -64,15 +64,17 @@ class Handler(http.server.BaseHTTPRequestHandler):
             return
 
         wait = False
+        force = False
         if isinstance(payload, dict):
             wait = bool(payload.get("wait"))
+            force = bool(payload.get("force"))
 
         if wait:
             with lock:
                 already_running = sync_running
 
             if not already_running:
-                _trigger_sync_async()
+                _trigger_sync_async(force=force)
 
             sync_done.wait(timeout=WAIT_TIMEOUT_SECONDS)
             with lock:
@@ -87,7 +89,7 @@ class Handler(http.server.BaseHTTPRequestHandler):
             self.end_headers()
             return
 
-        _trigger_sync_async()
+        _trigger_sync_async(force=force)
         self.send_response(202)
         self.end_headers()
 
diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index e09f6a84..9255d889 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -25,7 +25,7 @@ resources:
 
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-1 # {"$imagepolicy": "maintenance:ariadne"}
+    newTag: 0.1.0-2 # {"$imagepolicy": "maintenance:ariadne"}
 
 configMapGenerator:
   - name: disable-k3s-traefik-script

From 49292f9d8ba3a473f284de34015f4e7fcbe7b2f2 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 19 Jan 2026 23:31:45 -0300
Subject: [PATCH 015/416] chore(portal): rerun onboarding e2e

---
 services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
index c9c1c044..9dbe68df 100644
--- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
+++ b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: portal-onboarding-e2e-test-22
+  name: portal-onboarding-e2e-test-23
   namespace: bstein-dev-home
 spec:
   backoffLimit: 0

From d07415e6230bd1b6fe0be53de8215a5f17cf114d Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 19 Jan 2026 23:45:31 -0300
Subject: [PATCH 016/416] core: fix postmark DNS and time sync

---
 infrastructure/core/coredns-custom.yaml       |  3 ++
 infrastructure/core/kustomization.yaml        |  1 +
 infrastructure/core/ntp-sync-daemonset.yaml   | 50 +++++++++++++++++++
 .../postmark-exporter-deployment.yaml         |  4 +-
 4 files changed, 56 insertions(+), 2 deletions(-)
 create mode 100644 infrastructure/core/ntp-sync-daemonset.yaml

diff --git a/infrastructure/core/coredns-custom.yaml b/infrastructure/core/coredns-custom.yaml
index 8aeff149..6266a22a 100644
--- a/infrastructure/core/coredns-custom.yaml
+++ b/infrastructure/core/coredns-custom.yaml
@@ -32,6 +32,9 @@ data:
         192.168.22.9 notes.bstein.dev
         192.168.22.9 office.bstein.dev
         192.168.22.9 pegasus.bstein.dev
+        3.136.224.193 pm-bounces.bstein.dev
+        3.150.68.49 pm-bounces.bstein.dev
+        18.189.137.81 pm-bounces.bstein.dev
         192.168.22.9 registry.bstein.dev
         192.168.22.9 scm.bstein.dev
         192.168.22.9 secret.bstein.dev
diff --git a/infrastructure/core/kustomization.yaml b/infrastructure/core/kustomization.yaml
index 6286186d..257e1f06 100644
--- a/infrastructure/core/kustomization.yaml
+++ b/infrastructure/core/kustomization.yaml
@@ -6,5 +6,6 @@ resources:
   - ../modules/profiles/atlas-ha
   - coredns-custom.yaml
   - coredns-deployment.yaml
+  - ntp-sync-daemonset.yaml
   - ../sources/cert-manager/letsencrypt.yaml
   - ../sources/cert-manager/letsencrypt-prod.yaml
diff --git a/infrastructure/core/ntp-sync-daemonset.yaml b/infrastructure/core/ntp-sync-daemonset.yaml
new file mode 100644
index 00000000..ba972949
--- /dev/null
+++ b/infrastructure/core/ntp-sync-daemonset.yaml
@@ -0,0 +1,50 @@
+# infrastructure/core/ntp-sync-daemonset.yaml
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: ntp-sync
+  namespace: kube-system
+  labels:
+    app: ntp-sync
+spec:
+  selector:
+    matchLabels:
+      app: ntp-sync
+  template:
+    metadata:
+      labels:
+        app: ntp-sync
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: node-role.kubernetes.io/control-plane
+                    operator: DoesNotExist
+                  - key: node-role.kubernetes.io/master
+                    operator: DoesNotExist
+      containers:
+        - name: ntp-sync
+          image: public.ecr.aws/docker/library/busybox:1.36.1
+          imagePullPolicy: IfNotPresent
+          command: ["/bin/sh", "-c"]
+          args:
+            - |
+              set -eu
+              while true; do
+                ntpd -q -p pool.ntp.org || true
+                sleep 300
+              done
+          securityContext:
+            capabilities:
+              add: ["SYS_TIME"]
+            runAsUser: 0
+            runAsGroup: 0
+          resources:
+            requests:
+              cpu: 10m
+              memory: 16Mi
+            limits:
+              cpu: 50m
+              memory: 64Mi
diff --git a/services/monitoring/postmark-exporter-deployment.yaml b/services/monitoring/postmark-exporter-deployment.yaml
index 64062248..98791d95 100644
--- a/services/monitoring/postmark-exporter-deployment.yaml
+++ b/services/monitoring/postmark-exporter-deployment.yaml
@@ -18,9 +18,9 @@ spec:
         prometheus.io/path: "/metrics"
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "monitoring"
-        vault.hashicorp.com/agent-inject-secret-postmark-env: "kv/data/atlas/monitoring/postmark-exporter"
+        vault.hashicorp.com/agent-inject-secret-postmark-env: "kv/data/atlas/shared/postmark-relay"
         vault.hashicorp.com/agent-inject-template-postmark-env: |
-          {{- with secret "kv/data/atlas/monitoring/postmark-exporter" -}}
+          {{- with secret "kv/data/atlas/shared/postmark-relay" -}}
           export POSTMARK_SERVER_TOKEN="{{ index .Data.data "apikey" }}"
           export POSTMARK_SERVER_TOKEN_FALLBACK="{{ index .Data.data "apikey" }}"
           {{- if index .Data.data "sending-limit" }}

From e9597660f9cf66a37bc8718db321ff77524e6b32 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 19 Jan 2026 23:45:48 -0300
Subject: [PATCH 017/416] chore(maintenance): bump ariadne image tag

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 9255d889..35af46f3 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -25,7 +25,7 @@ resources:
 
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-2 # {"$imagepolicy": "maintenance:ariadne"}
+    newTag: 0.1.0-3 # {"$imagepolicy": "maintenance:ariadne"}
 
 configMapGenerator:
   - name: disable-k3s-traefik-script

From d406a12b4ab0e0985244053b0b1108c1d2cc757e Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 19 Jan 2026 23:47:24 -0300
Subject: [PATCH 018/416] chore(portal): rerun onboarding e2e

---
 services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
index 9dbe68df..535b1dc2 100644
--- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
+++ b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: portal-onboarding-e2e-test-23
+  name: portal-onboarding-e2e-test-24
   namespace: bstein-dev-home
 spec:
   backoffLimit: 0

From 5eae50ca4ccac7ec4fe3c4f8dab7012f7d8f7329 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 19 Jan 2026 23:51:55 -0300
Subject: [PATCH 019/416] fix(mailu): pin sync workloads to arm64

---
 services/mailu/mailu-sync-cronjob.yaml  | 3 +++
 services/mailu/mailu-sync-listener.yaml | 5 ++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/services/mailu/mailu-sync-cronjob.yaml b/services/mailu/mailu-sync-cronjob.yaml
index 671439d5..bbe9909e 100644
--- a/services/mailu/mailu-sync-cronjob.yaml
+++ b/services/mailu/mailu-sync-cronjob.yaml
@@ -38,6 +38,9 @@ spec:
               {{- with secret "kv/data/atlas/mailu/mailu-initial-account-secret" -}}{{ .Data.data.password }}{{- end -}}
         spec:
           restartPolicy: OnFailure
+          nodeSelector:
+            kubernetes.io/arch: arm64
+            node-role.kubernetes.io/worker: "true"
           serviceAccountName: mailu-vault-sync
           containers:
             - name: mailu-sync
diff --git a/services/mailu/mailu-sync-listener.yaml b/services/mailu/mailu-sync-listener.yaml
index b3d2acce..0644c5bb 100644
--- a/services/mailu/mailu-sync-listener.yaml
+++ b/services/mailu/mailu-sync-listener.yaml
@@ -30,7 +30,7 @@ spec:
         app: mailu-sync-listener
       annotations:
         vault.hashicorp.com/agent-inject: "true"
-        atlas.bstein.dev/mailu-sync-rev: "3"
+        atlas.bstein.dev/mailu-sync-rev: "4"
         vault.hashicorp.com/role: "mailu-mailserver"
         vault.hashicorp.com/agent-inject-secret-mailu-db-secret__database: "kv/data/atlas/mailu/mailu-db-secret"
         vault.hashicorp.com/agent-inject-template-mailu-db-secret__database: |
@@ -52,6 +52,9 @@ spec:
           {{- with secret "kv/data/atlas/mailu/mailu-initial-account-secret" -}}{{ .Data.data.password }}{{- end -}}
     spec:
       restartPolicy: Always
+      nodeSelector:
+        kubernetes.io/arch: arm64
+        node-role.kubernetes.io/worker: "true"
       serviceAccountName: mailu-vault-sync
       containers:
         - name: listener

From ba6b97b92a8d39a019895a239fd8e011dc71cb0e Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 19 Jan 2026 23:58:37 -0300
Subject: [PATCH 020/416] chore(portal): rerun onboarding e2e

---
 services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
index 535b1dc2..505e1817 100644
--- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
+++ b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: portal-onboarding-e2e-test-24
+  name: portal-onboarding-e2e-test-25
   namespace: bstein-dev-home
 spec:
   backoffLimit: 0

From 6b5a77b32e17a6c938684d45a09f0fa86eeab98a Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 00:07:45 -0300
Subject: [PATCH 021/416] chore(maintenance): bump ariadne image tag

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 35af46f3..80c61dfe 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -25,7 +25,7 @@ resources:
 
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-3 # {"$imagepolicy": "maintenance:ariadne"}
+    newTag: 0.1.0-4 # {"$imagepolicy": "maintenance:ariadne"}
 
 configMapGenerator:
   - name: disable-k3s-traefik-script

From 3995b28aa3cfd45a7cfea88843aff8af3e7b111b Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 00:09:49 -0300
Subject: [PATCH 022/416] chore(portal): rerun onboarding e2e

---
 services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
index 505e1817..a0b6569e 100644
--- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
+++ b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: portal-onboarding-e2e-test-25
+  name: portal-onboarding-e2e-test-26
   namespace: bstein-dev-home
 spec:
   backoffLimit: 0

From cacb03b42f43bd764ae37cef50fc418b21910790 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 00:58:04 -0300
Subject: [PATCH 023/416] mailu: use postmark server token for relay

---
 services/mailu/helmrelease.yaml | 60 ++++++++++++++++++++++++++-------
 1 file changed, 48 insertions(+), 12 deletions(-)

diff --git a/services/mailu/helmrelease.yaml b/services/mailu/helmrelease.yaml
index 7342141a..9779aed8 100644
--- a/services/mailu/helmrelease.yaml
+++ b/services/mailu/helmrelease.yaml
@@ -335,8 +335,14 @@ spec:
                         export INITIAL_ADMIN_PW="{{ .Data.data.password }}"
                         {{ end }}
                         {{ with secret "kv/data/atlas/shared/postmark-relay" }}
-                        export RELAYUSER="{{ index .Data.data "apikey" }}"
-                        export RELAYPASSWORD="{{ index .Data.data "apikey" }}"
+                        {{- $apikey := index .Data.data "apikey" -}}
+                        {{- if $apikey }}
+                        export RELAYUSER="{{ $apikey }}"
+                        export RELAYPASSWORD="{{ $apikey }}"
+                        {{- else }}
+                        export RELAYUSER="{{ index .Data.data "accesskey" }}"
+                        export RELAYPASSWORD="{{ index .Data.data "secretkey" }}"
+                        {{- end }}
                         {{ end }}
                   spec:
                     serviceAccountName: mailu-vault-sync
@@ -397,8 +403,14 @@ spec:
                         export INITIAL_ADMIN_PW="{{ .Data.data.password }}"
                         {{ end }}
                         {{ with secret "kv/data/atlas/shared/postmark-relay" }}
-                        export RELAYUSER="{{ index .Data.data "apikey" }}"
-                        export RELAYPASSWORD="{{ index .Data.data "apikey" }}"
+                        {{- $apikey := index .Data.data "apikey" -}}
+                        {{- if $apikey }}
+                        export RELAYUSER="{{ $apikey }}"
+                        export RELAYPASSWORD="{{ $apikey }}"
+                        {{- else }}
+                        export RELAYUSER="{{ index .Data.data "accesskey" }}"
+                        export RELAYPASSWORD="{{ index .Data.data "secretkey" }}"
+                        {{- end }}
                         {{ end }}
                   spec:
                     serviceAccountName: mailu-vault-sync
@@ -459,8 +471,14 @@ spec:
                         export INITIAL_ADMIN_PW="{{ .Data.data.password }}"
                         {{ end }}
                         {{ with secret "kv/data/atlas/shared/postmark-relay" }}
-                        export RELAYUSER="{{ index .Data.data "apikey" }}"
-                        export RELAYPASSWORD="{{ index .Data.data "apikey" }}"
+                        {{- $apikey := index .Data.data "apikey" -}}
+                        {{- if $apikey }}
+                        export RELAYUSER="{{ $apikey }}"
+                        export RELAYPASSWORD="{{ $apikey }}"
+                        {{- else }}
+                        export RELAYUSER="{{ index .Data.data "accesskey" }}"
+                        export RELAYPASSWORD="{{ index .Data.data "secretkey" }}"
+                        {{- end }}
                         {{ end }}
                   spec:
                     serviceAccountName: mailu-vault-sync
@@ -521,8 +539,14 @@ spec:
                         export INITIAL_ADMIN_PW="{{ .Data.data.password }}"
                         {{ end }}
                         {{ with secret "kv/data/atlas/shared/postmark-relay" }}
-                        export RELAYUSER="{{ index .Data.data "apikey" }}"
-                        export RELAYPASSWORD="{{ index .Data.data "apikey" }}"
+                        {{- $apikey := index .Data.data "apikey" -}}
+                        {{- if $apikey }}
+                        export RELAYUSER="{{ $apikey }}"
+                        export RELAYPASSWORD="{{ $apikey }}"
+                        {{- else }}
+                        export RELAYUSER="{{ index .Data.data "accesskey" }}"
+                        export RELAYPASSWORD="{{ index .Data.data "secretkey" }}"
+                        {{- end }}
                         {{ end }}
                   spec:
                     serviceAccountName: mailu-vault-sync
@@ -583,8 +607,14 @@ spec:
                         export INITIAL_ADMIN_PW="{{ .Data.data.password }}"
                         {{ end }}
                         {{ with secret "kv/data/atlas/shared/postmark-relay" }}
-                        export RELAYUSER="{{ index .Data.data "apikey" }}"
-                        export RELAYPASSWORD="{{ index .Data.data "apikey" }}"
+                        {{- $apikey := index .Data.data "apikey" -}}
+                        {{- if $apikey }}
+                        export RELAYUSER="{{ $apikey }}"
+                        export RELAYPASSWORD="{{ $apikey }}"
+                        {{- else }}
+                        export RELAYUSER="{{ index .Data.data "accesskey" }}"
+                        export RELAYPASSWORD="{{ index .Data.data "secretkey" }}"
+                        {{- end }}
                         {{ end }}
                   spec:
                     serviceAccountName: mailu-vault-sync
@@ -645,8 +675,14 @@ spec:
                         export INITIAL_ADMIN_PW="{{ .Data.data.password }}"
                         {{ end }}
                         {{ with secret "kv/data/atlas/shared/postmark-relay" }}
-                        export RELAYUSER="{{ index .Data.data "apikey" }}"
-                        export RELAYPASSWORD="{{ index .Data.data "apikey" }}"
+                        {{- $apikey := index .Data.data "apikey" -}}
+                        {{- if $apikey }}
+                        export RELAYUSER="{{ $apikey }}"
+                        export RELAYPASSWORD="{{ $apikey }}"
+                        {{- else }}
+                        export RELAYUSER="{{ index .Data.data "accesskey" }}"
+                        export RELAYPASSWORD="{{ index .Data.data "secretkey" }}"
+                        {{- end }}
                         {{ end }}
                   spec:
                     serviceAccountName: mailu-vault-sync

From 6157ebd98b973d2ddb929f6aa4f78588027558c5 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 01:04:04 -0300
Subject: [PATCH 024/416] mailu: prefer postmark smtp token for relay

---
 services/mailu/helmrelease.yaml | 78 ++++++++++++++++++---------------
 1 file changed, 42 insertions(+), 36 deletions(-)

diff --git a/services/mailu/helmrelease.yaml b/services/mailu/helmrelease.yaml
index 9779aed8..4621a2d8 100644
--- a/services/mailu/helmrelease.yaml
+++ b/services/mailu/helmrelease.yaml
@@ -335,13 +335,14 @@ spec:
                         export INITIAL_ADMIN_PW="{{ .Data.data.password }}"
                         {{ end }}
                         {{ with secret "kv/data/atlas/shared/postmark-relay" }}
-                        {{- $apikey := index .Data.data "apikey" -}}
-                        {{- if $apikey }}
-                        export RELAYUSER="{{ $apikey }}"
-                        export RELAYPASSWORD="{{ $apikey }}"
+                        {{- $access := index .Data.data "accesskey" -}}
+                        {{- $secret := index .Data.data "secretkey" -}}
+                        {{- if and $access $secret }}
+                        export RELAYUSER="{{ $access }}"
+                        export RELAYPASSWORD="{{ $secret }}"
                         {{- else }}
-                        export RELAYUSER="{{ index .Data.data "accesskey" }}"
-                        export RELAYPASSWORD="{{ index .Data.data "secretkey" }}"
+                        export RELAYUSER="{{ index .Data.data "apikey" }}"
+                        export RELAYPASSWORD="{{ index .Data.data "apikey" }}"
                         {{- end }}
                         {{ end }}
                   spec:
@@ -403,13 +404,14 @@ spec:
                         export INITIAL_ADMIN_PW="{{ .Data.data.password }}"
                         {{ end }}
                         {{ with secret "kv/data/atlas/shared/postmark-relay" }}
-                        {{- $apikey := index .Data.data "apikey" -}}
-                        {{- if $apikey }}
-                        export RELAYUSER="{{ $apikey }}"
-                        export RELAYPASSWORD="{{ $apikey }}"
+                        {{- $access := index .Data.data "accesskey" -}}
+                        {{- $secret := index .Data.data "secretkey" -}}
+                        {{- if and $access $secret }}
+                        export RELAYUSER="{{ $access }}"
+                        export RELAYPASSWORD="{{ $secret }}"
                         {{- else }}
-                        export RELAYUSER="{{ index .Data.data "accesskey" }}"
-                        export RELAYPASSWORD="{{ index .Data.data "secretkey" }}"
+                        export RELAYUSER="{{ index .Data.data "apikey" }}"
+                        export RELAYPASSWORD="{{ index .Data.data "apikey" }}"
                         {{- end }}
                         {{ end }}
                   spec:
@@ -471,13 +473,14 @@ spec:
                         export INITIAL_ADMIN_PW="{{ .Data.data.password }}"
                         {{ end }}
                         {{ with secret "kv/data/atlas/shared/postmark-relay" }}
-                        {{- $apikey := index .Data.data "apikey" -}}
-                        {{- if $apikey }}
-                        export RELAYUSER="{{ $apikey }}"
-                        export RELAYPASSWORD="{{ $apikey }}"
+                        {{- $access := index .Data.data "accesskey" -}}
+                        {{- $secret := index .Data.data "secretkey" -}}
+                        {{- if and $access $secret }}
+                        export RELAYUSER="{{ $access }}"
+                        export RELAYPASSWORD="{{ $secret }}"
                         {{- else }}
-                        export RELAYUSER="{{ index .Data.data "accesskey" }}"
-                        export RELAYPASSWORD="{{ index .Data.data "secretkey" }}"
+                        export RELAYUSER="{{ index .Data.data "apikey" }}"
+                        export RELAYPASSWORD="{{ index .Data.data "apikey" }}"
                         {{- end }}
                         {{ end }}
                   spec:
@@ -539,13 +542,14 @@ spec:
                         export INITIAL_ADMIN_PW="{{ .Data.data.password }}"
                         {{ end }}
                         {{ with secret "kv/data/atlas/shared/postmark-relay" }}
-                        {{- $apikey := index .Data.data "apikey" -}}
-                        {{- if $apikey }}
-                        export RELAYUSER="{{ $apikey }}"
-                        export RELAYPASSWORD="{{ $apikey }}"
+                        {{- $access := index .Data.data "accesskey" -}}
+                        {{- $secret := index .Data.data "secretkey" -}}
+                        {{- if and $access $secret }}
+                        export RELAYUSER="{{ $access }}"
+                        export RELAYPASSWORD="{{ $secret }}"
                         {{- else }}
-                        export RELAYUSER="{{ index .Data.data "accesskey" }}"
-                        export RELAYPASSWORD="{{ index .Data.data "secretkey" }}"
+                        export RELAYUSER="{{ index .Data.data "apikey" }}"
+                        export RELAYPASSWORD="{{ index .Data.data "apikey" }}"
                         {{- end }}
                         {{ end }}
                   spec:
@@ -607,13 +611,14 @@ spec:
                         export INITIAL_ADMIN_PW="{{ .Data.data.password }}"
                         {{ end }}
                         {{ with secret "kv/data/atlas/shared/postmark-relay" }}
-                        {{- $apikey := index .Data.data "apikey" -}}
-                        {{- if $apikey }}
-                        export RELAYUSER="{{ $apikey }}"
-                        export RELAYPASSWORD="{{ $apikey }}"
+                        {{- $access := index .Data.data "accesskey" -}}
+                        {{- $secret := index .Data.data "secretkey" -}}
+                        {{- if and $access $secret }}
+                        export RELAYUSER="{{ $access }}"
+                        export RELAYPASSWORD="{{ $secret }}"
                         {{- else }}
-                        export RELAYUSER="{{ index .Data.data "accesskey" }}"
-                        export RELAYPASSWORD="{{ index .Data.data "secretkey" }}"
+                        export RELAYUSER="{{ index .Data.data "apikey" }}"
+                        export RELAYPASSWORD="{{ index .Data.data "apikey" }}"
                         {{- end }}
                         {{ end }}
                   spec:
@@ -675,13 +680,14 @@ spec:
                         export INITIAL_ADMIN_PW="{{ .Data.data.password }}"
                         {{ end }}
                         {{ with secret "kv/data/atlas/shared/postmark-relay" }}
-                        {{- $apikey := index .Data.data "apikey" -}}
-                        {{- if $apikey }}
-                        export RELAYUSER="{{ $apikey }}"
-                        export RELAYPASSWORD="{{ $apikey }}"
+                        {{- $access := index .Data.data "accesskey" -}}
+                        {{- $secret := index .Data.data "secretkey" -}}
+                        {{- if and $access $secret }}
+                        export RELAYUSER="{{ $access }}"
+                        export RELAYPASSWORD="{{ $secret }}"
                         {{- else }}
-                        export RELAYUSER="{{ index .Data.data "accesskey" }}"
-                        export RELAYPASSWORD="{{ index .Data.data "secretkey" }}"
+                        export RELAYUSER="{{ index .Data.data "apikey" }}"
+                        export RELAYPASSWORD="{{ index .Data.data "apikey" }}"
                         {{- end }}
                         {{ end }}
                   spec:

From 8c77d1569de5f50f0730ff0247f0c420446a2f5e Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 01:05:06 -0300
Subject: [PATCH 025/416] ci: pin quality gate agents to rpi5

---
 ci/Jenkinsfile.titan-iac | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/ci/Jenkinsfile.titan-iac b/ci/Jenkinsfile.titan-iac
index 3b13eb08..359dc94f 100644
--- a/ci/Jenkinsfile.titan-iac
+++ b/ci/Jenkinsfile.titan-iac
@@ -6,6 +6,10 @@ pipeline {
 apiVersion: v1
 kind: Pod
 spec:
+  nodeSelector:
+    hardware: rpi5
+    kubernetes.io/arch: arm64
+    node-role.kubernetes.io/worker: "true"
   containers:
     - name: python
       image: python:3.12-slim

From 4285c378a8368059c9c2222db0823347a7cabf46 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 01:07:01 -0300
Subject: [PATCH 026/416] mailu: recreate postfix on upgrade

---
 services/mailu/helmrelease.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/services/mailu/helmrelease.yaml b/services/mailu/helmrelease.yaml
index 4621a2d8..e84b3760 100644
--- a/services/mailu/helmrelease.yaml
+++ b/services/mailu/helmrelease.yaml
@@ -455,6 +455,8 @@ spec:
               metadata:
                 name: mailu-postfix
               spec:
+                strategy:
+                  type: Recreate
                 template:
                   metadata:
                     annotations:

From e30afabdf05143059bf9e32911fe45955dbb52b7 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 01:08:51 -0300
Subject: [PATCH 027/416] jenkins: re-target quality gate and restart

---
 services/jenkins/configmap-jcasc.yaml | 2 +-
 services/jenkins/deployment.yaml      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml
index 25dd748d..0a25aa17 100644
--- a/services/jenkins/configmap-jcasc.yaml
+++ b/services/jenkins/configmap-jcasc.yaml
@@ -75,7 +75,7 @@ data:
                       url('https://scm.bstein.dev/bstein/titan-iac.git')
                       credentials('gitea-pat')
                     }
-                    branches('*/main')
+                    branches('*/feature/ariadne')
                   }
                 }
                 scriptPath('services/jellyfin/oidc/Jenkinsfile')
diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml
index e846a8ef..0e99caba 100644
--- a/services/jenkins/deployment.yaml
+++ b/services/jenkins/deployment.yaml
@@ -38,7 +38,7 @@ spec:
           GITEA_PAT_USERNAME={{ .Data.data.username }}
           GITEA_PAT_TOKEN={{ .Data.data.token }}
           {{- end -}}
-        bstein.dev/restarted-at: "2026-01-19T00:25:00Z"
+        bstein.dev/restarted-at: "2026-01-20T04:08:33Z"
     spec:
       serviceAccountName: jenkins
       nodeSelector:

From 7d44006423c8956de297624a5cc86272dad17b91 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 01:14:30 -0300
Subject: [PATCH 028/416] jenkins: align quality gate branch

---
 services/jenkins/configmap-jcasc.yaml | 4 ++--
 services/jenkins/deployment.yaml      | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml
index 0a25aa17..fcd01f90 100644
--- a/services/jenkins/configmap-jcasc.yaml
+++ b/services/jenkins/configmap-jcasc.yaml
@@ -75,7 +75,7 @@ data:
                       url('https://scm.bstein.dev/bstein/titan-iac.git')
                       credentials('gitea-pat')
                     }
-                    branches('*/feature/ariadne')
+                    branches('*/main')
                   }
                 }
                 scriptPath('services/jellyfin/oidc/Jenkinsfile')
@@ -151,7 +151,7 @@ data:
                       url('https://scm.bstein.dev/bstein/titan-iac.git')
                       credentials('gitea-pat')
                     }
-                    branches('*/main')
+                    branches('*/feature/ariadne')
                   }
                 }
                 scriptPath('ci/Jenkinsfile.titan-iac')
diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml
index 0e99caba..dfbe5feb 100644
--- a/services/jenkins/deployment.yaml
+++ b/services/jenkins/deployment.yaml
@@ -38,7 +38,7 @@ spec:
           GITEA_PAT_USERNAME={{ .Data.data.username }}
           GITEA_PAT_TOKEN={{ .Data.data.token }}
           {{- end -}}
-        bstein.dev/restarted-at: "2026-01-20T04:08:33Z"
+        bstein.dev/restarted-at: "2026-01-20T04:14:13Z"
     spec:
       serviceAccountName: jenkins
       nodeSelector:

From 29d0a376da0bb56b24ca1aac352b080a7087fe77 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 01:20:16 -0300
Subject: [PATCH 029/416] portal: rerun onboarding e2e job

---
 services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
index a0b6569e..681e89d2 100644
--- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
+++ b/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: portal-onboarding-e2e-test-26
+  name: portal-onboarding-e2e-test-27
   namespace: bstein-dev-home
 spec:
   backoffLimit: 0

From 6911e99e32badc4029be21eb2626b3907c8f9beb Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 02:30:44 -0300
Subject: [PATCH 030/416] mailu: rewrite double-bounce to base domain

---
 services/mailu/helmrelease.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/services/mailu/helmrelease.yaml b/services/mailu/helmrelease.yaml
index e84b3760..7eab19ac 100644
--- a/services/mailu/helmrelease.yaml
+++ b/services/mailu/helmrelease.yaml
@@ -219,6 +219,8 @@ spec:
       overrides:
         postfix.cf: |
           mynetworks = 127.0.0.0/8 [::1]/128 10.42.0.0/16 10.43.0.0/16 192.168.22.0/24
+          recipient_canonical_maps = regexp:/overrides/recipient_canonical, ${podop}recipientmap
+          recipient_canonical_classes = envelope_recipient,header_recipient
           smtpd_delay_reject = yes
           smtpd_helo_required = yes
           smtpd_helo_restrictions = reject_invalid_helo_hostname, reject_non_fqdn_helo_hostname, reject_unknown_helo_hostname
@@ -238,6 +240,8 @@ spec:
           smtpd_client_message_rate_limit = 100
           smtpd_client_recipient_rate_limit = 200
           smtpd_recipient_limit = 100
+        recipient_canonical: |
+          /^double-bounce@mail\.bstein\.dev$/ double-bounce@bstein.dev
       podAnnotations:
         bstein.dev/restarted-at: "2026-01-06T00:00:00Z"
     redis:

From 980daa683bdba94a0242b7e80c5757e37f65c1db Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 02:32:43 -0300
Subject: [PATCH 031/416] mailu: restart postfix to load canonical map

---
 services/mailu/helmrelease.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/mailu/helmrelease.yaml b/services/mailu/helmrelease.yaml
index 7eab19ac..599faf13 100644
--- a/services/mailu/helmrelease.yaml
+++ b/services/mailu/helmrelease.yaml
@@ -243,7 +243,7 @@ spec:
         recipient_canonical: |
           /^double-bounce@mail\.bstein\.dev$/ double-bounce@bstein.dev
       podAnnotations:
-        bstein.dev/restarted-at: "2026-01-06T00:00:00Z"
+        bstein.dev/restarted-at: "2026-01-20T04:20:00Z"
     redis:
       enabled: true
       architecture: standalone

From 2cbecde47831dd4b8aa2f48125dafb94d1cdb548 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 02:37:02 -0300
Subject: [PATCH 032/416] mailu: keep podop socketmap in canonical maps

---
 services/mailu/helmrelease.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/mailu/helmrelease.yaml b/services/mailu/helmrelease.yaml
index 599faf13..9d8519bc 100644
--- a/services/mailu/helmrelease.yaml
+++ b/services/mailu/helmrelease.yaml
@@ -219,7 +219,7 @@ spec:
       overrides:
         postfix.cf: |
           mynetworks = 127.0.0.0/8 [::1]/128 10.42.0.0/16 10.43.0.0/16 192.168.22.0/24
-          recipient_canonical_maps = regexp:/overrides/recipient_canonical, ${podop}recipientmap
+          recipient_canonical_maps = regexp:/overrides/recipient_canonical, socketmap:unix:/tmp/podop.socket:recipientmap
           recipient_canonical_classes = envelope_recipient,header_recipient
           smtpd_delay_reject = yes
           smtpd_helo_required = yes

From 55c9993d084ff9fdb1a2bc78cd8d0ed7a6178a1d Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 02:38:04 -0300
Subject: [PATCH 033/416] mailu: restart postfix after canonical map update

---
 services/mailu/helmrelease.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/mailu/helmrelease.yaml b/services/mailu/helmrelease.yaml
index 9d8519bc..2a7e6f5f 100644
--- a/services/mailu/helmrelease.yaml
+++ b/services/mailu/helmrelease.yaml
@@ -243,7 +243,7 @@ spec:
         recipient_canonical: |
           /^double-bounce@mail\.bstein\.dev$/ double-bounce@bstein.dev
       podAnnotations:
-        bstein.dev/restarted-at: "2026-01-20T04:20:00Z"
+        bstein.dev/restarted-at: "2026-01-20T04:35:00Z"
     redis:
       enabled: true
       architecture: standalone

From f8c368b21f124f0b3e75921a86810367f78add84 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 03:01:59 -0300
Subject: [PATCH 034/416] maintenance: extend Ariadne schedules and RBAC

---
 services/comms/guest-name-job.yaml           |  2 +-
 services/maintenance/ariadne-deployment.yaml | 34 ++++++++++++++++++++
 services/maintenance/ariadne-rbac.yaml       | 29 +++++++++++++++++
 services/maintenance/kustomization.yaml      |  1 +
 services/vault/k8s-auth-config-cronjob.yaml  |  1 +
 services/vault/oidc-config-cronjob.yaml      |  1 +
 6 files changed, 67 insertions(+), 1 deletion(-)
 create mode 100644 services/maintenance/ariadne-rbac.yaml

diff --git a/services/comms/guest-name-job.yaml b/services/comms/guest-name-job.yaml
index 21a8af5f..3eae2dd2 100644
--- a/services/comms/guest-name-job.yaml
+++ b/services/comms/guest-name-job.yaml
@@ -8,7 +8,7 @@ metadata:
     atlas.bstein.dev/glue: "true"
 spec:
   schedule: "*/1 * * * *"
-  suspend: false
+  suspend: true
   concurrencyPolicy: Forbid
   successfulJobsHistoryLimit: 1
   failedJobsHistoryLimit: 1
diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml
index 0543f80f..cd0d38c7 100644
--- a/services/maintenance/ariadne-deployment.yaml
+++ b/services/maintenance/ariadne-deployment.yaml
@@ -78,6 +78,8 @@ spec:
               value: bstein-dev-home-admin
             - name: PORTAL_PUBLIC_BASE_URL
               value: https://bstein.dev
+            - name: ARIADNE_LOG_LEVEL
+              value: INFO
             - name: PORTAL_ADMIN_USERS
               value: bstein
             - name: PORTAL_ADMIN_GROUPS
@@ -120,6 +122,26 @@ spec:
               value: firefly-user-sync
             - name: FIREFLY_USER_SYNC_WAIT_TIMEOUT_SEC
               value: "90"
+            - name: VAULT_NAMESPACE
+              value: vault
+            - name: VAULT_K8S_AUTH_CRONJOB
+              value: vault-k8s-auth-config
+            - name: VAULT_OIDC_CRONJOB
+              value: vault-oidc-config
+            - name: VAULT_JOB_WAIT_TIMEOUT_SEC
+              value: "120"
+            - name: COMMS_NAMESPACE
+              value: comms
+            - name: COMMS_GUEST_NAME_CRONJOB
+              value: guest-name-randomizer
+            - name: COMMS_PIN_INVITE_CRONJOB
+              value: pin-othrys-invite
+            - name: COMMS_RESET_ROOM_CRONJOB
+              value: othrys-room-reset
+            - name: COMMS_SEED_ROOM_CRONJOB
+              value: seed-othrys-room
+            - name: COMMS_JOB_WAIT_TIMEOUT_SEC
+              value: "60"
             - name: VAULTWARDEN_NAMESPACE
               value: vaultwarden
             - name: VAULTWARDEN_POD_LABEL
@@ -154,6 +176,18 @@ spec:
               value: "*/15 * * * *"
             - name: ARIADNE_SCHEDULE_WGER_ADMIN
               value: "15 3 * * *"
+            - name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH
+              value: "*/15 * * * *"
+            - name: ARIADNE_SCHEDULE_VAULT_OIDC
+              value: "*/15 * * * *"
+            - name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME
+              value: "*/1 * * * *"
+            - name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE
+              value: "*/30 * * * *"
+            - name: ARIADNE_SCHEDULE_COMMS_RESET_ROOM
+              value: "0 0 1 1 *"
+            - name: ARIADNE_SCHEDULE_COMMS_SEED_ROOM
+              value: "*/10 * * * *"
             - name: WELCOME_EMAIL_ENABLED
               value: "true"
             - name: K8S_API_TIMEOUT_SEC
diff --git a/services/maintenance/ariadne-rbac.yaml b/services/maintenance/ariadne-rbac.yaml
new file mode 100644
index 00000000..8d2a2a9a
--- /dev/null
+++ b/services/maintenance/ariadne-rbac.yaml
@@ -0,0 +1,29 @@
+# services/maintenance/ariadne-rbac.yaml
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole  # cluster-scoped; bound below with a ClusterRoleBinding, so it applies in every namespace
+metadata:
+  name: ariadne-job-spawner
+rules:
+  - apiGroups: ["batch"]
+    resources:
+      - cronjobs
+      - jobs
+    verbs:
+      - get
+      - list
+      - watch
+      - create  # NOTE(review): one rule, so create is granted on cronjobs too; presumably only jobs need it - confirm
+
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: ariadne-job-spawner
+subjects:
+  - kind: ServiceAccount
+    name: ariadne  # the only subject: the ariadne ServiceAccount in the maintenance namespace
+    namespace: maintenance
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: ariadne-job-spawner
diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 80c61dfe..0810f5e7 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -8,6 +8,7 @@ resources:
   - vault-serviceaccount.yaml
   - vault-sync-deployment.yaml
   - ariadne-serviceaccount.yaml
+  - ariadne-rbac.yaml
   - disable-k3s-traefik-serviceaccount.yaml
   - k3s-traefik-cleanup-rbac.yaml
   - node-nofile-serviceaccount.yaml
diff --git a/services/vault/k8s-auth-config-cronjob.yaml b/services/vault/k8s-auth-config-cronjob.yaml
index 29e8e809..e7cca14e 100644
--- a/services/vault/k8s-auth-config-cronjob.yaml
+++ b/services/vault/k8s-auth-config-cronjob.yaml
@@ -8,6 +8,7 @@ metadata:
     atlas.bstein.dev/glue: "true"
 spec:
   schedule: "*/15 * * * *"
+  suspend: true
   concurrencyPolicy: Forbid
   successfulJobsHistoryLimit: 1
   failedJobsHistoryLimit: 3
diff --git a/services/vault/oidc-config-cronjob.yaml b/services/vault/oidc-config-cronjob.yaml
index 013c9f32..4d317c55 100644
--- a/services/vault/oidc-config-cronjob.yaml
+++ b/services/vault/oidc-config-cronjob.yaml
@@ -8,6 +8,7 @@ metadata:
     atlas.bstein.dev/glue: "true"
 spec:
   schedule: "*/15 * * * *"
+  suspend: true
   concurrencyPolicy: Forbid
   successfulJobsHistoryLimit: 1
   failedJobsHistoryLimit: 3

From cf20c27cedd4fdf793faad7fdf83756a3c5f6540 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 03:21:36 -0300
Subject: [PATCH 035/416] ci(jenkins): add multibranch quality gate

---
 ci/Jenkinsfile.titan-iac                | 23 +++++++++++++++--
 services/jenkins/configmap-jcasc.yaml   | 33 ++++++++++++++++---------
 services/jenkins/configmap-plugins.yaml |  1 +
 services/maintenance/kustomization.yaml |  2 +-
 4 files changed, 44 insertions(+), 15 deletions(-)

diff --git a/ci/Jenkinsfile.titan-iac b/ci/Jenkinsfile.titan-iac
index 359dc94f..77990d77 100644
--- a/ci/Jenkinsfile.titan-iac
+++ b/ci/Jenkinsfile.titan-iac
@@ -22,7 +22,6 @@ spec:
   environment {
     PIP_DISABLE_PIP_VERSION_CHECK = '1'
     PYTHONUNBUFFERED = '1'
-    DEPLOY_BRANCH = 'deploy'
   }
   stages {
     stage('Checkout') {
@@ -40,7 +39,27 @@ spec:
         sh 'pytest -q ci/tests/glue'
       }
     }
+    stage('Resolve Flux branch') {
+      steps {
+        script { // FLUX_BRANCH = first `branch:` value in gotk-sync.yaml; gates and targets the Promote stage
+          env.FLUX_BRANCH = sh( // "\$2" is escaped: a bare $2 in a double-quoted Groovy string fails to parse
+            returnStdout: true,
+            script: "awk '/branch:/{print \$2; exit}' clusters/atlas/flux-system/gotk-sync.yaml"
+          ).trim()
+          if (!env.FLUX_BRANCH) { // an empty result would otherwise make Promote push to "HEAD:"
+            error('Flux branch not found in gotk-sync.yaml')
+          }
+          echo "Flux branch: ${env.FLUX_BRANCH}"
+        }
+      }
+    }
     stage('Promote') {
+      when {
+        expression {
+          def branch = env.BRANCH_NAME ?: (env.GIT_BRANCH ?: '').replaceFirst('origin/', '')
+          return env.FLUX_BRANCH && branch == env.FLUX_BRANCH
+        }
+      }
       steps {
         withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) {
           sh '''
@@ -48,7 +67,7 @@ spec:
             git config user.email "jenkins@bstein.dev"
             git config user.name "jenkins"
             git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git
-            git push origin HEAD:${DEPLOY_BRANCH}
+            git push origin HEAD:${FLUX_BRANCH}
           '''
         }
       }
diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml
index fcd01f90..62012f1c 100644
--- a/services/jenkins/configmap-jcasc.yaml
+++ b/services/jenkins/configmap-jcasc.yaml
@@ -139,24 +139,33 @@ data:
               }
             }
           }
-          pipelineJob('titan-iac-quality-gate') {
-            triggers {
-              scm('H/5 * * * *')
-            }
-            definition {
-              cpsScm {
-                scm {
+          multibranchPipelineJob('titan-iac-quality-gate') {
+            branchSources {
+              branchSource {
+                source {
                   git {
-                    remote {
-                      url('https://scm.bstein.dev/bstein/titan-iac.git')
-                      credentials('gitea-pat')
-                    }
-                    branches('*/feature/ariadne')
+                    id('titan-iac-quality-gate')
+                    remote('https://scm.bstein.dev/bstein/titan-iac.git')
+                    credentialsId('gitea-pat')
                   }
                 }
+              }
+            }
+            factory {
+              workflowBranchProjectFactory {
                 scriptPath('ci/Jenkinsfile.titan-iac')
               }
             }
+            orphanedItemStrategy {
+              discardOldItems {
+                numToKeep(30)
+              }
+            }
+            triggers {
+              periodicFolderTrigger {
+                interval('12h')
+              }
+            }
           }
   base.yaml: |
     jenkins:
diff --git a/services/jenkins/configmap-plugins.yaml b/services/jenkins/configmap-plugins.yaml
index eabea13b..108c6461 100644
--- a/services/jenkins/configmap-plugins.yaml
+++ b/services/jenkins/configmap-plugins.yaml
@@ -9,6 +9,7 @@ data:
     kubernetes
     workflow-aggregator
     git
+    git-branch-source
     pipeline-utility-steps
     configuration-as-code
     configuration-as-code-support
diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 0810f5e7..b7fe46b5 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,7 +26,7 @@ resources:
 
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-4 # {"$imagepolicy": "maintenance:ariadne"}
+    newTag: 0.1.0-5 # {"$imagepolicy": "maintenance:ariadne"}
 
 configMapGenerator:
   - name: disable-k3s-traefik-script

From 652ab18e82989075af046fd7c4832ad10fc800b9 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 03:30:48 -0300
Subject: [PATCH 036/416] ci(jenkins): add Ariadne pipeline job

---
 services/jenkins/configmap-jcasc.yaml | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml
index 62012f1c..78d98fea 100644
--- a/services/jenkins/configmap-jcasc.yaml
+++ b/services/jenkins/configmap-jcasc.yaml
@@ -120,6 +120,25 @@ data:
               }
             }
           }
+          pipelineJob('ariadne') {
+            triggers {
+              scm('H/2 * * * *')
+            }
+            definition {
+              cpsScm {
+                scm {
+                  git {
+                    remote {
+                      url('https://scm.bstein.dev/bstein/ariadne.git')
+                      credentials('gitea-pat')
+                    }
+                    branches('*/master')
+                  }
+                }
+                scriptPath('Jenkinsfile')
+              }
+            }
+          }
           pipelineJob('data-prepper') {
             triggers {
               scm('H/5 * * * *')

From 5690376b72af6dd6c816c5ee57e792457680078c Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 03:59:19 -0300
Subject: [PATCH 037/416] glue: preserve keycloak profile updates

---
 services/mailu/scripts/mailu_sync.py    | 32 ++++++++++++++++++++++++-
 services/maintenance/kustomization.yaml |  2 +-
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/services/mailu/scripts/mailu_sync.py b/services/mailu/scripts/mailu_sync.py
index 001917ab..71b0f5a2 100644
--- a/services/mailu/scripts/mailu_sync.py
+++ b/services/mailu/scripts/mailu_sync.py
@@ -130,7 +130,9 @@ def kc_update_attributes(token, user, attributes):
     if not isinstance(current_attrs, dict):
         current_attrs = {}
     current_attrs.update(attributes)
-    resp = SESSION.put(user_url, headers=headers, json={"attributes": current_attrs}, timeout=20)
+    payload = _safe_update_payload(current_payload)
+    payload["attributes"] = current_attrs
+    resp = SESSION.put(user_url, headers=headers, json=payload, timeout=20)
     resp.raise_for_status()
     verify = SESSION.get(
         user_url,
@@ -144,6 +146,34 @@ def kc_update_attributes(token, user, attributes):
         raise Exception(f"attribute not persisted for {user.get('email') or user['username']}")
 
 
+def _safe_update_payload(user_payload: dict) -> dict:
+    """Return the subset of a Keycloak user payload that is sent back on update.
+
+    Scalar profile fields are copied only when they have the expected type;
+    ``requiredActions`` keeps just its string entries, and ``attributes``
+    falls back to an empty dict when it is missing or not a dict.
+    """
+    expected_types = (
+        ("username", str),
+        ("enabled", bool),
+        ("email", str),
+        ("emailVerified", bool),
+        ("firstName", str),
+        ("lastName", str),
+    )
+    payload: dict = {
+        key: user_payload[key]
+        for key, kind in expected_types
+        if isinstance(user_payload.get(key), kind)
+    }
+    required = user_payload.get("requiredActions")
+    if isinstance(required, list):
+        payload["requiredActions"] = [item for item in required if isinstance(item, str)]
+    raw_attrs = user_payload.get("attributes")
+    payload["attributes"] = raw_attrs if isinstance(raw_attrs, dict) else {}
+    return payload
+
+
 def random_password():
     alphabet = string.ascii_letters + string.digits
     return "".join(secrets.choice(alphabet) for _ in range(24))
diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index b7fe46b5..a86453e1 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,7 +26,7 @@ resources:
 
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-5 # {"$imagepolicy": "maintenance:ariadne"}
+    newTag: 0.1.0-6 # {"$imagepolicy": "maintenance:ariadne"}
 
 configMapGenerator:
   - name: disable-k3s-traefik-script

From f0855b7a3f9f3abefae9dc18e0d1ef584202dcfd Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 09:06:39 -0300
Subject: [PATCH 038/416] gitea: allow jenkins webhook

---
 services/gitea/deployment.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/services/gitea/deployment.yaml b/services/gitea/deployment.yaml
index 9dc0c878..da188c35 100644
--- a/services/gitea/deployment.yaml
+++ b/services/gitea/deployment.yaml
@@ -169,6 +169,8 @@ spec:
               value: "trace"
             - name: GITEA__service__REQUIRE_SIGNIN_VIEW
               value: "false"
+            - name: GITEA__webhook__ALLOWED_HOST_LIST
+              value: "ci.bstein.dev"
             - name: GITEA__server__PROXY_HEADERS
               value: "X-Forwarded-For, X-Forwarded-Proto, X-Forwarded-Host"
             - name: GITEA__session__COOKIE_SECURE

From d21dc989f69729e869da9002ccb3113577c8423b Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 09:37:21 -0300
Subject: [PATCH 039/416] jenkins: pin root url for OIDC

---
 services/jenkins/configmap-jcasc.yaml | 5 ++++-
 services/jenkins/deployment.yaml      | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml
index 78d98fea..d4a29f1e 100644
--- a/services/jenkins/configmap-jcasc.yaml
+++ b/services/jenkins/configmap-jcasc.yaml
@@ -18,7 +18,7 @@ data:
           logoutFromOpenIdProvider: true
           postLogoutRedirectUrl: "https://ci.bstein.dev"
           sendScopesInTokenRequest: true
-          rootURLFromRequest: true
+          rootURLFromRequest: false
           userNameField: "preferred_username"
           fullNameFieldName: "name"
           emailFieldName: "email"
@@ -245,3 +245,6 @@ data:
       crumbIssuer:
         standard:
           excludeClientIPFromCrumb: true
+    unclassified:
+      location:
+        url: "https://ci.bstein.dev/"
diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml
index dfbe5feb..fdb8d107 100644
--- a/services/jenkins/deployment.yaml
+++ b/services/jenkins/deployment.yaml
@@ -38,7 +38,7 @@ spec:
           GITEA_PAT_USERNAME={{ .Data.data.username }}
           GITEA_PAT_TOKEN={{ .Data.data.token }}
           {{- end -}}
-        bstein.dev/restarted-at: "2026-01-20T04:14:13Z"
+        bstein.dev/restarted-at: "2026-01-20T05:05:00Z"
     spec:
       serviceAccountName: jenkins
       nodeSelector:

From eb23881f6496fd81ff3a5900b53ae2333a77ee9e Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 09:45:33 -0300
Subject: [PATCH 040/416] jenkins: drop removed multibranch plugin

---
 services/jenkins/configmap-jcasc.yaml   | 33 +++++++++----------------
 services/jenkins/configmap-plugins.yaml |  2 --
 2 files changed, 12 insertions(+), 23 deletions(-)

diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml
index d4a29f1e..9e116c0e 100644
--- a/services/jenkins/configmap-jcasc.yaml
+++ b/services/jenkins/configmap-jcasc.yaml
@@ -158,33 +158,24 @@ data:
               }
             }
           }
-          multibranchPipelineJob('titan-iac-quality-gate') {
-            branchSources {
-              branchSource {
-                source {
+          pipelineJob('titan-iac-quality-gate') {
+            triggers {
+              scm('H/12 * * * *')
+            }
+            definition {
+              cpsScm {
+                scm {
                   git {
-                    id('titan-iac-quality-gate')
-                    remote('https://scm.bstein.dev/bstein/titan-iac.git')
-                    credentialsId('gitea-pat')
+                    remote {
+                      url('https://scm.bstein.dev/bstein/titan-iac.git')
+                      credentials('gitea-pat')
+                    }
+                    branches('*/main')
                   }
                 }
-              }
-            }
-            factory {
-              workflowBranchProjectFactory {
                 scriptPath('ci/Jenkinsfile.titan-iac')
               }
             }
-            orphanedItemStrategy {
-              discardOldItems {
-                numToKeep(30)
-              }
-            }
-            triggers {
-              periodicFolderTrigger {
-                interval('12h')
-              }
-            }
           }
   base.yaml: |
     jenkins:
diff --git a/services/jenkins/configmap-plugins.yaml b/services/jenkins/configmap-plugins.yaml
index 108c6461..d20a2839 100644
--- a/services/jenkins/configmap-plugins.yaml
+++ b/services/jenkins/configmap-plugins.yaml
@@ -9,10 +9,8 @@ data:
     kubernetes
     workflow-aggregator
     git
-    git-branch-source
     pipeline-utility-steps
     configuration-as-code
-    configuration-as-code-support
     oic-auth
     job-dsl
     simple-theme-plugin

From 2aa4bd1fe18efc9e9c3108ace5e434c403f8217d Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 10:15:33 -0300
Subject: [PATCH 041/416] jenkins: restore multibranch + webhook token

---
 services/jenkins/configmap-jcasc.yaml   | 40 +++++++++++++++++--------
 services/jenkins/configmap-plugins.yaml | 22 +++++++++-----
 services/jenkins/deployment.yaml        | 17 ++++++-----
 3 files changed, 52 insertions(+), 27 deletions(-)

diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml
index 9e116c0e..ca3a7228 100644
--- a/services/jenkins/configmap-jcasc.yaml
+++ b/services/jenkins/configmap-jcasc.yaml
@@ -158,24 +158,40 @@ data:
               }
             }
           }
-          pipelineJob('titan-iac-quality-gate') {
-            triggers {
-              scm('H/12 * * * *')
-            }
-            definition {
-              cpsScm {
-                scm {
+          multibranchPipelineJob('titan-iac-quality-gate') {
+            branchSources {
+              branchSource {
+                source {
                   git {
-                    remote {
-                      url('https://scm.bstein.dev/bstein/titan-iac.git')
-                      credentials('gitea-pat')
-                    }
-                    branches('*/main')
+                    id('titan-iac-quality-gate')
+                    remote('https://scm.bstein.dev/bstein/titan-iac.git')
+                    credentialsId('gitea-pat')
                   }
                 }
+              }
+            }
+            factory {
+              workflowBranchProjectFactory {
                 scriptPath('ci/Jenkinsfile.titan-iac')
               }
             }
+            orphanedItemStrategy {
+              discardOldItems {
+                numToKeep(30)
+              }
+            }
+            triggers {
+              periodicFolderTrigger {
+                interval('12h')
+              }
+            }
+            configure { node ->
+              def token = System.getenv('TITAN_IAC_WEBHOOK_TOKEN') ?: ''
+              def triggers = node / 'triggers'
+              triggers << 'com.igalg.jenkins.plugins.mswt.trigger.ComputedFolderWebHookTrigger' {
+                token(token)
+              }
+            }
           }
   base.yaml: |
     jenkins:
diff --git a/services/jenkins/configmap-plugins.yaml b/services/jenkins/configmap-plugins.yaml
index d20a2839..35295126 100644
--- a/services/jenkins/configmap-plugins.yaml
+++ b/services/jenkins/configmap-plugins.yaml
@@ -6,11 +6,17 @@ metadata:
   namespace: jenkins
 data:
   plugins.txt: |
-    kubernetes
-    workflow-aggregator
-    git
-    pipeline-utility-steps
-    configuration-as-code
-    oic-auth
-    job-dsl
-    simple-theme-plugin
+    kubernetes:4416.v2ea_b_5372da_a_e
+    workflow-aggregator:608.v67378e9d3db_1
+    git:5.8.1
+    pipeline-utility-steps:2.20.0
+    configuration-as-code:2031.veb_a_fdda_b_3ffd
+    oic-auth:4.626.ve5a_d9f26c051
+    job-dsl:1.93
+    simple-theme-plugin:230.v8b_fd91b_b_800c
+    workflow-multibranch:821.vc3b_4ea_780798
+    branch-api:2.1268.v044a_87612da_8
+    scm-api:724.v7d839074eb_5c
+    gitea:268.v75e47974c01d
+    gitea-checks:603.621.vc708da_fb_371d
+    multibranch-scan-webhook-trigger:1.0.11
diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml
index fdb8d107..c82a6af9 100644
--- a/services/jenkins/deployment.yaml
+++ b/services/jenkins/deployment.yaml
@@ -22,23 +22,26 @@ spec:
         vault.hashicorp.com/role: "jenkins"
         vault.hashicorp.com/agent-inject-secret-jenkins-env: "kv/data/atlas/jenkins/jenkins-oidc"
         vault.hashicorp.com/agent-inject-template-jenkins-env: |
-          {{- with secret "kv/data/atlas/jenkins/jenkins-oidc" -}}
+          {{ with secret "kv/data/atlas/jenkins/jenkins-oidc" }}
           OIDC_CLIENT_ID={{ .Data.data.clientId }}
           OIDC_CLIENT_SECRET={{ .Data.data.clientSecret }}
           OIDC_AUTH_URL={{ .Data.data.authorizationUrl }}
           OIDC_TOKEN_URL={{ .Data.data.tokenUrl }}
           OIDC_USERINFO_URL={{ .Data.data.userInfoUrl }}
           OIDC_LOGOUT_URL={{ .Data.data.logoutUrl }}
-          {{- end }}
-          {{- with secret "kv/data/atlas/jenkins/harbor-robot-creds" -}}
+          {{ end }}
+          {{ with secret "kv/data/atlas/jenkins/harbor-robot-creds" }}
           HARBOR_ROBOT_USERNAME={{ .Data.data.username }}
           HARBOR_ROBOT_PASSWORD={{ .Data.data.password }}
-          {{- end }}
-          {{- with secret "kv/data/atlas/jenkins/gitea-pat" -}}
+          {{ end }}
+          {{ with secret "kv/data/atlas/jenkins/gitea-pat" }}
           GITEA_PAT_USERNAME={{ .Data.data.username }}
           GITEA_PAT_TOKEN={{ .Data.data.token }}
-          {{- end -}}
-        bstein.dev/restarted-at: "2026-01-20T05:05:00Z"
+          {{ end }}
+          {{ with secret "kv/data/atlas/jenkins/webhook-tokens" }}
+          TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }}
+          {{ end }}
+        bstein.dev/restarted-at: "2026-01-20T13:10:00Z"
     spec:
       serviceAccountName: jenkins
       nodeSelector:

From 6d83204c9c5e212849634fd01c57f3b2ab8d6473 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 10:23:08 -0300
Subject: [PATCH 042/416] jenkins: pin oic-auth for core 2.528.3

---
 services/jenkins/configmap-plugins.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/jenkins/configmap-plugins.yaml b/services/jenkins/configmap-plugins.yaml
index 35295126..1c43cfb2 100644
--- a/services/jenkins/configmap-plugins.yaml
+++ b/services/jenkins/configmap-plugins.yaml
@@ -11,7 +11,7 @@ data:
     git:5.8.1
     pipeline-utility-steps:2.20.0
     configuration-as-code:2031.veb_a_fdda_b_3ffd
-    oic-auth:4.626.ve5a_d9f26c051
+    oic-auth:4.609.v9de140f63d01
     job-dsl:1.93
     simple-theme-plugin:230.v8b_fd91b_b_800c
     workflow-multibranch:821.vc3b_4ea_780798

From 7c9f7da361e3175dd59382c393d0b42ed5c799d7 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 10:31:30 -0300
Subject: [PATCH 043/416] jenkins: fix webhook trigger DSL

---
 services/jenkins/configmap-jcasc.yaml | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml
index ca3a7228..7e6df319 100644
--- a/services/jenkins/configmap-jcasc.yaml
+++ b/services/jenkins/configmap-jcasc.yaml
@@ -186,11 +186,10 @@ data:
               }
             }
             configure { node ->
-              def token = System.getenv('TITAN_IAC_WEBHOOK_TOKEN') ?: ''
+              def webhookToken = System.getenv('TITAN_IAC_WEBHOOK_TOKEN') ?: ''
               def triggers = node / 'triggers'
-              triggers << 'com.igalg.jenkins.plugins.mswt.trigger.ComputedFolderWebHookTrigger' {
-                token(token)
-              }
+              def webhook = triggers.appendNode('com.igalg.jenkins.plugins.mswt.trigger.ComputedFolderWebHookTrigger')
+              webhook.appendNode('token', webhookToken)
             }
           }
   base.yaml: |

From a7bc174db1844bd4eef8e7ebc8f74f570c0bef8d Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 10:37:57 -0300
Subject: [PATCH 044/416] jenkins: clean legacy quality-gate job

---
 services/jenkins/deployment.yaml            |  2 +-
 services/jenkins/kustomization.yaml         |  1 +
 services/jenkins/scripts/job_cleanup.groovy | 13 +++++++++++++
 3 files changed, 15 insertions(+), 1 deletion(-)
 create mode 100644 services/jenkins/scripts/job_cleanup.groovy

diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml
index c82a6af9..c71812ae 100644
--- a/services/jenkins/deployment.yaml
+++ b/services/jenkins/deployment.yaml
@@ -41,7 +41,7 @@ spec:
           {{ with secret "kv/data/atlas/jenkins/webhook-tokens" }}
           TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }}
           {{ end }}
-        bstein.dev/restarted-at: "2026-01-20T13:10:00Z"
+        bstein.dev/restarted-at: "2026-01-20T13:45:00Z"
     spec:
       serviceAccountName: jenkins
       nodeSelector:
diff --git a/services/jenkins/kustomization.yaml b/services/jenkins/kustomization.yaml
index acb6fb43..987e842b 100644
--- a/services/jenkins/kustomization.yaml
+++ b/services/jenkins/kustomization.yaml
@@ -16,6 +16,7 @@ configMapGenerator:
   - name: jenkins-init-scripts
     namespace: jenkins
     files:
+      - job_cleanup.groovy=scripts/job_cleanup.groovy
       - theme.groovy=scripts/theme.groovy
     options:
       disableNameSuffixHash: true
diff --git a/services/jenkins/scripts/job_cleanup.groovy b/services/jenkins/scripts/job_cleanup.groovy
new file mode 100644
index 00000000..f123c6b7
--- /dev/null
+++ b/services/jenkins/scripts/job_cleanup.groovy
@@ -0,0 +1,13 @@
+import jenkins.branch.MultiBranchProject
+import jenkins.model.Jenkins
+
+def jenkins = Jenkins.instance
+if (jenkins == null) {
+  return
+}
+
+def legacy = jenkins.getItemByFullName('titan-iac-quality-gate')
+if (legacy != null && !(legacy instanceof MultiBranchProject)) {
+  legacy.delete()
+  println("Deleted legacy job titan-iac-quality-gate (non-multibranch)")
+}

From ee5a4aedac58cbf44a37dbdac2c77c9dec4005b5 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 10:59:51 -0300
Subject: [PATCH 045/416] jenkins: drop legacy cleanup and update triggers

---
 services/jenkins/configmap-jcasc.yaml       | 40 +++++++++++++++------
 services/jenkins/deployment.yaml            |  2 +-
 services/jenkins/kustomization.yaml         |  1 -
 services/jenkins/scripts/job_cleanup.groovy | 13 -------
 4 files changed, 31 insertions(+), 25 deletions(-)
 delete mode 100644 services/jenkins/scripts/job_cleanup.groovy

diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml
index 7e6df319..ba0ac810 100644
--- a/services/jenkins/configmap-jcasc.yaml
+++ b/services/jenkins/configmap-jcasc.yaml
@@ -49,8 +49,12 @@ data:
     jobs:
       - script: |
           pipelineJob('harbor-arm-build') {
-            triggers {
-              scm('H/5 * * * *')
+            properties {
+              pipelineTriggers {
+                triggers {
+                  scm('H/5 * * * *')
+                }
+              }
             }
             definition {
               cpsScm {
@@ -83,8 +87,12 @@ data:
             }
           }
           pipelineJob('ci-demo') {
-            triggers {
-              scm('H/1 * * * *')
+            properties {
+              pipelineTriggers {
+                triggers {
+                  scm('H/1 * * * *')
+                }
+              }
             }
             definition {
               cpsScm {
@@ -102,8 +110,12 @@ data:
             }
           }
           pipelineJob('bstein-dev-home') {
-            triggers {
-              scm('H/2 * * * *')
+            properties {
+              pipelineTriggers {
+                triggers {
+                  scm('H/2 * * * *')
+                }
+              }
             }
             definition {
               cpsScm {
@@ -121,8 +133,12 @@ data:
             }
           }
           pipelineJob('ariadne') {
-            triggers {
-              scm('H/2 * * * *')
+            properties {
+              pipelineTriggers {
+                triggers {
+                  scm('H/2 * * * *')
+                }
+              }
             }
             definition {
               cpsScm {
@@ -140,8 +156,12 @@ data:
             }
           }
           pipelineJob('data-prepper') {
-            triggers {
-              scm('H/5 * * * *')
+            properties {
+              pipelineTriggers {
+                triggers {
+                  scm('H/5 * * * *')
+                }
+              }
             }
             definition {
               cpsScm {
diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml
index c71812ae..9e83686e 100644
--- a/services/jenkins/deployment.yaml
+++ b/services/jenkins/deployment.yaml
@@ -41,7 +41,7 @@ spec:
           {{ with secret "kv/data/atlas/jenkins/webhook-tokens" }}
           TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }}
           {{ end }}
-        bstein.dev/restarted-at: "2026-01-20T13:45:00Z"
+        bstein.dev/restarted-at: "2026-01-20T14:05:00Z"
     spec:
       serviceAccountName: jenkins
       nodeSelector:
diff --git a/services/jenkins/kustomization.yaml b/services/jenkins/kustomization.yaml
index 987e842b..acb6fb43 100644
--- a/services/jenkins/kustomization.yaml
+++ b/services/jenkins/kustomization.yaml
@@ -16,7 +16,6 @@ configMapGenerator:
   - name: jenkins-init-scripts
     namespace: jenkins
     files:
-      - job_cleanup.groovy=scripts/job_cleanup.groovy
       - theme.groovy=scripts/theme.groovy
     options:
       disableNameSuffixHash: true
diff --git a/services/jenkins/scripts/job_cleanup.groovy b/services/jenkins/scripts/job_cleanup.groovy
deleted file mode 100644
index f123c6b7..00000000
--- a/services/jenkins/scripts/job_cleanup.groovy
+++ /dev/null
@@ -1,13 +0,0 @@
-import jenkins.branch.MultiBranchProject
-import jenkins.model.Jenkins
-
-def jenkins = Jenkins.instance
-if (jenkins == null) {
-  return
-}
-
-def legacy = jenkins.getItemByFullName('titan-iac-quality-gate')
-if (legacy != null && !(legacy instanceof MultiBranchProject)) {
-  legacy.delete()
-  println("Deleted legacy job titan-iac-quality-gate (non-multibranch)")
-}

From 33c329a494a4d356d01b99c5ea48f26d5ab1820c Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 11:07:54 -0300
Subject: [PATCH 046/416] jenkins: use pollSCM for pipeline triggers

---
 services/jenkins/configmap-jcasc.yaml | 10 +++++-----
 services/jenkins/deployment.yaml      |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml
index ba0ac810..71826ff0 100644
--- a/services/jenkins/configmap-jcasc.yaml
+++ b/services/jenkins/configmap-jcasc.yaml
@@ -52,7 +52,7 @@ data:
             properties {
               pipelineTriggers {
                 triggers {
-                  scm('H/5 * * * *')
+                  pollSCM('H/5 * * * *')
                 }
               }
             }
@@ -90,7 +90,7 @@ data:
             properties {
               pipelineTriggers {
                 triggers {
-                  scm('H/1 * * * *')
+                  pollSCM('H/1 * * * *')
                 }
               }
             }
@@ -113,7 +113,7 @@ data:
             properties {
               pipelineTriggers {
                 triggers {
-                  scm('H/2 * * * *')
+                  pollSCM('H/2 * * * *')
                 }
               }
             }
@@ -136,7 +136,7 @@ data:
             properties {
               pipelineTriggers {
                 triggers {
-                  scm('H/2 * * * *')
+                  pollSCM('H/2 * * * *')
                 }
               }
             }
@@ -159,7 +159,7 @@ data:
             properties {
               pipelineTriggers {
                 triggers {
-                  scm('H/5 * * * *')
+                  pollSCM('H/5 * * * *')
                 }
               }
             }
diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml
index 9e83686e..cab36211 100644
--- a/services/jenkins/deployment.yaml
+++ b/services/jenkins/deployment.yaml
@@ -41,7 +41,7 @@ spec:
           {{ with secret "kv/data/atlas/jenkins/webhook-tokens" }}
           TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }}
           {{ end }}
-        bstein.dev/restarted-at: "2026-01-20T14:05:00Z"
+        bstein.dev/restarted-at: "2026-01-20T14:15:00Z"
     spec:
       serviceAccountName: jenkins
       nodeSelector:

From dfe2faef5c09fa5ae86e8195d28646b507cad0a9 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 11:14:29 -0300
Subject: [PATCH 047/416] jenkins: use scmTrigger for pipeline polls

---
 services/jenkins/configmap-jcasc.yaml | 25 ++++++++++++++++++++-----
 services/jenkins/deployment.yaml      |  2 +-
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml
index 71826ff0..aa279e91 100644
--- a/services/jenkins/configmap-jcasc.yaml
+++ b/services/jenkins/configmap-jcasc.yaml
@@ -52,7 +52,10 @@ data:
             properties {
               pipelineTriggers {
                 triggers {
-                  pollSCM('H/5 * * * *')
+                  scmTrigger {
+                    spec('H/5 * * * *')
+                    ignorePostCommitHooks(false)
+                  }
                 }
               }
             }
@@ -90,7 +93,10 @@ data:
             properties {
               pipelineTriggers {
                 triggers {
-                  pollSCM('H/1 * * * *')
+                  scmTrigger {
+                    spec('H/1 * * * *')
+                    ignorePostCommitHooks(false)
+                  }
                 }
               }
             }
@@ -113,7 +119,10 @@ data:
             properties {
               pipelineTriggers {
                 triggers {
-                  pollSCM('H/2 * * * *')
+                  scmTrigger {
+                    spec('H/2 * * * *')
+                    ignorePostCommitHooks(false)
+                  }
                 }
               }
             }
@@ -136,7 +145,10 @@ data:
             properties {
               pipelineTriggers {
                 triggers {
-                  pollSCM('H/2 * * * *')
+                  scmTrigger {
+                    spec('H/2 * * * *')
+                    ignorePostCommitHooks(false)
+                  }
                 }
               }
             }
@@ -159,7 +171,10 @@ data:
             properties {
               pipelineTriggers {
                 triggers {
-                  pollSCM('H/5 * * * *')
+                  scmTrigger {
+                    spec('H/5 * * * *')
+                    ignorePostCommitHooks(false)
+                  }
                 }
               }
             }
diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml
index cab36211..7706807b 100644
--- a/services/jenkins/deployment.yaml
+++ b/services/jenkins/deployment.yaml
@@ -41,7 +41,7 @@ spec:
           {{ with secret "kv/data/atlas/jenkins/webhook-tokens" }}
           TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }}
           {{ end }}
-        bstein.dev/restarted-at: "2026-01-20T14:15:00Z"
+        bstein.dev/restarted-at: "2026-01-20T14:25:00Z"
     spec:
       serviceAccountName: jenkins
       nodeSelector:

From a0ff159cabaa204fec41ff09d2305e9b43740a9c Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 11:23:06 -0300
Subject: [PATCH 048/416] jenkins: fix scmTrigger spec field

---
 services/jenkins/configmap-jcasc.yaml | 10 +++++-----
 services/jenkins/deployment.yaml      |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml
index aa279e91..e29c1436 100644
--- a/services/jenkins/configmap-jcasc.yaml
+++ b/services/jenkins/configmap-jcasc.yaml
@@ -53,7 +53,7 @@ data:
               pipelineTriggers {
                 triggers {
                   scmTrigger {
-                    spec('H/5 * * * *')
+                    scmpoll_spec('H/5 * * * *')
                     ignorePostCommitHooks(false)
                   }
                 }
@@ -94,7 +94,7 @@ data:
               pipelineTriggers {
                 triggers {
                   scmTrigger {
-                    spec('H/1 * * * *')
+                    scmpoll_spec('H/1 * * * *')
                     ignorePostCommitHooks(false)
                   }
                 }
@@ -120,7 +120,7 @@ data:
               pipelineTriggers {
                 triggers {
                   scmTrigger {
-                    spec('H/2 * * * *')
+                    scmpoll_spec('H/2 * * * *')
                     ignorePostCommitHooks(false)
                   }
                 }
@@ -146,7 +146,7 @@ data:
               pipelineTriggers {
                 triggers {
                   scmTrigger {
-                    spec('H/2 * * * *')
+                    scmpoll_spec('H/2 * * * *')
                     ignorePostCommitHooks(false)
                   }
                 }
@@ -172,7 +172,7 @@ data:
               pipelineTriggers {
                 triggers {
                   scmTrigger {
-                    spec('H/5 * * * *')
+                    scmpoll_spec('H/5 * * * *')
                     ignorePostCommitHooks(false)
                   }
                 }
diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml
index 7706807b..44925798 100644
--- a/services/jenkins/deployment.yaml
+++ b/services/jenkins/deployment.yaml
@@ -41,7 +41,7 @@ spec:
           {{ with secret "kv/data/atlas/jenkins/webhook-tokens" }}
           TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }}
           {{ end }}
-        bstein.dev/restarted-at: "2026-01-20T14:25:00Z"
+        bstein.dev/restarted-at: "2026-01-20T14:35:00Z"
     spec:
       serviceAccountName: jenkins
       nodeSelector:

From e81eb57af3077232443af0cba16a30854f87512c Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 11:54:15 -0300
Subject: [PATCH 049/416] jenkins: automate notifyCommit token

---
 services/jenkins/deployment.yaml              |  3 +-
 services/jenkins/kustomization.yaml           |  1 +
 .../jenkins/scripts/git-notify-token.groovy   | 41 +++++++++++++++++++
 services/jenkins/scripts/theme.groovy         |  1 -
 4 files changed, 44 insertions(+), 2 deletions(-)
 create mode 100644 services/jenkins/scripts/git-notify-token.groovy

diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml
index 44925798..b5b3de63 100644
--- a/services/jenkins/deployment.yaml
+++ b/services/jenkins/deployment.yaml
@@ -40,8 +40,9 @@ spec:
           {{ end }}
           {{ with secret "kv/data/atlas/jenkins/webhook-tokens" }}
           TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }}
+          GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME={{ .Data.data.git_notify_bstein_dev_home }}
           {{ end }}
-        bstein.dev/restarted-at: "2026-01-20T14:35:00Z"
+        bstein.dev/restarted-at: "2026-01-20T14:52:41Z"
     spec:
       serviceAccountName: jenkins
       nodeSelector:
diff --git a/services/jenkins/kustomization.yaml b/services/jenkins/kustomization.yaml
index acb6fb43..0a03f5b5 100644
--- a/services/jenkins/kustomization.yaml
+++ b/services/jenkins/kustomization.yaml
@@ -16,6 +16,7 @@ configMapGenerator:
   - name: jenkins-init-scripts
     namespace: jenkins
     files:
+      - git-notify-token.groovy=scripts/git-notify-token.groovy
       - theme.groovy=scripts/theme.groovy
     options:
       disableNameSuffixHash: true
diff --git a/services/jenkins/scripts/git-notify-token.groovy b/services/jenkins/scripts/git-notify-token.groovy
new file mode 100644
index 00000000..336c918a
--- /dev/null
+++ b/services/jenkins/scripts/git-notify-token.groovy
@@ -0,0 +1,41 @@
+import hudson.plugins.git.ApiTokenPropertyConfiguration
+import hudson.Util
+import java.nio.charset.StandardCharsets
+import java.security.MessageDigest
+
+
+def entries = [
+  [env: 'GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME', name: 'gitea-bstein-dev-home'],
+]
+
+entries.each { entry ->
+  def token = System.getenv(entry.env)
+  if (!token || token.trim().isEmpty()) {
+    println("Git notifyCommit token ${entry.env} missing; skipping")
+    return
+  }
+
+  try {
+    def config = ApiTokenPropertyConfiguration.get()
+    if (config.hasMatchingApiToken(token)) {
+      println("Git notifyCommit token ${entry.name} already configured")
+      return
+    }
+
+    def digest = MessageDigest.getInstance("SHA-256")
+    def hash = Util.toHexString(digest.digest(token.getBytes(StandardCharsets.US_ASCII)))
+
+    def field = ApiTokenPropertyConfiguration.class.getDeclaredField("apiTokens")
+    field.setAccessible(true)
+    def tokens = field.get(config)
+
+    def ctor = ApiTokenPropertyConfiguration.HashedApiToken.class.getDeclaredConstructor(String.class, String.class)
+    ctor.setAccessible(true)
+    tokens.add(ctor.newInstance(entry.name, hash))
+    config.save()
+
+    println("Added git notifyCommit access token ${entry.name}")
+  } catch (Throwable e) {
+    println("Failed to configure git notifyCommit token ${entry.name}: ${e.class.simpleName}: ${e.message}")
+  }
+}
diff --git a/services/jenkins/scripts/theme.groovy b/services/jenkins/scripts/theme.groovy
index cf171f74..5950bf44 100644
--- a/services/jenkins/scripts/theme.groovy
+++ b/services/jenkins/scripts/theme.groovy
@@ -8,7 +8,6 @@ if (decorators?.size() > 0) {
   def theme = decorators[0]
   theme.setCssUrl("https://jenkins-contrib-themes.github.io/jenkins-material-theme/dist/material-ocean.css")
   theme.setJsUrl("")
-  theme.setTheme("")
   instance.save()
   println("Applied simple-theme-plugin dark theme")
 } else {

From f3b8b93287682b437b54b11646f536d2c9bf9b52 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 17:04:24 -0300
Subject: [PATCH 050/416] jenkins: move agent workspace off node disk

---
 services/jenkins/cache-pvc.yaml       | 13 +++++++++++++
 services/jenkins/configmap-jcasc.yaml |  5 +++++
 services/jenkins/deployment.yaml      |  6 ++++--
 services/jenkins/kustomization.yaml   |  2 ++
 services/jenkins/plugins-pvc.yaml     | 13 +++++++++++++
 5 files changed, 37 insertions(+), 2 deletions(-)
 create mode 100644 services/jenkins/cache-pvc.yaml
 create mode 100644 services/jenkins/plugins-pvc.yaml

diff --git a/services/jenkins/cache-pvc.yaml b/services/jenkins/cache-pvc.yaml
new file mode 100644
index 00000000..784c7d8d
--- /dev/null
+++ b/services/jenkins/cache-pvc.yaml
@@ -0,0 +1,13 @@
+# services/jenkins/cache-pvc.yaml
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: jenkins-cache
+  namespace: jenkins
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 5Gi
+  storageClassName: astreae
diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml
index e29c1436..f485de81 100644
--- a/services/jenkins/configmap-jcasc.yaml
+++ b/services/jenkins/configmap-jcasc.yaml
@@ -258,6 +258,11 @@ data:
           templates:
             - name: "default"
               namespace: "jenkins"
+              workspaceVolume:
+                dynamicPVC:
+                  accessModes: "ReadWriteOnce"
+                  requestsSize: "5Gi"
+                  storageClassName: "astreae"
               containers:
               - name: "jnlp"
                 args: "^${computer.jnlpmac} ^${computer.name}"
diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml
index b5b3de63..7ee1aad4 100644
--- a/services/jenkins/deployment.yaml
+++ b/services/jenkins/deployment.yaml
@@ -161,9 +161,11 @@ spec:
           persistentVolumeClaim:
             claimName: jenkins
         - name: jenkins-cache
-          emptyDir: {}
+          persistentVolumeClaim:
+            claimName: jenkins-cache
         - name: plugin-dir
-          emptyDir: {}
+          persistentVolumeClaim:
+            claimName: jenkins-plugins
         - name: plugins
           configMap:
             name: jenkins-plugins
diff --git a/services/jenkins/kustomization.yaml b/services/jenkins/kustomization.yaml
index 0a03f5b5..aab859ab 100644
--- a/services/jenkins/kustomization.yaml
+++ b/services/jenkins/kustomization.yaml
@@ -6,6 +6,8 @@ resources:
   - namespace.yaml
   - serviceaccount.yaml
   - pvc.yaml
+  - cache-pvc.yaml
+  - plugins-pvc.yaml
   - configmap-jcasc.yaml
   - configmap-plugins.yaml
   - deployment.yaml
diff --git a/services/jenkins/plugins-pvc.yaml b/services/jenkins/plugins-pvc.yaml
new file mode 100644
index 00000000..45a967bb
--- /dev/null
+++ b/services/jenkins/plugins-pvc.yaml
@@ -0,0 +1,13 @@
+# services/jenkins/plugins-pvc.yaml
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: jenkins-plugins
+  namespace: jenkins
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 2Gi
+  storageClassName: astreae

From 5e2370aeaac5f21ebdf4f4c90226ad579b225e67 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 17:09:23 -0300
Subject: [PATCH 051/416] jenkins: expand pvc sizes and move /tmp to memory

---
 services/jenkins/cache-pvc.yaml       | 2 +-
 services/jenkins/configmap-jcasc.yaml | 2 +-
 services/jenkins/deployment.yaml      | 3 ++-
 services/jenkins/plugins-pvc.yaml     | 2 +-
 4 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/services/jenkins/cache-pvc.yaml b/services/jenkins/cache-pvc.yaml
index 784c7d8d..75383059 100644
--- a/services/jenkins/cache-pvc.yaml
+++ b/services/jenkins/cache-pvc.yaml
@@ -9,5 +9,5 @@ spec:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 5Gi
+      storage: 50Gi
   storageClassName: astreae
diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml
index f485de81..5ee6a3e7 100644
--- a/services/jenkins/configmap-jcasc.yaml
+++ b/services/jenkins/configmap-jcasc.yaml
@@ -261,7 +261,7 @@ data:
               workspaceVolume:
                 dynamicPVC:
                   accessModes: "ReadWriteOnce"
-                  requestsSize: "5Gi"
+                  requestsSize: "50Gi"
                   storageClassName: "astreae"
               containers:
               - name: "jnlp"
diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml
index 7ee1aad4..5f50084e 100644
--- a/services/jenkins/deployment.yaml
+++ b/services/jenkins/deployment.yaml
@@ -176,4 +176,5 @@ spec:
           configMap:
             name: jenkins-init-scripts
         - name: tmp
-          emptyDir: {}
+          emptyDir:
+            medium: Memory
diff --git a/services/jenkins/plugins-pvc.yaml b/services/jenkins/plugins-pvc.yaml
index 45a967bb..2812c7a6 100644
--- a/services/jenkins/plugins-pvc.yaml
+++ b/services/jenkins/plugins-pvc.yaml
@@ -9,5 +9,5 @@ spec:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 2Gi
+      storage: 20Gi
   storageClassName: astreae

From 0850fd86ee821f2769fd9580687923842f358fdb Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 17:19:58 -0300
Subject: [PATCH 052/416] jenkins: right-size pvc requests

---
 services/jenkins/cache-pvc.yaml       | 2 +-
 services/jenkins/configmap-jcasc.yaml | 2 +-
 services/jenkins/plugins-pvc.yaml     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/services/jenkins/cache-pvc.yaml b/services/jenkins/cache-pvc.yaml
index 75383059..79e8decb 100644
--- a/services/jenkins/cache-pvc.yaml
+++ b/services/jenkins/cache-pvc.yaml
@@ -9,5 +9,5 @@ spec:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 50Gi
+      storage: 20Gi
   storageClassName: astreae
diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml
index 5ee6a3e7..c2144fa0 100644
--- a/services/jenkins/configmap-jcasc.yaml
+++ b/services/jenkins/configmap-jcasc.yaml
@@ -261,7 +261,7 @@ data:
               workspaceVolume:
                 dynamicPVC:
                   accessModes: "ReadWriteOnce"
-                  requestsSize: "50Gi"
+                  requestsSize: "20Gi"
                   storageClassName: "astreae"
               containers:
               - name: "jnlp"
diff --git a/services/jenkins/plugins-pvc.yaml b/services/jenkins/plugins-pvc.yaml
index 2812c7a6..e26d07fd 100644
--- a/services/jenkins/plugins-pvc.yaml
+++ b/services/jenkins/plugins-pvc.yaml
@@ -9,5 +9,5 @@ spec:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 20Gi
+      storage: 10Gi
   storageClassName: astreae

From fc240e34fe67a8c0ad90a8cd057e7f7309ffdbc9 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 17:21:42 -0300
Subject: [PATCH 053/416] jenkins: keep cache/plugin pvc sizes to avoid shrink

---
 services/jenkins/cache-pvc.yaml   | 2 +-
 services/jenkins/plugins-pvc.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/services/jenkins/cache-pvc.yaml b/services/jenkins/cache-pvc.yaml
index 79e8decb..75383059 100644
--- a/services/jenkins/cache-pvc.yaml
+++ b/services/jenkins/cache-pvc.yaml
@@ -9,5 +9,5 @@ spec:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 20Gi
+      storage: 50Gi
   storageClassName: astreae
diff --git a/services/jenkins/plugins-pvc.yaml b/services/jenkins/plugins-pvc.yaml
index e26d07fd..2812c7a6 100644
--- a/services/jenkins/plugins-pvc.yaml
+++ b/services/jenkins/plugins-pvc.yaml
@@ -9,5 +9,5 @@ spec:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 10Gi
+      storage: 20Gi
   storageClassName: astreae

From 3cd38a6c701e9897a0d6e602bced499ec9905199 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 17:32:27 -0300
Subject: [PATCH 054/416] jenkins: rotate cache/plugin pvcs

---
 services/jenkins/cache-pvc.yaml   | 4 ++--
 services/jenkins/deployment.yaml  | 4 ++--
 services/jenkins/plugins-pvc.yaml | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/services/jenkins/cache-pvc.yaml b/services/jenkins/cache-pvc.yaml
index 75383059..a9ed319f 100644
--- a/services/jenkins/cache-pvc.yaml
+++ b/services/jenkins/cache-pvc.yaml
@@ -2,12 +2,12 @@
 apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
-  name: jenkins-cache
+  name: jenkins-cache-v2
   namespace: jenkins
 spec:
   accessModes:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 50Gi
+      storage: 20Gi
   storageClassName: astreae
diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml
index 5f50084e..9f8fe99f 100644
--- a/services/jenkins/deployment.yaml
+++ b/services/jenkins/deployment.yaml
@@ -162,10 +162,10 @@ spec:
             claimName: jenkins
         - name: jenkins-cache
           persistentVolumeClaim:
-            claimName: jenkins-cache
+            claimName: jenkins-cache-v2
         - name: plugin-dir
           persistentVolumeClaim:
-            claimName: jenkins-plugins
+            claimName: jenkins-plugins-v2
         - name: plugins
           configMap:
             name: jenkins-plugins
diff --git a/services/jenkins/plugins-pvc.yaml b/services/jenkins/plugins-pvc.yaml
index 2812c7a6..06715eb4 100644
--- a/services/jenkins/plugins-pvc.yaml
+++ b/services/jenkins/plugins-pvc.yaml
@@ -2,12 +2,12 @@
 apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
-  name: jenkins-plugins
+  name: jenkins-plugins-v2
   namespace: jenkins
 spec:
   accessModes:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 20Gi
+      storage: 10Gi
   storageClassName: astreae

From 50dcded32f09cdff6a99cb924b99c0e0faccf059 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 17:43:23 -0300
Subject: [PATCH 055/416] jenkins: add local dark theme css

---
 services/jenkins/deployment.yaml           |  4 +
 services/jenkins/kustomization.yaml        |  1 +
 services/jenkins/scripts/jenkins-theme.css | 97 ++++++++++++++++++++++
 services/jenkins/scripts/theme.groovy      |  2 +-
 4 files changed, 103 insertions(+), 1 deletion(-)
 create mode 100644 services/jenkins/scripts/jenkins-theme.css

diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml
index 9f8fe99f..b69f134a 100644
--- a/services/jenkins/deployment.yaml
+++ b/services/jenkins/deployment.yaml
@@ -94,6 +94,7 @@ spec:
             - -c
             - |
               set -e
+              mkdir -p /var/jenkins_home/userContent
               exec env $(cat /vault/secrets/jenkins-env) /usr/bin/tini -- /usr/local/bin/jenkins.sh
           ports:
             - name: http
@@ -152,6 +153,9 @@ spec:
               mountPath: /config/jcasc
             - name: init-scripts
               mountPath: /usr/share/jenkins/ref/init.groovy.d
+            - name: init-scripts
+              mountPath: /var/jenkins_home/userContent/jenkins-theme.css
+              subPath: jenkins-theme.css
             - name: plugin-dir
               mountPath: /usr/share/jenkins/ref/plugins
             - name: tmp
diff --git a/services/jenkins/kustomization.yaml b/services/jenkins/kustomization.yaml
index aab859ab..444dd6de 100644
--- a/services/jenkins/kustomization.yaml
+++ b/services/jenkins/kustomization.yaml
@@ -18,6 +18,7 @@ configMapGenerator:
   - name: jenkins-init-scripts
     namespace: jenkins
     files:
+      - jenkins-theme.css=scripts/jenkins-theme.css
       - git-notify-token.groovy=scripts/git-notify-token.groovy
       - theme.groovy=scripts/theme.groovy
     options:
diff --git a/services/jenkins/scripts/jenkins-theme.css b/services/jenkins/scripts/jenkins-theme.css
new file mode 100644
index 00000000..56fe193f
--- /dev/null
+++ b/services/jenkins/scripts/jenkins-theme.css
@@ -0,0 +1,97 @@
+@import url("https://cdn.jsdelivr.net/gh/Jorg3Lucas/jenkins-modern-themes@main/dist/modern-blue-grey.css");
+
+:root {
+  --atlas-bg: #0f1216;
+  --atlas-surface: #171b21;
+  --atlas-surface-alt: #1f252d;
+  --atlas-border: #2b313b;
+  --atlas-text: #e6e9ef;
+  --atlas-text-muted: #b3bac6;
+  --atlas-link: #8fb7ff;
+}
+
+body,
+#page-body,
+#page-header,
+#header,
+#main-panel,
+#main-panel-content,
+#side-panel,
+.top-sticker-inner,
+.bottom-sticker-inner,
+#breadcrumbBar,
+#breadcrumbs {
+  background-color: var(--atlas-bg) !important;
+  color: var(--atlas-text) !important;
+}
+
+#side-panel .task-link,
+#breadcrumbs a,
+#breadcrumbs,
+#projectstatus th a,
+#projectstatus td,
+#projectstatus th {
+  color: var(--atlas-text-muted) !important;
+}
+
+a,
+a:visited,
+a:link {
+  color: var(--atlas-link) !important;
+}
+
+a:hover {
+  opacity: 0.85;
+}
+
+#main-panel,
+#main-panel-content,
+#description,
+.pane,
+table.pane {
+  background-color: var(--atlas-surface) !important;
+  color: var(--atlas-text) !important;
+}
+
+table.pane tr:nth-child(odd) td {
+  background-color: var(--atlas-surface) !important;
+}
+
+table.pane tr:nth-child(even) td,
+#projectstatus tr:hover td {
+  background-color: var(--atlas-surface-alt) !important;
+}
+
+input,
+select,
+textarea,
+#search-box {
+  background-color: var(--atlas-surface-alt) !important;
+  color: var(--atlas-text) !important;
+  border-color: var(--atlas-border) !important;
+}
+
+#header,
+#page-header {
+  background-color: #202734 !important;
+}
+
+#header .login,
+#page-header .login {
+  color: var(--atlas-text) !important;
+}
+
+#side-panel .task-link,
+#side-panel .task-link:visited,
+#side-panel .task-link:hover {
+  color: var(--atlas-text) !important;
+}
+
+#footer {
+  background-color: var(--atlas-bg) !important;
+  color: var(--atlas-text-muted) !important;
+}
+
+.jenkins_ver:after {
+  content: "atlas dark";
+}
diff --git a/services/jenkins/scripts/theme.groovy b/services/jenkins/scripts/theme.groovy
index 5950bf44..fd12474e 100644
--- a/services/jenkins/scripts/theme.groovy
+++ b/services/jenkins/scripts/theme.groovy
@@ -6,7 +6,7 @@ def decorators = instance.getExtensionList(SimpleThemeDecorator.class)
 
 if (decorators?.size() > 0) {
   def theme = decorators[0]
-  theme.setCssUrl("https://jenkins-contrib-themes.github.io/jenkins-material-theme/dist/material-ocean.css")
+  theme.setCssUrl("https://ci.bstein.dev/userContent/jenkins-theme.css")
   theme.setJsUrl("")
   instance.save()
   println("Applied simple-theme-plugin dark theme")

From fced9f5919a0fe477b24cbefce53d43aeac5fd94 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 17:54:47 -0300
Subject: [PATCH 056/416] jenkins: mount init scripts into home

---
 services/jenkins/deployment.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml
index b69f134a..7dff5afd 100644
--- a/services/jenkins/deployment.yaml
+++ b/services/jenkins/deployment.yaml
@@ -153,6 +153,8 @@ spec:
               mountPath: /config/jcasc
             - name: init-scripts
               mountPath: /usr/share/jenkins/ref/init.groovy.d
+            - name: init-scripts
+              mountPath: /var/jenkins_home/init.groovy.d
             - name: init-scripts
               mountPath: /var/jenkins_home/userContent/jenkins-theme.css
               subPath: jenkins-theme.css

From a01dbbd7df85233feddbec7d63d84939f65cf9d0 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 18:00:36 -0300
Subject: [PATCH 057/416] jenkins: inline dark theme css

---
 services/jenkins/deployment.yaml           |  4 -
 services/jenkins/kustomization.yaml        |  1 -
 services/jenkins/scripts/jenkins-theme.css | 97 ---------------------
 services/jenkins/scripts/theme.groovy      | 99 +++++++++++++++++++++-
 4 files changed, 98 insertions(+), 103 deletions(-)
 delete mode 100644 services/jenkins/scripts/jenkins-theme.css

diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml
index 7dff5afd..0b62ee09 100644
--- a/services/jenkins/deployment.yaml
+++ b/services/jenkins/deployment.yaml
@@ -94,7 +94,6 @@ spec:
             - -c
             - |
               set -e
-              mkdir -p /var/jenkins_home/userContent
               exec env $(cat /vault/secrets/jenkins-env) /usr/bin/tini -- /usr/local/bin/jenkins.sh
           ports:
             - name: http
@@ -155,9 +154,6 @@ spec:
               mountPath: /usr/share/jenkins/ref/init.groovy.d
             - name: init-scripts
               mountPath: /var/jenkins_home/init.groovy.d
-            - name: init-scripts
-              mountPath: /var/jenkins_home/userContent/jenkins-theme.css
-              subPath: jenkins-theme.css
             - name: plugin-dir
               mountPath: /usr/share/jenkins/ref/plugins
             - name: tmp
diff --git a/services/jenkins/kustomization.yaml b/services/jenkins/kustomization.yaml
index 444dd6de..aab859ab 100644
--- a/services/jenkins/kustomization.yaml
+++ b/services/jenkins/kustomization.yaml
@@ -18,7 +18,6 @@ configMapGenerator:
   - name: jenkins-init-scripts
     namespace: jenkins
     files:
-      - jenkins-theme.css=scripts/jenkins-theme.css
       - git-notify-token.groovy=scripts/git-notify-token.groovy
       - theme.groovy=scripts/theme.groovy
     options:
diff --git a/services/jenkins/scripts/jenkins-theme.css b/services/jenkins/scripts/jenkins-theme.css
deleted file mode 100644
index 56fe193f..00000000
--- a/services/jenkins/scripts/jenkins-theme.css
+++ /dev/null
@@ -1,97 +0,0 @@
-@import url("https://cdn.jsdelivr.net/gh/Jorg3Lucas/jenkins-modern-themes@main/dist/modern-blue-grey.css");
-
-:root {
-  --atlas-bg: #0f1216;
-  --atlas-surface: #171b21;
-  --atlas-surface-alt: #1f252d;
-  --atlas-border: #2b313b;
-  --atlas-text: #e6e9ef;
-  --atlas-text-muted: #b3bac6;
-  --atlas-link: #8fb7ff;
-}
-
-body,
-#page-body,
-#page-header,
-#header,
-#main-panel,
-#main-panel-content,
-#side-panel,
-.top-sticker-inner,
-.bottom-sticker-inner,
-#breadcrumbBar,
-#breadcrumbs {
-  background-color: var(--atlas-bg) !important;
-  color: var(--atlas-text) !important;
-}
-
-#side-panel .task-link,
-#breadcrumbs a,
-#breadcrumbs,
-#projectstatus th a,
-#projectstatus td,
-#projectstatus th {
-  color: var(--atlas-text-muted) !important;
-}
-
-a,
-a:visited,
-a:link {
-  color: var(--atlas-link) !important;
-}
-
-a:hover {
-  opacity: 0.85;
-}
-
-#main-panel,
-#main-panel-content,
-#description,
-.pane,
-table.pane {
-  background-color: var(--atlas-surface) !important;
-  color: var(--atlas-text) !important;
-}
-
-table.pane tr:nth-child(odd) td {
-  background-color: var(--atlas-surface) !important;
-}
-
-table.pane tr:nth-child(even) td,
-#projectstatus tr:hover td {
-  background-color: var(--atlas-surface-alt) !important;
-}
-
-input,
-select,
-textarea,
-#search-box {
-  background-color: var(--atlas-surface-alt) !important;
-  color: var(--atlas-text) !important;
-  border-color: var(--atlas-border) !important;
-}
-
-#header,
-#page-header {
-  background-color: #202734 !important;
-}
-
-#header .login,
-#page-header .login {
-  color: var(--atlas-text) !important;
-}
-
-#side-panel .task-link,
-#side-panel .task-link:visited,
-#side-panel .task-link:hover {
-  color: var(--atlas-text) !important;
-}
-
-#footer {
-  background-color: var(--atlas-bg) !important;
-  color: var(--atlas-text-muted) !important;
-}
-
-.jenkins_ver:after {
-  content: "atlas dark";
-}
diff --git a/services/jenkins/scripts/theme.groovy b/services/jenkins/scripts/theme.groovy
index fd12474e..b20169cb 100644
--- a/services/jenkins/scripts/theme.groovy
+++ b/services/jenkins/scripts/theme.groovy
@@ -6,7 +6,104 @@ def decorators = instance.getExtensionList(SimpleThemeDecorator.class)
 
 if (decorators?.size() > 0) {
   def theme = decorators[0]
-  theme.setCssUrl("https://ci.bstein.dev/userContent/jenkins-theme.css")
+  theme.setCssUrl("https://cdn.jsdelivr.net/gh/Jorg3Lucas/jenkins-modern-themes@main/dist/modern-blue-grey.css")
+  theme.setCssRules("""
+:root {
+  --atlas-bg: #0f1216;
+  --atlas-surface: #171b21;
+  --atlas-surface-alt: #1f252d;
+  --atlas-border: #2b313b;
+  --atlas-text: #e6e9ef;
+  --atlas-text-muted: #b3bac6;
+  --atlas-link: #8fb7ff;
+}
+
+body,
+#page-body,
+#page-header,
+#header,
+#main-panel,
+#main-panel-content,
+#side-panel,
+.top-sticker-inner,
+.bottom-sticker-inner,
+#breadcrumbBar,
+#breadcrumbs {
+  background-color: var(--atlas-bg) !important;
+  color: var(--atlas-text) !important;
+}
+
+#side-panel .task-link,
+#breadcrumbs a,
+#breadcrumbs,
+#projectstatus th a,
+#projectstatus td,
+#projectstatus th {
+  color: var(--atlas-text-muted) !important;
+}
+
+a,
+a:visited,
+a:link {
+  color: var(--atlas-link) !important;
+}
+
+a:hover {
+  opacity: 0.85;
+}
+
+#main-panel,
+#main-panel-content,
+#description,
+.pane,
+table.pane {
+  background-color: var(--atlas-surface) !important;
+  color: var(--atlas-text) !important;
+}
+
+table.pane tr:nth-child(odd) td {
+  background-color: var(--atlas-surface) !important;
+}
+
+table.pane tr:nth-child(even) td,
+#projectstatus tr:hover td {
+  background-color: var(--atlas-surface-alt) !important;
+}
+
+input,
+select,
+textarea,
+#search-box {
+  background-color: var(--atlas-surface-alt) !important;
+  color: var(--atlas-text) !important;
+  border-color: var(--atlas-border) !important;
+}
+
+#header,
+#page-header {
+  background-color: #202734 !important;
+}
+
+#header .login,
+#page-header .login {
+  color: var(--atlas-text) !important;
+}
+
+#side-panel .task-link,
+#side-panel .task-link:visited,
+#side-panel .task-link:hover {
+  color: var(--atlas-text) !important;
+}
+
+#footer {
+  background-color: var(--atlas-bg) !important;
+  color: var(--atlas-text-muted) !important;
+}
+
+.jenkins_ver:after {
+  content: "atlas dark";
+}
+""".stripIndent().trim())
   theme.setJsUrl("")
   instance.save()
   println("Applied simple-theme-plugin dark theme")

From 3a6f1785cc410466eb629b2a2261f87b417da883 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 18:11:13 -0300
Subject: [PATCH 058/416] ci: add root Jenkinsfile and update keycloak ldap job

---
 Jenkinsfile                                | 77 ++++++++++++++++++++++
 services/keycloak/ldap-federation-job.yaml | 50 +++++++++++++-
 2 files changed, 126 insertions(+), 1 deletion(-)
 create mode 100644 Jenkinsfile

diff --git a/Jenkinsfile b/Jenkinsfile
new file mode 100644
index 00000000..4d6b23e6
--- /dev/null
+++ b/Jenkinsfile
@@ -0,0 +1,77 @@
+// Mirror of ci/Jenkinsfile.titan-iac for multibranch discovery.
+pipeline {
+  agent {
+    kubernetes {
+      defaultContainer 'python'
+      yaml """
+apiVersion: v1
+kind: Pod
+spec:
+  nodeSelector:
+    hardware: rpi5
+    kubernetes.io/arch: arm64
+    node-role.kubernetes.io/worker: "true"
+  containers:
+    - name: python
+      image: python:3.12-slim
+      command:
+        - cat
+      tty: true
+"""
+    }
+  }
+  environment {
+    PIP_DISABLE_PIP_VERSION_CHECK = '1'
+    PYTHONUNBUFFERED = '1'
+  }
+  stages {
+    stage('Checkout') {
+      steps {
+        checkout scm
+      }
+    }
+    stage('Install deps') {
+      steps {
+        sh 'pip install --no-cache-dir -r ci/requirements.txt'
+      }
+    }
+    stage('Glue tests') {
+      steps {
+        sh 'pytest -q ci/tests/glue'
+      }
+    }
+    stage('Resolve Flux branch') { // Publish the branch named in gotk-sync.yaml as env.FLUX_BRANCH; the Promote stage's `when` compares against it.
+      steps {
+        script {
+          env.FLUX_BRANCH = sh(
+            returnStdout: true, // capture awk's stdout as the step's return value
+            script: "awk '/branch:/{print \$2; exit}' clusters/atlas/flux-system/gotk-sync.yaml" // \$ keeps awk's $2 literal; a bare $2 in a double-quoted Groovy string does not compile
+          ).trim()
+          if (!env.FLUX_BRANCH) { // empty output: no `branch:` line was matched
+            error('Flux branch not found in gotk-sync.yaml')
+          }
+          echo "Flux branch: ${env.FLUX_BRANCH}"
+        }
+      }
+    }
+    stage('Promote') {
+      when {
+        expression {
+          def branch = env.BRANCH_NAME ?: (env.GIT_BRANCH ?: '').replaceFirst('origin/', '')
+          return env.FLUX_BRANCH && branch == env.FLUX_BRANCH
+        }
+      }
+      steps {
+        withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) {
+          sh '''
+            set +x
+            git config user.email "jenkins@bstein.dev"
+            git config user.name "jenkins"
+            git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git
+            git push origin HEAD:${FLUX_BRANCH}
+          '''
+        }
+      }
+    }
+  }
+}
diff --git a/services/keycloak/ldap-federation-job.yaml b/services/keycloak/ldap-federation-job.yaml
index 303fd9f5..3c3f1c19 100644
--- a/services/keycloak/ldap-federation-job.yaml
+++ b/services/keycloak/ldap-federation-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: keycloak-ldap-federation-11
+  name: keycloak-ldap-federation-12
   namespace: sso
 spec:
   backoffLimit: 2
@@ -325,6 +325,54 @@ spec:
                   if status not in (201, 204):
                       raise SystemExit(f"Unexpected group mapper create status: {status}")
 
+              def ensure_user_attr_mapper(name: str, ldap_attr: str, user_attr: str):  # Create or update the user-attribute LDAP mapper called `name`.
+                  mapper = None  # existing mapper component, if any
+                  for c in components:  # NOTE(review): `components` is defined outside this hunk; presumably the realm's component list - confirm
+                      if c.get("name") == name and c.get("parentId") == ldap_component_id:  # same name under our LDAP provider
+                          mapper = c
+                          break
+
+                  payload = {  # request body shared by the update and create paths
+                      "name": name,
+                      "providerId": "user-attribute-ldap-mapper",
+                      "providerType": "org.keycloak.storage.ldap.mappers.LDAPStorageMapper",
+                      "parentId": ldap_component_id,
+                      "config": {  # every value is wrapped in a single-element list
+                          "ldap.attribute": [ldap_attr],
+                          "user.model.attribute": [user_attr],
+                          "read.only": ["false"],
+                          "always.read.value.from.ldap": ["false"],
+                          "is.mandatory.in.ldap": ["false"],
+                      },
+                  }
+
+                  if mapper:  # found: update it in place with PUT
+                      payload["id"] = mapper["id"]
+                      payload["parentId"] = mapper.get("parentId", payload["parentId"])  # keep the parent recorded on the existing mapper
+                      print(f"Updating LDAP user mapper: {payload['id']} ({name})")
+                      status, _, _ = http_json(
+                          "PUT",
+                          f"{base_url}/admin/realms/{realm}/components/{payload['id']}",
+                          token,
+                          payload,
+                      )
+                      if status not in (200, 204):  # anything else aborts the script
+                          raise SystemExit(f"Unexpected user mapper update status for {name}: {status}")
+                  else:  # not found: create it with POST
+                      print(f"Creating LDAP user mapper: {name}")
+                      status, _, _ = http_json(
+                          "POST",
+                          f"{base_url}/admin/realms/{realm}/components",
+                          token,
+                          payload,
+                      )
+                      if status not in (201, 204):  # anything else aborts the script
+                          raise SystemExit(f"Unexpected user mapper create status for {name}: {status}")
+
+              ensure_user_attr_mapper("openldap-email", "mail", "email")
+              ensure_user_attr_mapper("openldap-first-name", "givenName", "firstName")
+              ensure_user_attr_mapper("openldap-last-name", "sn", "lastName")
+
               # Cleanup duplicate LDAP federation providers and their child components (mappers, etc).
               # Keep only the canonical provider we updated/created above.
               try:

From cd81dffd857cb62bc90d74a3f94759eb43a36176 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 18:13:49 -0300
Subject: [PATCH 059/416] jenkins: fix dark theme injection

---
 services/jenkins/scripts/theme.groovy | 140 +++++++++++++++-----------
 1 file changed, 83 insertions(+), 57 deletions(-)

diff --git a/services/jenkins/scripts/theme.groovy b/services/jenkins/scripts/theme.groovy
index b20169cb..58755c04 100644
--- a/services/jenkins/scripts/theme.groovy
+++ b/services/jenkins/scripts/theme.groovy
@@ -1,21 +1,46 @@
 import jenkins.model.Jenkins
 import org.codefirst.SimpleThemeDecorator
+import org.jenkinsci.plugins.simpletheme.CssTextThemeElement
 
 def instance = Jenkins.get()
 def decorators = instance.getExtensionList(SimpleThemeDecorator.class)
 
 if (decorators?.size() > 0) {
   def theme = decorators[0]
-  theme.setCssUrl("https://cdn.jsdelivr.net/gh/Jorg3Lucas/jenkins-modern-themes@main/dist/modern-blue-grey.css")
-  theme.setCssRules("""
-:root {
-  --atlas-bg: #0f1216;
-  --atlas-surface: #171b21;
-  --atlas-surface-alt: #1f252d;
-  --atlas-border: #2b313b;
-  --atlas-text: #e6e9ef;
-  --atlas-text-muted: #b3bac6;
-  --atlas-link: #8fb7ff;
+  def cssRules = """
+:root,
+.app-theme-picker__picker[data-theme=none] {
+  --background: #0f1216 !important;
+  --header-background: #141922 !important;
+  --header-border: #2b313b !important;
+  --white: #141922 !important;
+  --black: #e6e9ef !important;
+  --very-light-grey: #171b21 !important;
+  --light-grey: #202734 !important;
+  --medium-grey: #2b313b !important;
+  --dark-grey: #0b0f14 !important;
+  --text-color: #e6e9ef !important;
+  --text-color-secondary: #a6adba !important;
+  --card-background: #171b21 !important;
+  --card-border-color: #2b313b !important;
+  --pane-header-bg: #1f252d !important;
+  --pane-header-border-color: #2b313b !important;
+  --pane-border-color: #2b313b !important;
+  --pane-text-color: #e6e9ef !important;
+  --pane-header-text-color: #e6e9ef !important;
+  --link-color: #8fb7ff !important;
+  --link-color--hover: #b0ccff !important;
+  --link-dark-color: #e6e9ef !important;
+  --link-dark-color--hover: #b0ccff !important;
+  --input-color: #151a20 !important;
+  --input-border: #2b313b !important;
+  --input-border-hover: #3a424d !important;
+  --button-background: #232a33 !important;
+  --button-background--hover: #2b313b !important;
+  --button-background--active: #323b46 !important;
+  --item-background--hover: #232a33 !important;
+  --item-background--active: #2b313b !important;
+  --accent-color: #8fb7ff !important;
 }
 
 body,
@@ -29,83 +54,84 @@ body,
 .bottom-sticker-inner,
 #breadcrumbBar,
 #breadcrumbs {
-  background-color: var(--atlas-bg) !important;
-  color: var(--atlas-text) !important;
+  background-color: var(--background) !important;
+  color: var(--text-color) !important;
 }
 
-#side-panel .task-link,
-#breadcrumbs a,
-#breadcrumbs,
-#projectstatus th a,
+.jenkins-card,
+.jenkins-section,
+.jenkins-section__item,
+#main-panel .jenkins-card,
+#main-panel .jenkins-section {
+  background-color: var(--card-background) !important;
+  color: var(--text-color) !important;
+  border-color: var(--card-border-color) !important;
+}
+
+table.pane,
+table.pane td,
+table.pane th,
 #projectstatus td,
 #projectstatus th {
-  color: var(--atlas-text-muted) !important;
-}
-
-a,
-a:visited,
-a:link {
-  color: var(--atlas-link) !important;
-}
-
-a:hover {
-  opacity: 0.85;
-}
-
-#main-panel,
-#main-panel-content,
-#description,
-.pane,
-table.pane {
-  background-color: var(--atlas-surface) !important;
-  color: var(--atlas-text) !important;
-}
-
-table.pane tr:nth-child(odd) td {
-  background-color: var(--atlas-surface) !important;
+  background-color: var(--card-background) !important;
+  color: var(--text-color) !important;
 }
 
 table.pane tr:nth-child(even) td,
 #projectstatus tr:hover td {
-  background-color: var(--atlas-surface-alt) !important;
+  background-color: #1f252d !important;
 }
 
 input,
 select,
 textarea,
 #search-box {
-  background-color: var(--atlas-surface-alt) !important;
-  color: var(--atlas-text) !important;
-  border-color: var(--atlas-border) !important;
+  background-color: #151a20 !important;
+  color: var(--text-color) !important;
+  border-color: var(--input-border) !important;
 }
 
-#header,
-#page-header {
-  background-color: #202734 !important;
+a,
+a:visited,
+a:link {
+  color: var(--link-color) !important;
 }
 
-#header .login,
-#page-header .login {
-  color: var(--atlas-text) !important;
+a:hover {
+  opacity: 0.85;
 }
 
 #side-panel .task-link,
-#side-panel .task-link:visited,
-#side-panel .task-link:hover {
-  color: var(--atlas-text) !important;
+#breadcrumbs a,
+#breadcrumbs,
+#projectstatus th a {
+  color: var(--text-color-secondary) !important;
+}
+
+.console-output,
+.console-output pre,
+pre,
+code,
+.CodeMirror {
+  background-color: #0c0f14 !important;
+  color: #d9dee7 !important;
 }
 
 #footer {
-  background-color: var(--atlas-bg) !important;
-  color: var(--atlas-text-muted) !important;
+  background-color: var(--background) !important;
+  color: var(--text-color-secondary) !important;
 }
 
 .jenkins_ver:after {
   content: "atlas dark";
 }
-""".stripIndent().trim())
+""".stripIndent().trim()
+
+  theme.setElements([new CssTextThemeElement(cssRules)])
+  theme.setCssUrl("")
+  theme.setCssRules(cssRules)
   theme.setJsUrl("")
-  instance.save()
+  theme.save()
   println("Applied simple-theme-plugin dark theme")
 } else {
   println("simple-theme-plugin not installed; skipping theme configuration")

From 587a0af1d7e30b6b5d094ce55fa6417cfabb47ea Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 20 Jan 2026 23:03:39 -0300
Subject: [PATCH 060/416] maintenance: wire ariadne db and dashboards

---
 scripts/dashboards_render_atlas.py            |  25 +++
 services/maintenance/ariadne-deployment.yaml  | 165 ++++++++++++++----
 services/maintenance/ariadne-rbac.yaml        |  14 +-
 .../monitoring/dashboards/atlas-testing.json  | 113 ++++++++++++
 .../monitoring/grafana-dashboard-testing.yaml | 113 ++++++++++++
 .../vault/scripts/vault_k8s_auth_configure.sh |   2 +-
 6 files changed, 399 insertions(+), 33 deletions(-)

diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index 116bf218..a3fb3727 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -340,6 +340,8 @@ ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{statu
 ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))'
 ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600"
 ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
+ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}'
+ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}'
 GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
 GPU_NODE_REGEX = "|".join(GPU_NODES)
 TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
@@ -2267,6 +2269,29 @@ def build_testing_dashboard():
             instant=True,
         )
     )
+    panels.append(
+        stat_panel(
+            10,
+            "Ariadne CI Coverage (%)",
+            ARIADNE_CI_COVERAGE,
+            {"h": 4, "w": 6, "x": 0, "y": 22},
+            unit="percent",
+            decimals=1,
+            instant=True,
+            legend="{{branch}}",
+        )
+    )
+    panels.append(
+        table_panel(
+            11,
+            "Ariadne CI Tests (latest)",
+            ARIADNE_CI_TESTS,
+            {"h": 6, "w": 18, "x": 6, "y": 22},
+            unit="none",
+            transformations=sort_desc,
+            instant=True,
+        )
+    )
 
     return {
         "uid": "atlas-testing",
diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml
index cd0d38c7..57ce72b7 100644
--- a/services/maintenance/ariadne-deployment.yaml
+++ b/services/maintenance/ariadne-deployment.yaml
@@ -20,14 +20,30 @@ spec:
         prometheus.io/path: "/metrics"
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "maintenance"
-        vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/portal/atlas-portal-db"
+        vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db"
         vault.hashicorp.com/agent-inject-template-ariadne-env.sh: |
-          {{ with secret "kv/data/atlas/portal/atlas-portal-db" }}
-          export PORTAL_DATABASE_URL="{{ .Data.data.PORTAL_DATABASE_URL }}"
+          {{ with secret "kv/data/atlas/maintenance/ariadne-db" }}
+          export PORTAL_DATABASE_URL="{{ .Data.data.database_url }}"
           {{ end }}
           {{ with secret "kv/data/atlas/portal/bstein-dev-home-keycloak-admin" }}
           export KEYCLOAK_ADMIN_CLIENT_SECRET="{{ .Data.data.client_secret }}"
           {{ end }}
+          {{ with secret "kv/data/atlas/nextcloud/nextcloud-db" }}
+          export NEXTCLOUD_DB_NAME="{{ .Data.data.database }}"
+          export NEXTCLOUD_DB_USER="{{ index .Data.data "db-username" }}"
+          export NEXTCLOUD_DB_PASSWORD="{{ index .Data.data "db-password" }}"
+          {{ end }}
+          {{ with secret "kv/data/atlas/nextcloud/nextcloud-admin" }}
+          export NEXTCLOUD_ADMIN_USER="{{ index .Data.data "admin-user" }}"
+          export NEXTCLOUD_ADMIN_PASSWORD="{{ index .Data.data "admin-password" }}"
+          {{ end }}
+          {{ with secret "kv/data/atlas/health/wger-admin" }}
+          export WGER_ADMIN_USERNAME="{{ .Data.data.username }}"
+          export WGER_ADMIN_PASSWORD="{{ .Data.data.password }}"
+          {{ end }}
+          {{ with secret "kv/data/atlas/finance/firefly-secrets" }}
+          export FIREFLY_CRON_TOKEN="{{ .Data.data.STATIC_CRON_TOKEN }}"
+          {{ end }}
           {{ with secret "kv/data/atlas/mailu/mailu-db-secret" }}
           export MAILU_DB_NAME="{{ .Data.data.database }}"
           export MAILU_DB_USER="{{ .Data.data.username }}"
@@ -42,6 +58,35 @@ spec:
           export SMTP_PASSWORD="{{ .Data.data.password }}"
           export SMTP_FROM="no-reply-portal@bstein.dev"
           {{ end }}
+          {{ with secret "kv/data/atlas/comms/mas-admin-client-runtime" }}
+          export COMMS_MAS_ADMIN_CLIENT_SECRET="{{ .Data.data.client_secret }}"
+          {{ end }}
+          {{ with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" }}
+          export COMMS_BOT_PASSWORD="{{ index .Data.data "bot-password" }}"
+          export COMMS_SEEDER_PASSWORD="{{ index .Data.data "seeder-password" }}"
+          {{ end }}
+          {{ with secret "kv/data/atlas/comms/synapse-db" }}
+          export COMMS_SYNAPSE_DB_PASSWORD="{{ .Data.data.POSTGRES_PASSWORD }}"
+          {{ end }}
+          {{ with secret "kv/data/atlas/vault/vault-oidc-config" }}
+          export VAULT_OIDC_DISCOVERY_URL="{{ .Data.data.discovery_url }}"
+          export VAULT_OIDC_CLIENT_ID="{{ .Data.data.client_id }}"
+          export VAULT_OIDC_CLIENT_SECRET="{{ .Data.data.client_secret }}"
+          export VAULT_OIDC_DEFAULT_ROLE="{{ .Data.data.default_role }}"
+          export VAULT_OIDC_SCOPES="{{ .Data.data.scopes }}"
+          export VAULT_OIDC_USER_CLAIM="{{ .Data.data.user_claim }}"
+          export VAULT_OIDC_GROUPS_CLAIM="{{ .Data.data.groups_claim }}"
+          export VAULT_OIDC_TOKEN_POLICIES="{{ .Data.data.token_policies }}"
+          export VAULT_OIDC_ADMIN_GROUP="{{ .Data.data.admin_group }}"
+          export VAULT_OIDC_ADMIN_POLICIES="{{ .Data.data.admin_policies }}"
+          export VAULT_OIDC_DEV_GROUP="{{ .Data.data.dev_group }}"
+          export VAULT_OIDC_DEV_POLICIES="{{ .Data.data.dev_policies }}"
+          export VAULT_OIDC_USER_GROUP="{{ .Data.data.user_group }}"
+          export VAULT_OIDC_USER_POLICIES="{{ .Data.data.user_policies }}"
+          export VAULT_OIDC_REDIRECT_URIS="{{ .Data.data.redirect_uris }}"
+          export VAULT_OIDC_BOUND_AUDIENCES="{{ .Data.data.bound_audiences }}"
+          export VAULT_OIDC_BOUND_CLAIMS_TYPE="{{ .Data.data.bound_claims_type }}"
+          {{ end }}
     spec:
       serviceAccountName: ariadne
       nodeSelector:
@@ -92,6 +137,8 @@ spec:
               value: dev
             - name: MAILU_DOMAIN
               value: bstein.dev
+            - name: MAILU_HOST
+              value: mail.bstein.dev
             - name: MAILU_SYNC_URL
               value: http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events
             - name: MAILU_MAILBOX_WAIT_TIMEOUT_SEC
@@ -102,46 +149,84 @@ spec:
               value: "5432"
             - name: NEXTCLOUD_NAMESPACE
               value: nextcloud
-            - name: NEXTCLOUD_MAIL_SYNC_CRONJOB
-              value: nextcloud-mail-sync
-            - name: NEXTCLOUD_MAIL_SYNC_WAIT_TIMEOUT_SEC
-              value: "90"
-            - name: NEXTCLOUD_MAIL_SYNC_JOB_TTL_SEC
-              value: "3600"
+            - name: NEXTCLOUD_POD_LABEL
+              value: app=nextcloud
+            - name: NEXTCLOUD_CONTAINER
+              value: nextcloud
+            - name: NEXTCLOUD_EXEC_TIMEOUT_SEC
+              value: "120"
+            - name: NEXTCLOUD_URL
+              value: https://cloud.bstein.dev
+            - name: NEXTCLOUD_DB_HOST
+              value: postgres-service.postgres.svc.cluster.local
+            - name: NEXTCLOUD_DB_PORT
+              value: "5432"
             - name: WGER_NAMESPACE
               value: health
-            - name: WGER_USER_SYNC_CRONJOB
-              value: wger-user-sync
-            - name: WGER_ADMIN_CRONJOB
-              value: wger-admin-ensure
             - name: WGER_USER_SYNC_WAIT_TIMEOUT_SEC
               value: "90"
+            - name: WGER_POD_LABEL
+              value: app=wger
+            - name: WGER_CONTAINER
+              value: wger
+            - name: WGER_ADMIN_EMAIL
+              value: brad@bstein.dev
             - name: FIREFLY_NAMESPACE
               value: finance
-            - name: FIREFLY_USER_SYNC_CRONJOB
-              value: firefly-user-sync
             - name: FIREFLY_USER_SYNC_WAIT_TIMEOUT_SEC
               value: "90"
+            - name: FIREFLY_POD_LABEL
+              value: app=firefly
+            - name: FIREFLY_CONTAINER
+              value: firefly
+            - name: FIREFLY_CRON_BASE_URL
+              value: http://firefly.finance.svc.cluster.local/api/v1/cron
+            - name: FIREFLY_CRON_TIMEOUT_SEC
+              value: "30"
             - name: VAULT_NAMESPACE
               value: vault
-            - name: VAULT_K8S_AUTH_CRONJOB
-              value: vault-k8s-auth-config
-            - name: VAULT_OIDC_CRONJOB
-              value: vault-oidc-config
-            - name: VAULT_JOB_WAIT_TIMEOUT_SEC
-              value: "120"
+            - name: VAULT_ADDR
+              value: http://vault.vault.svc.cluster.local:8200
+            - name: VAULT_K8S_ROLE
+              value: vault-admin
+            - name: VAULT_K8S_ROLE_TTL
+              value: 1h
             - name: COMMS_NAMESPACE
               value: comms
-            - name: COMMS_GUEST_NAME_CRONJOB
-              value: guest-name-randomizer
-            - name: COMMS_PIN_INVITE_CRONJOB
-              value: pin-othrys-invite
-            - name: COMMS_RESET_ROOM_CRONJOB
-              value: othrys-room-reset
-            - name: COMMS_SEED_ROOM_CRONJOB
-              value: seed-othrys-room
-            - name: COMMS_JOB_WAIT_TIMEOUT_SEC
-              value: "60"
+            - name: COMMS_SYNAPSE_BASE
+              value: http://othrys-synapse-matrix-synapse:8008
+            - name: COMMS_AUTH_BASE
+              value: http://matrix-authentication-service:8080
+            - name: COMMS_MAS_ADMIN_API_BASE
+              value: http://matrix-authentication-service:8081/api/admin/v1
+            - name: COMMS_MAS_TOKEN_URL
+              value: http://matrix-authentication-service:8080/oauth2/token
+            - name: COMMS_MAS_ADMIN_CLIENT_ID
+              value: 01KDXMVQBQ5JNY6SEJPZW6Z8BM
+            - name: COMMS_SERVER_NAME
+              value: live.bstein.dev
+            - name: COMMS_ROOM_ALIAS
+              value: "#othrys:live.bstein.dev"
+            - name: COMMS_ROOM_NAME
+              value: Othrys
+            - name: COMMS_PIN_MESSAGE
+              value: "Invite guests: share https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join and choose 'Continue' -> 'Join as guest'."
+            - name: COMMS_SEEDER_USER
+              value: othrys-seeder
+            - name: COMMS_BOT_USER
+              value: atlasbot
+            - name: COMMS_SYNAPSE_DB_HOST
+              value: postgres-service.postgres.svc.cluster.local
+            - name: COMMS_SYNAPSE_DB_PORT
+              value: "5432"
+            - name: COMMS_SYNAPSE_DB_NAME
+              value: synapse
+            - name: COMMS_SYNAPSE_DB_USER
+              value: synapse
+            - name: COMMS_TIMEOUT_SEC
+              value: "30"
+            - name: COMMS_GUEST_STALE_DAYS
+              value: "14"
             - name: VAULTWARDEN_NAMESPACE
               value: vaultwarden
             - name: VAULTWARDEN_POD_LABEL
@@ -172,10 +257,22 @@ spec:
               value: "30 4 * * *"
             - name: ARIADNE_SCHEDULE_NEXTCLOUD_SYNC
               value: "0 5 * * *"
+            - name: ARIADNE_SCHEDULE_NEXTCLOUD_CRON
+              value: "*/5 * * * *"
+            - name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE
+              value: "30 4 * * *"
             - name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC
               value: "*/15 * * * *"
             - name: ARIADNE_SCHEDULE_WGER_ADMIN
               value: "15 3 * * *"
+            - name: ARIADNE_SCHEDULE_FIREFLY_CRON
+              value: "0 3 * * *"
+            - name: ARIADNE_SCHEDULE_POD_CLEANER
+              value: "0 * * * *"
+            - name: ARIADNE_SCHEDULE_OPENSEARCH_PRUNE
+              value: "23 3 * * *"
+            - name: ARIADNE_SCHEDULE_IMAGE_SWEEPER
+              value: "30 4 * * 0"
             - name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH
               value: "*/15 * * * *"
             - name: ARIADNE_SCHEDULE_VAULT_OIDC
@@ -192,6 +289,12 @@ spec:
               value: "true"
             - name: K8S_API_TIMEOUT_SEC
               value: "5"
+            - name: OPENSEARCH_URL
+              value: http://opensearch-master.logging.svc.cluster.local:9200
+            - name: OPENSEARCH_LIMIT_BYTES
+              value: "1099511627776"
+            - name: OPENSEARCH_INDEX_PATTERNS
+              value: kube-*,journald-*,trace-analytics-*
             - name: METRICS_PATH
               value: "/metrics"
           resources:
diff --git a/services/maintenance/ariadne-rbac.yaml b/services/maintenance/ariadne-rbac.yaml
index 8d2a2a9a..e2f08c9b 100644
--- a/services/maintenance/ariadne-rbac.yaml
+++ b/services/maintenance/ariadne-rbac.yaml
@@ -6,13 +6,25 @@ metadata:
 rules:
   - apiGroups: ["batch"]
     resources:
-      - cronjobs
       - jobs
     verbs:
       - get
       - list
       - watch
       - create
+  - apiGroups: [""]
+    resources:
+      - pods
+    verbs:
+      - get
+      - list
+      - watch
+      - delete
+  - apiGroups: [""]
+    resources:
+      - pods/exec
+    verbs:
+      - create
 
 ---
 apiVersion: rbac.authorization.k8s.io/v1
diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-testing.json
index c9c0c9ab..b76f9095 100644
--- a/services/monitoring/dashboards/atlas-testing.json
+++ b/services/monitoring/dashboards/atlas-testing.json
@@ -471,6 +471,119 @@
           }
         }
       ]
+    },
+    {
+      "id": 10,
+      "type": "stat",
+      "title": "Ariadne CI Coverage (%)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 6,
+        "x": 0,
+        "y": 22
+      },
+      "targets": [
+        {
+          "expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}",
+          "refId": "A",
+          "legendFormat": "{{branch}}",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "percent",
+          "custom": {
+            "displayMode": "auto"
+          },
+          "decimals": 1
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 11,
+      "type": "table",
+      "title": "Ariadne CI Tests (latest)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 18,
+        "x": 6,
+        "y": 22
+      },
+      "targets": [
+        {
+          "expr": "ariadne_ci_tests_total{repo=\"ariadne\"}",
+          "refId": "A",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "none",
+          "custom": {
+            "filterable": true
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "showHeader": true,
+        "columnFilters": false
+      },
+      "transformations": [
+        {
+          "id": "labelsToFields",
+          "options": {}
+        },
+        {
+          "id": "sortBy",
+          "options": {
+            "fields": [
+              "Value"
+            ],
+            "order": "desc"
+          }
+        }
+      ]
     }
   ],
   "time": {
diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-testing.yaml
index 7746f165..09c29a4c 100644
--- a/services/monitoring/grafana-dashboard-testing.yaml
+++ b/services/monitoring/grafana-dashboard-testing.yaml
@@ -480,6 +480,119 @@ data:
               }
             }
           ]
+        },
+        {
+          "id": 10,
+          "type": "stat",
+          "title": "Ariadne CI Coverage (%)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 4,
+            "w": 6,
+            "x": 0,
+            "y": 22
+          },
+          "targets": [
+            {
+              "expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}",
+              "refId": "A",
+              "legendFormat": "{{branch}}",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "thresholds"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "percent",
+              "custom": {
+                "displayMode": "auto"
+              },
+              "decimals": 1
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 11,
+          "type": "table",
+          "title": "Ariadne CI Tests (latest)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 6,
+            "w": 18,
+            "x": 6,
+            "y": 22
+          },
+          "targets": [
+            {
+              "expr": "ariadne_ci_tests_total{repo=\"ariadne\"}",
+              "refId": "A",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none",
+              "custom": {
+                "filterable": true
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "showHeader": true,
+            "columnFilters": false
+          },
+          "transformations": [
+            {
+              "id": "labelsToFields",
+              "options": {}
+            },
+            {
+              "id": "sortBy",
+              "options": {
+                "fields": [
+                  "Value"
+                ],
+                "order": "desc"
+              }
+            }
+          ]
         }
       ],
       "time": {
diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh
index a5ccb61d..c14c5ec6 100644
--- a/services/vault/scripts/vault_k8s_auth_configure.sh
+++ b/services/vault/scripts/vault_k8s_auth_configure.sh
@@ -231,7 +231,7 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \
 write_policy_and_role "health" "health" "health-vault-sync" \
   "health/*" ""
 write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \
-  "portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret shared/harbor-pull" ""
+  "maintenance/ariadne-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret shared/harbor-pull" ""
 write_policy_and_role "finance" "finance" "finance-vault" \
   "finance/* shared/postmark-relay" ""
 write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \

From c804ec040c5fcf91328e0686854aef9eba0d3e50 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 02:57:40 -0300
Subject: [PATCH 061/416] glue: centralize sync tasks in ariadne

---
 .../cert-manager/letsencrypt-prod.yaml        |   2 +-
 .../sources/cert-manager/letsencrypt.yaml     |   2 +-
 scripts/dashboards_render_atlas.py            |  35 +++++-
 services/finance/firefly-cronjob.yaml         |   1 +
 services/keycloak/deployment.yaml             |   2 +-
 services/keycloak/realm-settings-job.yaml     |  73 +++++++++++++
 .../logging/opensearch-prune-cronjob.yaml     |   1 +
 services/mailu/kustomization.yaml             |   5 -
 services/maintenance/ariadne-deployment.yaml  |  12 +-
 .../maintenance/image-sweeper-cronjob.yaml    |   1 +
 services/maintenance/pod-cleaner-cronjob.yaml |   1 +
 .../monitoring/dashboards/atlas-testing.json  | 103 ++++++++++++++++--
 .../monitoring/grafana-dashboard-testing.yaml | 103 ++++++++++++++++--
 services/nextcloud/cronjob.yaml               |   1 +
 services/nextcloud/maintenance-cronjob.yaml   |   1 +
 15 files changed, 313 insertions(+), 30 deletions(-)

diff --git a/infrastructure/sources/cert-manager/letsencrypt-prod.yaml b/infrastructure/sources/cert-manager/letsencrypt-prod.yaml
index 7f90f01a..5795b091 100644
--- a/infrastructure/sources/cert-manager/letsencrypt-prod.yaml
+++ b/infrastructure/sources/cert-manager/letsencrypt-prod.yaml
@@ -5,7 +5,7 @@ metadata:
   name: letsencrypt-prod
 spec:
   acme:
-    email: brad.stein@gmail.com
+    email: brad@bstein.dev
     server: https://acme-v02.api.letsencrypt.org/directory
     privateKeySecretRef:
       name: letsencrypt-prod-account-key
diff --git a/infrastructure/sources/cert-manager/letsencrypt.yaml b/infrastructure/sources/cert-manager/letsencrypt.yaml
index a988312c..5fbe4e36 100644
--- a/infrastructure/sources/cert-manager/letsencrypt.yaml
+++ b/infrastructure/sources/cert-manager/letsencrypt.yaml
@@ -5,7 +5,7 @@ metadata:
   name: letsencrypt
 spec:
   acme:
-    email: brad.stein@gmail.com
+    email: brad@bstein.dev
     server: https://acme-v02.api.letsencrypt.org/directory
     privateKeySecretRef:
       name: letsencrypt-account-key
diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index a3fb3727..509cf493 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -338,7 +338,9 @@ GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE})"
 GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})"
 ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))'
 ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))'
+ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))'
 ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600"
+ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600"
 ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
 ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}'
 ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}'
@@ -2236,12 +2238,24 @@ def build_testing_dashboard():
             instant=True,
         )
     )
+    panels.append(
+        timeseries_panel(
+            12,
+            "Ariadne Task Runs vs Errors (1h)",
+            ARIADNE_TASK_RUNS_BY_STATUS_1H,
+            {"h": 6, "w": 24, "x": 0, "y": 12},
+            unit="none",
+            legend="{{status}}",
+            legend_display="table",
+            legend_placement="right",
+        )
+    )
     panels.append(
         table_panel(
             7,
             "Ariadne Task Errors (24h)",
             ARIADNE_TASK_ERRORS_24H,
-            {"h": 6, "w": 12, "x": 0, "y": 12},
+            {"h": 6, "w": 12, "x": 0, "y": 18},
             unit="none",
             transformations=sort_desc,
             instant=True,
@@ -2252,7 +2266,7 @@ def build_testing_dashboard():
             8,
             "Ariadne Schedule Last Success (hours ago)",
             ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS,
-            {"h": 6, "w": 12, "x": 12, "y": 12},
+            {"h": 6, "w": 12, "x": 12, "y": 18},
             unit="h",
             transformations=sort_desc,
             instant=True,
@@ -2263,18 +2277,29 @@ def build_testing_dashboard():
             9,
             "Ariadne Access Requests",
             ARIADNE_ACCESS_REQUESTS,
-            {"h": 4, "w": 24, "x": 0, "y": 18},
+            {"h": 6, "w": 12, "x": 12, "y": 24},
             unit="none",
             transformations=sort_desc,
             instant=True,
         )
     )
+    panels.append(
+        table_panel(
+            13,
+            "Ariadne Schedule Last Error (hours ago)",
+            ARIADNE_SCHEDULE_LAST_ERROR_HOURS,
+            {"h": 6, "w": 12, "x": 0, "y": 24},
+            unit="h",
+            transformations=sort_desc,
+            instant=True,
+        )
+    )
     panels.append(
         stat_panel(
             10,
             "Ariadne CI Coverage (%)",
             ARIADNE_CI_COVERAGE,
-            {"h": 4, "w": 6, "x": 0, "y": 22},
+            {"h": 4, "w": 6, "x": 0, "y": 30},
             unit="percent",
             decimals=1,
             instant=True,
@@ -2286,7 +2311,7 @@ def build_testing_dashboard():
             11,
             "Ariadne CI Tests (latest)",
             ARIADNE_CI_TESTS,
-            {"h": 6, "w": 18, "x": 6, "y": 22},
+            {"h": 6, "w": 18, "x": 6, "y": 30},
             unit="none",
             transformations=sort_desc,
             instant=True,
diff --git a/services/finance/firefly-cronjob.yaml b/services/finance/firefly-cronjob.yaml
index 6c4d5072..9e5c8522 100644
--- a/services/finance/firefly-cronjob.yaml
+++ b/services/finance/firefly-cronjob.yaml
@@ -6,6 +6,7 @@ metadata:
   namespace: finance
 spec:
   schedule: "0 3 * * *"
+  suspend: true
   concurrencyPolicy: Forbid
   successfulJobsHistoryLimit: 1
   failedJobsHistoryLimit: 3
diff --git a/services/keycloak/deployment.yaml b/services/keycloak/deployment.yaml
index 3d241c98..131169db 100644
--- a/services/keycloak/deployment.yaml
+++ b/services/keycloak/deployment.yaml
@@ -126,7 +126,7 @@ spec:
             - name: KC_EVENTS_LISTENERS
               value: jboss-logging,mailu-http
             - name: KC_SPI_EVENTS_LISTENER_MAILU-HTTP_ENDPOINT
-              value: http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events
+              value: http://ariadne.maintenance.svc.cluster.local/events
           ports:
             - containerPort: 8080
               name: http
diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/realm-settings-job.yaml
index fdee377c..786948be 100644
--- a/services/keycloak/realm-settings-job.yaml
+++ b/services/keycloak/realm-settings-job.yaml
@@ -469,6 +469,79 @@ spec:
                       if status not in (201, 204):
                           raise SystemExit(f"Unexpected protocol mapper create response: {status}")
 
+              # Ensure the mailu_email attribute overrides the email claim on every OIDC client except the excluded built-in clients.
+              excluded_email_clients = {
+                  "account",
+                  "account-console",
+                  "admin-cli",
+                  "security-admin-console",
+                  "realm-management",
+                  "broker",
+              }
+              status, clients = http_json(
+                  "GET",
+                  f"{base_url}/admin/realms/{realm}/clients",
+                  access_token,
+              )
+              if status == 200 and isinstance(clients, list):
+                  for client in clients:
+                      if not isinstance(client, dict):
+                          continue
+                      if client.get("protocol") != "openid-connect":
+                          continue
+                      client_name = client.get("clientId") if isinstance(client.get("clientId"), str) else ""
+                      if not client_name or client_name in excluded_email_clients:
+                          continue
+                      client_id = client.get("id")
+                      if not client_id:
+                          continue
+                      email_mapper = {
+                          "name": "mailu-email",
+                          "protocol": "openid-connect",
+                          "protocolMapper": "oidc-usermodel-attribute-mapper",
+                          "consentRequired": False,
+                          "config": {
+                              "user.attribute": "mailu_email",
+                              "claim.name": "email",
+                              "jsonType.label": "String",
+                              "id.token.claim": "true",
+                              "access.token.claim": "true",
+                              "userinfo.token.claim": "true",
+                              "multivalued": "false",
+                              "aggregate.attrs": "false",
+                          },
+                      }
+                      status, mappers = http_json(
+                          "GET",
+                          f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models",
+                          access_token,
+                      )
+                      existing = None
+                      if status == 200 and isinstance(mappers, list):
+                          for item in mappers:
+                              if isinstance(item, dict) and item.get("name") == email_mapper["name"]:
+                                  existing = item
+                                  break
+                      if existing and existing.get("id"):
+                          email_mapper["id"] = existing["id"]
+                          status, _ = http_json(
+                              "PUT",
+                              f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models/{existing['id']}",
+                              access_token,
+                              email_mapper,
+                          )
+                          if status not in (200, 204):
+                              raise SystemExit(f"Unexpected mailu email mapper update response: {status}")
+                      else:
+                          status, _ = http_json(
+                              "POST",
+                              f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models",
+                              access_token,
+                              email_mapper,
+                          )
+                          if status not in (201, 204):
+                              raise SystemExit(f"Unexpected mailu email mapper create response: {status}")
+
               # Ensure MFA is on by default for newly-created users.
               status, required_actions = http_json(
                   "GET",
diff --git a/services/logging/opensearch-prune-cronjob.yaml b/services/logging/opensearch-prune-cronjob.yaml
index 75e72dbd..dc0dffb2 100644
--- a/services/logging/opensearch-prune-cronjob.yaml
+++ b/services/logging/opensearch-prune-cronjob.yaml
@@ -6,6 +6,7 @@ metadata:
   namespace: logging
 spec:
   schedule: "23 3 * * *"
+  suspend: true
   concurrencyPolicy: Forbid
   successfulJobsHistoryLimit: 1
   failedJobsHistoryLimit: 3
diff --git a/services/mailu/kustomization.yaml b/services/mailu/kustomization.yaml
index 5c111eb6..7447f24a 100644
--- a/services/mailu/kustomization.yaml
+++ b/services/mailu/kustomization.yaml
@@ -15,7 +15,6 @@ resources:
   - ingressroute.yaml
   - mailu-sync-job.yaml
   - mailu-sync-cronjob.yaml
-  - mailu-sync-listener.yaml
   - front-lb.yaml
 
 configMapGenerator:
@@ -31,10 +30,6 @@ configMapGenerator:
       - sync.py=scripts/mailu_sync.py
     options:
       disableNameSuffixHash: true
-  - name: mailu-sync-listener
-    namespace: mailu-mailserver
-    files:
-      - listener.py=scripts/mailu_sync_listener.py
   - name: mailu-vault-entrypoint
     namespace: mailu-mailserver
     files:
diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml
index 57ce72b7..57862abb 100644
--- a/services/maintenance/ariadne-deployment.yaml
+++ b/services/maintenance/ariadne-deployment.yaml
@@ -23,6 +23,7 @@ spec:
         vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db"
         vault.hashicorp.com/agent-inject-template-ariadne-env.sh: |
           {{ with secret "kv/data/atlas/maintenance/ariadne-db" }}
+          export ARIADNE_DATABASE_URL="{{ .Data.data.database_url }}"
           export PORTAL_DATABASE_URL="{{ .Data.data.database_url }}"
           {{ end }}
           {{ with secret "kv/data/atlas/portal/bstein-dev-home-keycloak-admin" }}
@@ -57,6 +58,7 @@ spec:
           export SMTP_USERNAME="no-reply-portal@bstein.dev"
           export SMTP_PASSWORD="{{ .Data.data.password }}"
           export SMTP_FROM="no-reply-portal@bstein.dev"
+          export MAILU_SYSTEM_PASSWORD="{{ .Data.data.password }}"
           {{ end }}
           {{ with secret "kv/data/atlas/comms/mas-admin-client-runtime" }}
           export COMMS_MAS_ADMIN_CLIENT_SECRET="{{ .Data.data.client_secret }}"
@@ -140,7 +142,11 @@ spec:
             - name: MAILU_HOST
               value: mail.bstein.dev
             - name: MAILU_SYNC_URL
-              value: http://mailu-sync-listener.mailu-mailserver.svc.cluster.local:8080/events
+              value: http://ariadne.maintenance.svc.cluster.local/events
+            - name: MAILU_EVENT_MIN_INTERVAL_SEC
+              value: "10"
+            - name: MAILU_SYSTEM_USERS
+              value: no-reply-portal@bstein.dev,no-reply-vaultwarden@bstein.dev
             - name: MAILU_MAILBOX_WAIT_TIMEOUT_SEC
               value: "180"
             - name: MAILU_DB_HOST
@@ -263,8 +269,12 @@ spec:
               value: "30 4 * * *"
             - name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC
               value: "*/15 * * * *"
+            - name: ARIADNE_SCHEDULE_WGER_USER_SYNC
+              value: "0 5 * * *"
             - name: ARIADNE_SCHEDULE_WGER_ADMIN
               value: "15 3 * * *"
+            - name: ARIADNE_SCHEDULE_FIREFLY_USER_SYNC
+              value: "0 6 * * *"
             - name: ARIADNE_SCHEDULE_FIREFLY_CRON
               value: "0 3 * * *"
             - name: ARIADNE_SCHEDULE_POD_CLEANER
diff --git a/services/maintenance/image-sweeper-cronjob.yaml b/services/maintenance/image-sweeper-cronjob.yaml
index c94fcca6..00392060 100644
--- a/services/maintenance/image-sweeper-cronjob.yaml
+++ b/services/maintenance/image-sweeper-cronjob.yaml
@@ -6,6 +6,7 @@ metadata:
   namespace: maintenance
 spec:
   schedule: "30 4 * * 0"
+  suspend: true
   concurrencyPolicy: Forbid
   successfulJobsHistoryLimit: 2
   failedJobsHistoryLimit: 2
diff --git a/services/maintenance/pod-cleaner-cronjob.yaml b/services/maintenance/pod-cleaner-cronjob.yaml
index e083c85f..99d13f67 100644
--- a/services/maintenance/pod-cleaner-cronjob.yaml
+++ b/services/maintenance/pod-cleaner-cronjob.yaml
@@ -6,6 +6,7 @@ metadata:
   namespace: maintenance
 spec:
   schedule: "0 * * * *"
+  suspend: true
   concurrencyPolicy: Forbid
   successfulJobsHistoryLimit: 1
   failedJobsHistoryLimit: 3
diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-testing.json
index b76f9095..207077ef 100644
--- a/services/monitoring/dashboards/atlas-testing.json
+++ b/services/monitoring/dashboards/atlas-testing.json
@@ -322,6 +322,43 @@
         }
       ]
     },
+    {
+      "id": 12,
+      "type": "timeseries",
+      "title": "Ariadne Task Runs vs Errors (1h)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 24,
+        "x": 0,
+        "y": 12
+      },
+      "targets": [
+        {
+          "expr": "sum by (status) (increase(ariadne_task_runs_total[1h]))",
+          "refId": "A",
+          "legendFormat": "{{status}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "none"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "right"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      }
+    },
     {
       "id": 7,
       "type": "table",
@@ -334,7 +371,7 @@
         "h": 6,
         "w": 12,
         "x": 0,
-        "y": 12
+        "y": 18
       },
       "targets": [
         {
@@ -384,7 +421,7 @@
         "h": 6,
         "w": 12,
         "x": 12,
-        "y": 12
+        "y": 18
       },
       "targets": [
         {
@@ -431,10 +468,10 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 4,
-        "w": 24,
-        "x": 0,
-        "y": 18
+        "h": 6,
+        "w": 12,
+        "x": 12,
+        "y": 24
       },
       "targets": [
         {
@@ -472,6 +509,56 @@
         }
       ]
     },
+    {
+      "id": 13,
+      "type": "table",
+      "title": "Ariadne Schedule Last Error (hours ago)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 24
+      },
+      "targets": [
+        {
+          "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600",
+          "refId": "A",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "h",
+          "custom": {
+            "filterable": true
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "showHeader": true,
+        "columnFilters": false
+      },
+      "transformations": [
+        {
+          "id": "labelsToFields",
+          "options": {}
+        },
+        {
+          "id": "sortBy",
+          "options": {
+            "fields": [
+              "Value"
+            ],
+            "order": "desc"
+          }
+        }
+      ]
+    },
     {
       "id": 10,
       "type": "stat",
@@ -484,7 +571,7 @@
         "h": 4,
         "w": 6,
         "x": 0,
-        "y": 22
+        "y": 30
       },
       "targets": [
         {
@@ -547,7 +634,7 @@
         "h": 6,
         "w": 18,
         "x": 6,
-        "y": 22
+        "y": 30
       },
       "targets": [
         {
diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-testing.yaml
index 09c29a4c..362751bb 100644
--- a/services/monitoring/grafana-dashboard-testing.yaml
+++ b/services/monitoring/grafana-dashboard-testing.yaml
@@ -331,6 +331,43 @@ data:
             }
           ]
         },
+        {
+          "id": 12,
+          "type": "timeseries",
+          "title": "Ariadne Task Runs vs Errors (1h)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 6,
+            "w": 24,
+            "x": 0,
+            "y": 12
+          },
+          "targets": [
+            {
+              "expr": "sum by (status) (increase(ariadne_task_runs_total[1h]))",
+              "refId": "A",
+              "legendFormat": "{{status}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "right"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
         {
           "id": 7,
           "type": "table",
@@ -343,7 +380,7 @@ data:
             "h": 6,
             "w": 12,
             "x": 0,
-            "y": 12
+            "y": 18
           },
           "targets": [
             {
@@ -393,7 +430,7 @@ data:
             "h": 6,
             "w": 12,
             "x": 12,
-            "y": 12
+            "y": 18
           },
           "targets": [
             {
@@ -440,10 +477,10 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 4,
-            "w": 24,
-            "x": 0,
-            "y": 18
+            "h": 6,
+            "w": 12,
+            "x": 12,
+            "y": 24
           },
           "targets": [
             {
@@ -481,6 +518,56 @@ data:
             }
           ]
         },
+        {
+          "id": 13,
+          "type": "table",
+          "title": "Ariadne Schedule Last Error (hours ago)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 6,
+            "w": 12,
+            "x": 0,
+            "y": 24
+          },
+          "targets": [
+            {
+              "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600",
+              "refId": "A",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "h",
+              "custom": {
+                "filterable": true
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "showHeader": true,
+            "columnFilters": false
+          },
+          "transformations": [
+            {
+              "id": "labelsToFields",
+              "options": {}
+            },
+            {
+              "id": "sortBy",
+              "options": {
+                "fields": [
+                  "Value"
+                ],
+                "order": "desc"
+              }
+            }
+          ]
+        },
         {
           "id": 10,
           "type": "stat",
@@ -493,7 +580,7 @@ data:
             "h": 4,
             "w": 6,
             "x": 0,
-            "y": 22
+            "y": 30
           },
           "targets": [
             {
@@ -556,7 +643,7 @@ data:
             "h": 6,
             "w": 18,
             "x": 6,
-            "y": 22
+            "y": 30
           },
           "targets": [
             {
diff --git a/services/nextcloud/cronjob.yaml b/services/nextcloud/cronjob.yaml
index cc0091bc..58d8aa1b 100644
--- a/services/nextcloud/cronjob.yaml
+++ b/services/nextcloud/cronjob.yaml
@@ -6,6 +6,7 @@ metadata:
   namespace: nextcloud
 spec:
   schedule: "*/5 * * * *"
+  suspend: true
   concurrencyPolicy: Forbid
   jobTemplate:
     spec:
diff --git a/services/nextcloud/maintenance-cronjob.yaml b/services/nextcloud/maintenance-cronjob.yaml
index d4008c7c..177cc022 100644
--- a/services/nextcloud/maintenance-cronjob.yaml
+++ b/services/nextcloud/maintenance-cronjob.yaml
@@ -6,6 +6,7 @@ metadata:
   namespace: nextcloud
 spec:
   schedule: "30 4 * * *"
+  suspend: true
   concurrencyPolicy: Forbid
   jobTemplate:
     spec:

From 18a086ce9528ae664f967ac65718cfceecacf986 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 03:03:32 -0300
Subject: [PATCH 062/416] keycloak: bump realm settings job name

---
 services/keycloak/realm-settings-job.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/realm-settings-job.yaml
index 786948be..6e6589de 100644
--- a/services/keycloak/realm-settings-job.yaml
+++ b/services/keycloak/realm-settings-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: keycloak-realm-settings-33
+  name: keycloak-realm-settings-34
   namespace: sso
 spec:
   backoffLimit: 0

From ec36cd21e3a36f01880161570d11903496c5e694 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 03:05:53 -0300
Subject: [PATCH 063/416] rbac: allow ariadne to read cronjobs

---
 services/maintenance/ariadne-rbac.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/services/maintenance/ariadne-rbac.yaml b/services/maintenance/ariadne-rbac.yaml
index e2f08c9b..8a063bf3 100644
--- a/services/maintenance/ariadne-rbac.yaml
+++ b/services/maintenance/ariadne-rbac.yaml
@@ -7,6 +7,7 @@ rules:
   - apiGroups: ["batch"]
     resources:
       - jobs
+      - cronjobs
     verbs:
       - get
       - list

From 0680926dae1c938121c2f401e9b748dddeadad90 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 03:21:01 -0300
Subject: [PATCH 064/416] vault: allow ariadne to read needed secrets

---
 services/vault/scripts/vault_k8s_auth_configure.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh
index c14c5ec6..2fce3f4e 100644
--- a/services/vault/scripts/vault_k8s_auth_configure.sh
+++ b/services/vault/scripts/vault_k8s_auth_configure.sh
@@ -231,7 +231,7 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \
 write_policy_and_role "health" "health" "health-vault-sync" \
   "health/*" ""
 write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \
-  "maintenance/ariadne-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret shared/harbor-pull" ""
+  "maintenance/ariadne-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db vault/vault-oidc-config shared/harbor-pull" ""
 write_policy_and_role "finance" "finance" "finance-vault" \
   "finance/* shared/postmark-relay" ""
 write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \

From 0ab34c0af5f4c0bd66686e738fb8166e7faafbe4 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 03:39:17 -0300
Subject: [PATCH 065/416] ariadne: split portal and ariadne db secrets

---
 services/maintenance/ariadne-deployment.yaml       | 4 +++-
 services/vault/scripts/vault_k8s_auth_configure.sh | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml
index 57862abb..bb9766f9 100644
--- a/services/maintenance/ariadne-deployment.yaml
+++ b/services/maintenance/ariadne-deployment.yaml
@@ -24,7 +24,9 @@ spec:
         vault.hashicorp.com/agent-inject-template-ariadne-env.sh: |
           {{ with secret "kv/data/atlas/maintenance/ariadne-db" }}
           export ARIADNE_DATABASE_URL="{{ .Data.data.database_url }}"
-          export PORTAL_DATABASE_URL="{{ .Data.data.database_url }}"
+          {{ end }}
+          {{ with secret "kv/data/atlas/portal/atlas-portal-db" }}
+          export PORTAL_DATABASE_URL="{{ .Data.data.PORTAL_DATABASE_URL }}"
           {{ end }}
           {{ with secret "kv/data/atlas/portal/bstein-dev-home-keycloak-admin" }}
           export KEYCLOAK_ADMIN_CLIENT_SECRET="{{ .Data.data.client_secret }}"
diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh
index 2fce3f4e..bc03cf4c 100644
--- a/services/vault/scripts/vault_k8s_auth_configure.sh
+++ b/services/vault/scripts/vault_k8s_auth_configure.sh
@@ -231,7 +231,7 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \
 write_policy_and_role "health" "health" "health-vault-sync" \
   "health/*" ""
 write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \
-  "maintenance/ariadne-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db vault/vault-oidc-config shared/harbor-pull" ""
+  "maintenance/ariadne-db portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db vault/vault-oidc-config shared/harbor-pull" ""
 write_policy_and_role "finance" "finance" "finance-vault" \
   "finance/* shared/postmark-relay" ""
 write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \

From f7f549e536d46736bb200a9f91132213459deeec Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 03:53:34 -0300
Subject: [PATCH 066/416] maintenance: bump ariadne image tag

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index a86453e1..fd544410 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,7 +26,7 @@ resources:
 
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-6 # {"$imagepolicy": "maintenance:ariadne"}
+    newTag: 0.1.0-10 # {"$imagepolicy": "maintenance:ariadne"}
 
 configMapGenerator:
   - name: disable-k3s-traefik-script

From 3d807570568c6f1314bb171821ad701098798ddd Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 04:05:41 -0300
Subject: [PATCH 067/416] maintenance: fix ariadne comms endpoints and exec
 RBAC

---
 services/maintenance/ariadne-deployment.yaml | 8 ++++----
 services/maintenance/ariadne-rbac.yaml       | 1 +
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml
index bb9766f9..069f3885 100644
--- a/services/maintenance/ariadne-deployment.yaml
+++ b/services/maintenance/ariadne-deployment.yaml
@@ -202,13 +202,13 @@ spec:
             - name: COMMS_NAMESPACE
               value: comms
             - name: COMMS_SYNAPSE_BASE
-              value: http://othrys-synapse-matrix-synapse:8008
+              value: http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008
             - name: COMMS_AUTH_BASE
-              value: http://matrix-authentication-service:8080
+              value: http://matrix-authentication-service.comms.svc.cluster.local:8080
             - name: COMMS_MAS_ADMIN_API_BASE
-              value: http://matrix-authentication-service:8081/api/admin/v1
+              value: http://matrix-authentication-service.comms.svc.cluster.local:8081/api/admin/v1
             - name: COMMS_MAS_TOKEN_URL
-              value: http://matrix-authentication-service:8080/oauth2/token
+              value: http://matrix-authentication-service.comms.svc.cluster.local:8080/oauth2/token
             - name: COMMS_MAS_ADMIN_CLIENT_ID
               value: 01KDXMVQBQ5JNY6SEJPZW6Z8BM
             - name: COMMS_SERVER_NAME
diff --git a/services/maintenance/ariadne-rbac.yaml b/services/maintenance/ariadne-rbac.yaml
index 8a063bf3..88689cb6 100644
--- a/services/maintenance/ariadne-rbac.yaml
+++ b/services/maintenance/ariadne-rbac.yaml
@@ -25,6 +25,7 @@ rules:
     resources:
       - pods/exec
     verbs:
+      - get
       - create
 
 ---

From 152c660b0a73e5c4465aaf8842d2476025af6b13 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 05:03:26 -0300
Subject: [PATCH 068/416] maintenance: bump ariadne image tag

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index fd544410..a53ffee5 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,7 +26,7 @@ resources:
 
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-10 # {"$imagepolicy": "maintenance:ariadne"}
+    newTag: 0.1.0-13 # {"$imagepolicy": "maintenance:ariadne"}
 
 configMapGenerator:
   - name: disable-k3s-traefik-script

From 94ad57e5a56de031f586e24af0f494e5bc9f5660 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 10:20:53 -0300
Subject: [PATCH 069/416] flux: align imagepolicy tag setters

---
 services/bstein-dev-home/kustomization.yaml | 4 ++--
 services/maintenance/kustomization.yaml     | 2 +-
 services/pegasus/deployment.yaml            | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index ec137dc6..26840ab7 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,9 +20,9 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-106 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"}
+    newTag: 0.1.1-106 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-107 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"}
+    newTag: 0.1.1-107 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home
diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index a53ffee5..daee5f14 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,7 +26,7 @@ resources:
 
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-13 # {"$imagepolicy": "maintenance:ariadne"}
+    newTag: 0.1.0-13 # {"$imagepolicy": "maintenance:ariadne:tag"}
 
 configMapGenerator:
   - name: disable-k3s-traefik-script
diff --git a/services/pegasus/deployment.yaml b/services/pegasus/deployment.yaml
index bc3db70a..b6a1639e 100644
--- a/services/pegasus/deployment.yaml
+++ b/services/pegasus/deployment.yaml
@@ -72,7 +72,7 @@ spec:
 
       containers:
         - name: pegasus
-          image: registry.bstein.dev/streaming/pegasus-vault:1.2.32 # {"$imagepolicy": "jellyfin:pegasus"}
+          image: registry.bstein.dev/streaming/pegasus-vault:1.2.32 # {"$imagepolicy": "jellyfin:pegasus:tag"}
           imagePullPolicy: Always
           env:
             - name: PEGASUS_MEDIA_ROOT

From 6a0872259b9f063e994829ea93a41af9f5beae0d Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 10:33:06 -0300
Subject: [PATCH 070/416] flux: align image automation namespaces

---
 .../applications/bstein-dev-home/image-automation.yaml          | 2 +-
 .../flux-system/applications/pegasus/image-automation.yaml      | 2 +-
 .../flux-system/platform/maintenance/image-automation.yaml      | 2 +-
 services/pegasus/kustomization.yaml                             | 1 +
 4 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml
index 643d4792..10d79132 100644
--- a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml
+++ b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml
@@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1
 kind: ImageUpdateAutomation
 metadata:
   name: bstein-dev-home
-  namespace: flux-system
+  namespace: bstein-dev-home
 spec:
   interval: 1m0s
   sourceRef:
diff --git a/clusters/atlas/flux-system/applications/pegasus/image-automation.yaml b/clusters/atlas/flux-system/applications/pegasus/image-automation.yaml
index ec0494e5..d11422a8 100644
--- a/clusters/atlas/flux-system/applications/pegasus/image-automation.yaml
+++ b/clusters/atlas/flux-system/applications/pegasus/image-automation.yaml
@@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1
 kind: ImageUpdateAutomation
 metadata:
   name: pegasus
-  namespace: flux-system
+  namespace: jellyfin
 spec:
   interval: 1m0s
   sourceRef:
diff --git a/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml
index 867cae48..9f3214b5 100644
--- a/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml
+++ b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml
@@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1
 kind: ImageUpdateAutomation
 metadata:
   name: maintenance
-  namespace: flux-system
+  namespace: maintenance
 spec:
   interval: 1m0s
   sourceRef:
diff --git a/services/pegasus/kustomization.yaml b/services/pegasus/kustomization.yaml
index bef2b405..05c3baa5 100644
--- a/services/pegasus/kustomization.yaml
+++ b/services/pegasus/kustomization.yaml
@@ -3,6 +3,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
   - configmap.yaml
+  - image.yaml
   - vault-serviceaccount.yaml
   - secretproviderclass.yaml
   - service.yaml

From d033d680a31f2b8215a508e6f5384365a8a175be Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 10:34:25 -0300
Subject: [PATCH 071/416] flux: fix image automation templates

---
 .../applications/bstein-dev-home/image-automation.yaml          | 2 +-
 .../flux-system/platform/maintenance/image-automation.yaml      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml
index 10d79132..8b2900c1 100644
--- a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml
+++ b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml
@@ -18,7 +18,7 @@ spec:
       author:
         email: ops@bstein.dev
         name: flux-bot
-      messageTemplate: "chore(bstein-dev-home): update images to {{range .Updated.Images}}{{.}}{{end}}"
+      messageTemplate: "chore(bstein-dev-home): update images to {{range .Changed.Images}}{{.}}{{end}}"
     push:
       branch: feature/ariadne
   update:
diff --git a/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml
index 9f3214b5..48e4c309 100644
--- a/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml
+++ b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml
@@ -18,7 +18,7 @@ spec:
       author:
         email: ops@bstein.dev
         name: flux-bot
-      messageTemplate: "chore(maintenance): update images to {{range .Updated.Images}}{{.}}{{end}}"
+      messageTemplate: "chore(maintenance): update images to {{range .Changed.Images}}{{.}}{{end}}"
     push:
       branch: feature/ariadne
   update:

From 1aadaf59ffadd820599c12e9f7fca126151b5f3d Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 10:35:29 -0300
Subject: [PATCH 072/416] flux: simplify image automation messages

---
 .../applications/bstein-dev-home/image-automation.yaml          | 2 +-
 .../flux-system/platform/maintenance/image-automation.yaml      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml
index 8b2900c1..f1d41be3 100644
--- a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml
+++ b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml
@@ -18,7 +18,7 @@ spec:
       author:
         email: ops@bstein.dev
         name: flux-bot
-      messageTemplate: "chore(bstein-dev-home): update images to {{range .Changed.Images}}{{.}}{{end}}"
+      messageTemplate: "chore(bstein-dev-home): automated image update"
     push:
       branch: feature/ariadne
   update:
diff --git a/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml
index 48e4c309..6e8f612c 100644
--- a/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml
+++ b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml
@@ -18,7 +18,7 @@ spec:
       author:
         email: ops@bstein.dev
         name: flux-bot
-      messageTemplate: "chore(maintenance): update images to {{range .Changed.Images}}{{.}}{{end}}"
+      messageTemplate: "chore(maintenance): automated image update"
     push:
       branch: feature/ariadne
   update:

From 0ed261a5df5ee30cf54e248eb3b56df20f3df7a5 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Wed, 21 Jan 2026 13:35:55 +0000
Subject: [PATCH 073/416] chore(maintenance): automated image update

---
 services/maintenance/kustomization.yaml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index daee5f14..088ce488 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -23,11 +23,9 @@ resources:
   - node-image-sweeper-serviceaccount.yaml
   - node-image-sweeper-daemonset.yaml
   - image-sweeper-cronjob.yaml
-
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-13 # {"$imagepolicy": "maintenance:ariadne:tag"}
-
+    newTag: 0.1.0-15 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From 3ffb1b8a209e3e5d0891866c8eaeeee87c5493ff Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Wed, 21 Jan 2026 13:36:39 +0000
Subject: [PATCH 074/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 26840ab7..78f5e685 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,9 +20,9 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-106 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-108 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-107 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-108 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From b05a76eb0795dffffce2a2cf99041340a3699be8 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Wed, 21 Jan 2026 14:04:54 +0000
Subject: [PATCH 075/416] chore(maintenance): automated image update

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 088ce488..05f3be2f 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -25,7 +25,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-15 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-16 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From aaeb933625e707675a249814aaa17a2999a8c63b Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 11:29:29 -0300
Subject: [PATCH 076/416] monitoring: refresh testing dashboard

---
 .gitignore                                    |   1 +
 scripts/dashboards_render_atlas.py            | 200 ++--
 .../monitoring/dashboards/atlas-testing.json  | 896 ++++++++++++------
 .../monitoring/grafana-dashboard-testing.yaml | 896 ++++++++++++------
 4 files changed, 1401 insertions(+), 592 deletions(-)

diff --git a/.gitignore b/.gitignore
index 8d0ab1e9..7543bbfb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,5 @@ __pycache__/
 *.py[cod]
 .pytest_cache
 .venv
+.venv-ci
 tmp/
diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index 509cf493..6eaafb46 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -339,6 +339,9 @@ GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})"
 ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))'
 ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))'
 ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))'
+ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))'
+ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))'
+ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))'
 ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600"
 ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600"
 ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
@@ -696,8 +699,10 @@ def bargauge_panel(
     grid,
     *,
     unit="none",
+    legend=None,
     links=None,
     limit=None,
+    sort_order="desc",
     thresholds=None,
     decimals=None,
     instant=False,
@@ -710,7 +715,12 @@ def bargauge_panel(
         "datasource": PROM_DS,
         "gridPos": grid,
         "targets": [
-            {"expr": expr, "refId": "A", "legendFormat": "{{node}}", **({"instant": True} if instant else {})}
+            {
+                "expr": expr,
+                "refId": "A",
+                "legendFormat": legend or "{{node}}",
+                **({"instant": True} if instant else {}),
+            }
         ],
         "fieldConfig": {
             "defaults": {
@@ -748,7 +758,7 @@ def bargauge_panel(
     panel["transformations"] = [
         {
             "id": "sortBy",
-            "options": {"fields": ["Value"], "order": "desc"},
+            "options": {"fields": ["Value"], "order": sort_order},
         }
     ]
     if limit:
@@ -2163,7 +2173,24 @@ def build_mail_dashboard():
 
 def build_testing_dashboard():
     panels = []
-    sort_desc = [{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}]
+    age_thresholds = {
+        "mode": "absolute",
+        "steps": [
+            {"color": "green", "value": None},
+            {"color": "yellow", "value": 6},
+            {"color": "orange", "value": 24},
+            {"color": "red", "value": 48},
+        ],
+    }
+    recent_error_thresholds = {
+        "mode": "absolute",
+        "steps": [
+            {"color": "red", "value": None},
+            {"color": "orange", "value": 1},
+            {"color": "yellow", "value": 6},
+            {"color": "green", "value": 24},
+        ],
+    }
 
     panels.append(
         stat_panel(
@@ -2184,66 +2211,56 @@ def build_testing_dashboard():
         )
     )
     panels.append(
-        table_panel(
+        stat_panel(
             2,
             "Glue Jobs Missing Success",
-            GLUE_MISSING_ACTIVE,
-            {"h": 4, "w": 6, "x": 6, "y": 0},
+            GLUE_MISSING_COUNT,
+            {"h": 4, "w": 4, "x": 4, "y": 0},
             unit="none",
-            transformations=sort_desc,
-            instant=True,
         )
     )
     panels.append(
-        table_panel(
+        stat_panel(
             3,
             "Glue Jobs Suspended",
-            GLUE_SUSPENDED,
-            {"h": 4, "w": 6, "x": 12, "y": 0},
+            GLUE_SUSPENDED_COUNT,
+            {"h": 4, "w": 4, "x": 8, "y": 0},
             unit="none",
-            transformations=sort_desc,
-            instant=True,
         )
     )
     panels.append(
-        table_panel(
+        stat_panel(
             4,
-            "Glue Jobs Active Runs",
-            GLUE_ACTIVE,
-            {"h": 4, "w": 6, "x": 18, "y": 0},
+            "Ariadne Task Errors (1h)",
+            ARIADNE_TASK_ERRORS_1H_TOTAL,
+            {"h": 4, "w": 4, "x": 12, "y": 0},
             unit="none",
-            transformations=sort_desc,
-            instant=True,
         )
     )
     panels.append(
-        table_panel(
+        stat_panel(
             5,
-            "Glue Jobs Last Success (hours ago)",
-            GLUE_LAST_SUCCESS_AGE_HOURS,
-            {"h": 8, "w": 12, "x": 0, "y": 4},
-            unit="h",
-            transformations=sort_desc,
-            instant=True,
+            "Ariadne Task Errors (24h)",
+            ARIADNE_TASK_ERRORS_24H_TOTAL,
+            {"h": 4, "w": 4, "x": 16, "y": 0},
+            unit="none",
         )
     )
     panels.append(
-        table_panel(
+        stat_panel(
             6,
-            "Glue Jobs Last Schedule (hours ago)",
-            GLUE_LAST_SCHEDULE_AGE_HOURS,
-            {"h": 8, "w": 12, "x": 12, "y": 4},
-            unit="h",
-            transformations=sort_desc,
-            instant=True,
+            "Ariadne Task Runs (1h)",
+            ARIADNE_TASK_RUNS_1H_TOTAL,
+            {"h": 4, "w": 4, "x": 20, "y": 0},
+            unit="none",
         )
     )
     panels.append(
         timeseries_panel(
-            12,
+            7,
             "Ariadne Task Runs vs Errors (1h)",
             ARIADNE_TASK_RUNS_BY_STATUS_1H,
-            {"h": 6, "w": 24, "x": 0, "y": 12},
+            {"h": 6, "w": 24, "x": 0, "y": 4},
             unit="none",
             legend="{{status}}",
             legend_display="table",
@@ -2251,55 +2268,110 @@ def build_testing_dashboard():
         )
     )
     panels.append(
-        table_panel(
-            7,
+        bargauge_panel(
+            8,
             "Ariadne Task Errors (24h)",
             ARIADNE_TASK_ERRORS_24H,
-            {"h": 6, "w": 12, "x": 0, "y": 18},
+            {"h": 8, "w": 12, "x": 0, "y": 10},
             unit="none",
-            transformations=sort_desc,
             instant=True,
+            legend="{{task}}",
+            thresholds={
+                "mode": "absolute",
+                "steps": [
+                    {"color": "green", "value": None},
+                    {"color": "yellow", "value": 1},
+                    {"color": "orange", "value": 3},
+                    {"color": "red", "value": 5},
+                ],
+            },
         )
     )
     panels.append(
-        table_panel(
-            8,
-            "Ariadne Schedule Last Success (hours ago)",
-            ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS,
-            {"h": 6, "w": 12, "x": 12, "y": 18},
-            unit="h",
-            transformations=sort_desc,
-            instant=True,
-        )
-    )
-    panels.append(
-        table_panel(
+        bargauge_panel(
             9,
-            "Ariadne Access Requests",
-            ARIADNE_ACCESS_REQUESTS,
-            {"h": 6, "w": 12, "x": 12, "y": 24},
+            "Ariadne Task Success (24h)",
+            ARIADNE_TASK_SUCCESS_24H,
+            {"h": 8, "w": 12, "x": 12, "y": 10},
             unit="none",
-            transformations=sort_desc,
             instant=True,
+            legend="{{task}}",
+            thresholds={
+                "mode": "absolute",
+                "steps": [
+                    {"color": "red", "value": None},
+                    {"color": "orange", "value": 1},
+                    {"color": "yellow", "value": 5},
+                    {"color": "green", "value": 10},
+                ],
+            },
         )
     )
     panels.append(
-        table_panel(
-            13,
+        bargauge_panel(
+            10,
             "Ariadne Schedule Last Error (hours ago)",
             ARIADNE_SCHEDULE_LAST_ERROR_HOURS,
-            {"h": 6, "w": 12, "x": 0, "y": 24},
+            {"h": 8, "w": 12, "x": 0, "y": 18},
             unit="h",
-            transformations=sort_desc,
             instant=True,
+            legend="{{task}}",
+            thresholds=recent_error_thresholds,
+        )
+    )
+    panels.append(
+        bargauge_panel(
+            11,
+            "Ariadne Schedule Last Success (hours ago)",
+            ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS,
+            {"h": 8, "w": 12, "x": 12, "y": 18},
+            unit="h",
+            instant=True,
+            legend="{{task}}",
+            thresholds=age_thresholds,
+        )
+    )
+    panels.append(
+        bargauge_panel(
+            12,
+            "Glue Jobs Last Success (hours ago)",
+            GLUE_LAST_SUCCESS_AGE_HOURS,
+            {"h": 8, "w": 12, "x": 0, "y": 26},
+            unit="h",
+            instant=True,
+            legend="{{namespace}}/{{cronjob}}",
+            thresholds=age_thresholds,
+        )
+    )
+    panels.append(
+        bargauge_panel(
+            13,
+            "Glue Jobs Last Schedule (hours ago)",
+            GLUE_LAST_SCHEDULE_AGE_HOURS,
+            {"h": 8, "w": 12, "x": 12, "y": 26},
+            unit="h",
+            instant=True,
+            legend="{{namespace}}/{{cronjob}}",
+            thresholds=age_thresholds,
+        )
+    )
+    panels.append(
+        bargauge_panel(
+            14,
+            "Ariadne Access Requests",
+            ARIADNE_ACCESS_REQUESTS,
+            {"h": 6, "w": 8, "x": 0, "y": 34},
+            unit="none",
+            instant=True,
+            legend="{{status}}",
         )
     )
     panels.append(
         stat_panel(
-            10,
+            15,
             "Ariadne CI Coverage (%)",
             ARIADNE_CI_COVERAGE,
-            {"h": 4, "w": 6, "x": 0, "y": 30},
+            {"h": 6, "w": 4, "x": 8, "y": 34},
             unit="percent",
             decimals=1,
             instant=True,
@@ -2308,12 +2380,12 @@ def build_testing_dashboard():
     )
     panels.append(
         table_panel(
-            11,
+            16,
             "Ariadne CI Tests (latest)",
             ARIADNE_CI_TESTS,
-            {"h": 6, "w": 18, "x": 6, "y": 30},
+            {"h": 6, "w": 12, "x": 12, "y": 34},
             unit="none",
-            transformations=sort_desc,
+            transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}],
             instant=True,
         )
     )
diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-testing.json
index 207077ef..420abf26 100644
--- a/services/monitoring/dashboards/atlas-testing.json
+++ b/services/monitoring/dashboards/atlas-testing.json
@@ -74,7 +74,7 @@
     },
     {
       "id": 2,
-      "type": "table",
+      "type": "stat",
       "title": "Glue Jobs Missing Success",
       "datasource": {
         "type": "prometheus",
@@ -82,49 +82,59 @@
       },
       "gridPos": {
         "h": 4,
-        "w": 6,
-        "x": 6,
+        "w": 4,
+        "x": 4,
         "y": 0
       },
       "targets": [
         {
-          "expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)",
-          "refId": "A",
-          "instant": true
+          "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))",
+          "refId": "A"
         }
       ],
       "fieldConfig": {
         "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
           "unit": "none",
           "custom": {
-            "filterable": true
+            "displayMode": "auto"
           }
         },
         "overrides": []
       },
       "options": {
-        "showHeader": true,
-        "columnFilters": false
-      },
-      "transformations": [
-        {
-          "id": "labelsToFields",
-          "options": {}
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
         },
-        {
-          "id": "sortBy",
-          "options": {
-            "fields": [
-              "Value"
-            ],
-            "order": "desc"
-          }
-        }
-      ]
+        "textMode": "value"
+      }
     },
     {
       "id": 3,
-      "type": "table",
+      "type": "stat",
       "title": "Glue Jobs Suspended",
       "datasource": {
         "type": "prometheus",
@@ -132,198 +142,238 @@
       },
       "gridPos": {
         "h": 4,
-        "w": 6,
-        "x": 12,
+        "w": 4,
+        "x": 8,
         "y": 0
       },
       "targets": [
         {
-          "expr": "(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1",
-          "refId": "A",
-          "instant": true
+          "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)",
+          "refId": "A"
         }
       ],
       "fieldConfig": {
         "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
           "unit": "none",
           "custom": {
-            "filterable": true
+            "displayMode": "auto"
           }
         },
         "overrides": []
       },
       "options": {
-        "showHeader": true,
-        "columnFilters": false
-      },
-      "transformations": [
-        {
-          "id": "labelsToFields",
-          "options": {}
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
         },
-        {
-          "id": "sortBy",
-          "options": {
-            "fields": [
-              "Value"
-            ],
-            "order": "desc"
-          }
-        }
-      ]
+        "textMode": "value"
+      }
     },
     {
       "id": 4,
-      "type": "table",
-      "title": "Glue Jobs Active Runs",
+      "type": "stat",
+      "title": "Ariadne Task Errors (1h)",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
       },
       "gridPos": {
         "h": 4,
-        "w": 6,
-        "x": 18,
+        "w": 4,
+        "x": 12,
         "y": 0
       },
       "targets": [
         {
-          "expr": "(kube_cronjob_status_active and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})",
-          "refId": "A",
-          "instant": true
+          "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
+          "refId": "A"
         }
       ],
       "fieldConfig": {
         "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
           "unit": "none",
           "custom": {
-            "filterable": true
+            "displayMode": "auto"
           }
         },
         "overrides": []
       },
       "options": {
-        "showHeader": true,
-        "columnFilters": false
-      },
-      "transformations": [
-        {
-          "id": "labelsToFields",
-          "options": {}
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
         },
-        {
-          "id": "sortBy",
-          "options": {
-            "fields": [
-              "Value"
-            ],
-            "order": "desc"
-          }
-        }
-      ]
+        "textMode": "value"
+      }
     },
     {
       "id": 5,
-      "type": "table",
-      "title": "Glue Jobs Last Success (hours ago)",
+      "type": "stat",
+      "title": "Ariadne Task Errors (24h)",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 8,
-        "w": 12,
-        "x": 0,
-        "y": 4
+        "h": 4,
+        "w": 4,
+        "x": 16,
+        "y": 0
       },
       "targets": [
         {
-          "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
-          "refId": "A",
-          "instant": true
+          "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))",
+          "refId": "A"
         }
       ],
       "fieldConfig": {
         "defaults": {
-          "unit": "h",
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "none",
           "custom": {
-            "filterable": true
+            "displayMode": "auto"
           }
         },
         "overrides": []
       },
       "options": {
-        "showHeader": true,
-        "columnFilters": false
-      },
-      "transformations": [
-        {
-          "id": "labelsToFields",
-          "options": {}
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
         },
-        {
-          "id": "sortBy",
-          "options": {
-            "fields": [
-              "Value"
-            ],
-            "order": "desc"
-          }
-        }
-      ]
+        "textMode": "value"
+      }
     },
     {
       "id": 6,
-      "type": "table",
-      "title": "Glue Jobs Last Schedule (hours ago)",
+      "type": "stat",
+      "title": "Ariadne Task Runs (1h)",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 8,
-        "w": 12,
-        "x": 12,
-        "y": 4
+        "h": 4,
+        "w": 4,
+        "x": 20,
+        "y": 0
       },
       "targets": [
         {
-          "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
-          "refId": "A",
-          "instant": true
+          "expr": "sum(increase(ariadne_task_runs_total[1h]))",
+          "refId": "A"
         }
       ],
       "fieldConfig": {
         "defaults": {
-          "unit": "h",
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "none",
           "custom": {
-            "filterable": true
+            "displayMode": "auto"
           }
         },
         "overrides": []
       },
       "options": {
-        "showHeader": true,
-        "columnFilters": false
-      },
-      "transformations": [
-        {
-          "id": "labelsToFields",
-          "options": {}
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
         },
-        {
-          "id": "sortBy",
-          "options": {
-            "fields": [
-              "Value"
-            ],
-            "order": "desc"
-          }
-        }
-      ]
+        "textMode": "value"
+      }
     },
     {
-      "id": 12,
+      "id": 7,
       "type": "timeseries",
       "title": "Ariadne Task Runs vs Errors (1h)",
       "datasource": {
@@ -334,7 +384,7 @@
         "h": 6,
         "w": 24,
         "x": 0,
-        "y": 12
+        "y": 4
       },
       "targets": [
         {
@@ -360,94 +410,68 @@
       }
     },
     {
-      "id": 7,
-      "type": "table",
+      "id": 8,
+      "type": "bargauge",
       "title": "Ariadne Task Errors (24h)",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 6,
+        "h": 8,
         "w": 12,
         "x": 0,
-        "y": 18
+        "y": 10
       },
       "targets": [
         {
           "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[24h]))",
           "refId": "A",
+          "legendFormat": "{{task}}",
           "instant": true
         }
       ],
       "fieldConfig": {
         "defaults": {
           "unit": "none",
-          "custom": {
-            "filterable": true
+          "min": 0,
+          "max": null,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 1
+              },
+              {
+                "color": "orange",
+                "value": 3
+              },
+              {
+                "color": "red",
+                "value": 5
+              }
+            ]
           }
         },
         "overrides": []
       },
       "options": {
-        "showHeader": true,
-        "columnFilters": false
+        "displayMode": "gradient",
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
       },
       "transformations": [
-        {
-          "id": "labelsToFields",
-          "options": {}
-        },
-        {
-          "id": "sortBy",
-          "options": {
-            "fields": [
-              "Value"
-            ],
-            "order": "desc"
-          }
-        }
-      ]
-    },
-    {
-      "id": 8,
-      "type": "table",
-      "title": "Ariadne Schedule Last Success (hours ago)",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 6,
-        "w": 12,
-        "x": 12,
-        "y": 18
-      },
-      "targets": [
-        {
-          "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600",
-          "refId": "A",
-          "instant": true
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "unit": "h",
-          "custom": {
-            "filterable": true
-          }
-        },
-        "overrides": []
-      },
-      "options": {
-        "showHeader": true,
-        "columnFilters": false
-      },
-      "transformations": [
-        {
-          "id": "labelsToFields",
-          "options": {}
-        },
         {
           "id": "sortBy",
           "options": {
@@ -461,93 +485,67 @@
     },
     {
       "id": 9,
-      "type": "table",
-      "title": "Ariadne Access Requests",
+      "type": "bargauge",
+      "title": "Ariadne Task Success (24h)",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 6,
+        "h": 8,
         "w": 12,
         "x": 12,
-        "y": 24
+        "y": 10
       },
       "targets": [
         {
-          "expr": "ariadne_access_requests_total",
+          "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"ok\"}[24h]))",
           "refId": "A",
+          "legendFormat": "{{task}}",
           "instant": true
         }
       ],
       "fieldConfig": {
         "defaults": {
           "unit": "none",
-          "custom": {
-            "filterable": true
+          "min": 0,
+          "max": null,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "red",
+                "value": null
+              },
+              {
+                "color": "orange",
+                "value": 1
+              },
+              {
+                "color": "yellow",
+                "value": 5
+              },
+              {
+                "color": "green",
+                "value": 10
+              }
+            ]
           }
         },
         "overrides": []
       },
       "options": {
-        "showHeader": true,
-        "columnFilters": false
+        "displayMode": "gradient",
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
       },
       "transformations": [
-        {
-          "id": "labelsToFields",
-          "options": {}
-        },
-        {
-          "id": "sortBy",
-          "options": {
-            "fields": [
-              "Value"
-            ],
-            "order": "desc"
-          }
-        }
-      ]
-    },
-    {
-      "id": 13,
-      "type": "table",
-      "title": "Ariadne Schedule Last Error (hours ago)",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 6,
-        "w": 12,
-        "x": 0,
-        "y": 24
-      },
-      "targets": [
-        {
-          "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600",
-          "refId": "A",
-          "instant": true
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "unit": "h",
-          "custom": {
-            "filterable": true
-          }
-        },
-        "overrides": []
-      },
-      "options": {
-        "showHeader": true,
-        "columnFilters": false
-      },
-      "transformations": [
-        {
-          "id": "labelsToFields",
-          "options": {}
-        },
         {
           "id": "sortBy",
           "options": {
@@ -561,6 +559,376 @@
     },
     {
       "id": 10,
+      "type": "bargauge",
+      "title": "Ariadne Schedule Last Error (hours ago)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 18
+      },
+      "targets": [
+        {
+          "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600",
+          "refId": "A",
+          "legendFormat": "{{task}}",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "h",
+          "min": 0,
+          "max": null,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "red",
+                "value": null
+              },
+              {
+                "color": "orange",
+                "value": 1
+              },
+              {
+                "color": "yellow",
+                "value": 6
+              },
+              {
+                "color": "green",
+                "value": 24
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "displayMode": "gradient",
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
+      },
+      "transformations": [
+        {
+          "id": "sortBy",
+          "options": {
+            "fields": [
+              "Value"
+            ],
+            "order": "desc"
+          }
+        }
+      ]
+    },
+    {
+      "id": 11,
+      "type": "bargauge",
+      "title": "Ariadne Schedule Last Success (hours ago)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 18
+      },
+      "targets": [
+        {
+          "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600",
+          "refId": "A",
+          "legendFormat": "{{task}}",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "h",
+          "min": 0,
+          "max": null,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 6
+              },
+              {
+                "color": "orange",
+                "value": 24
+              },
+              {
+                "color": "red",
+                "value": 48
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "displayMode": "gradient",
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
+      },
+      "transformations": [
+        {
+          "id": "sortBy",
+          "options": {
+            "fields": [
+              "Value"
+            ],
+            "order": "desc"
+          }
+        }
+      ]
+    },
+    {
+      "id": 12,
+      "type": "bargauge",
+      "title": "Glue Jobs Last Success (hours ago)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 26
+      },
+      "targets": [
+        {
+          "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
+          "refId": "A",
+          "legendFormat": "{{namespace}}/{{cronjob}}",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "h",
+          "min": 0,
+          "max": null,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 6
+              },
+              {
+                "color": "orange",
+                "value": 24
+              },
+              {
+                "color": "red",
+                "value": 48
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "displayMode": "gradient",
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
+      },
+      "transformations": [
+        {
+          "id": "sortBy",
+          "options": {
+            "fields": [
+              "Value"
+            ],
+            "order": "desc"
+          }
+        }
+      ]
+    },
+    {
+      "id": 13,
+      "type": "bargauge",
+      "title": "Glue Jobs Last Schedule (hours ago)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 26
+      },
+      "targets": [
+        {
+          "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
+          "refId": "A",
+          "legendFormat": "{{namespace}}/{{cronjob}}",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "h",
+          "min": 0,
+          "max": null,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 6
+              },
+              {
+                "color": "orange",
+                "value": 24
+              },
+              {
+                "color": "red",
+                "value": 48
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "displayMode": "gradient",
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
+      },
+      "transformations": [
+        {
+          "id": "sortBy",
+          "options": {
+            "fields": [
+              "Value"
+            ],
+            "order": "desc"
+          }
+        }
+      ]
+    },
+    {
+      "id": 14,
+      "type": "bargauge",
+      "title": "Ariadne Access Requests",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 8,
+        "x": 0,
+        "y": 34
+      },
+      "targets": [
+        {
+          "expr": "ariadne_access_requests_total",
+          "refId": "A",
+          "legendFormat": "{{status}}",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "none",
+          "min": 0,
+          "max": null,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 50
+              },
+              {
+                "color": "orange",
+                "value": 70
+              },
+              {
+                "color": "red",
+                "value": 85
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "displayMode": "gradient",
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
+      },
+      "transformations": [
+        {
+          "id": "sortBy",
+          "options": {
+            "fields": [
+              "Value"
+            ],
+            "order": "desc"
+          }
+        }
+      ]
+    },
+    {
+      "id": 15,
       "type": "stat",
       "title": "Ariadne CI Coverage (%)",
       "datasource": {
@@ -568,10 +936,10 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 4,
-        "w": 6,
-        "x": 0,
-        "y": 30
+        "h": 6,
+        "w": 4,
+        "x": 8,
+        "y": 34
       },
       "targets": [
         {
@@ -623,7 +991,7 @@
       }
     },
     {
-      "id": 11,
+      "id": 16,
       "type": "table",
       "title": "Ariadne CI Tests (latest)",
       "datasource": {
@@ -632,9 +1000,9 @@
       },
       "gridPos": {
         "h": 6,
-        "w": 18,
-        "x": 6,
-        "y": 30
+        "w": 12,
+        "x": 12,
+        "y": 34
       },
       "targets": [
         {
diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-testing.yaml
index 362751bb..52b28367 100644
--- a/services/monitoring/grafana-dashboard-testing.yaml
+++ b/services/monitoring/grafana-dashboard-testing.yaml
@@ -83,7 +83,7 @@ data:
         },
         {
           "id": 2,
-          "type": "table",
+          "type": "stat",
           "title": "Glue Jobs Missing Success",
           "datasource": {
             "type": "prometheus",
@@ -91,49 +91,59 @@ data:
           },
           "gridPos": {
             "h": 4,
-            "w": 6,
-            "x": 6,
+            "w": 4,
+            "x": 4,
             "y": 0
           },
           "targets": [
             {
-              "expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)",
-              "refId": "A",
-              "instant": true
+              "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))",
+              "refId": "A"
             }
           ],
           "fieldConfig": {
             "defaults": {
+              "color": {
+                "mode": "thresholds"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
+                ]
+              },
               "unit": "none",
               "custom": {
-                "filterable": true
+                "displayMode": "auto"
               }
             },
             "overrides": []
           },
           "options": {
-            "showHeader": true,
-            "columnFilters": false
-          },
-          "transformations": [
-            {
-              "id": "labelsToFields",
-              "options": {}
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
             },
-            {
-              "id": "sortBy",
-              "options": {
-                "fields": [
-                  "Value"
-                ],
-                "order": "desc"
-              }
-            }
-          ]
+            "textMode": "value"
+          }
         },
         {
           "id": 3,
-          "type": "table",
+          "type": "stat",
           "title": "Glue Jobs Suspended",
           "datasource": {
             "type": "prometheus",
@@ -141,198 +151,238 @@ data:
           },
           "gridPos": {
             "h": 4,
-            "w": 6,
-            "x": 12,
+            "w": 4,
+            "x": 8,
             "y": 0
           },
           "targets": [
             {
-              "expr": "(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1",
-              "refId": "A",
-              "instant": true
+              "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)",
+              "refId": "A"
             }
           ],
           "fieldConfig": {
             "defaults": {
+              "color": {
+                "mode": "thresholds"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
+                ]
+              },
               "unit": "none",
               "custom": {
-                "filterable": true
+                "displayMode": "auto"
               }
             },
             "overrides": []
           },
           "options": {
-            "showHeader": true,
-            "columnFilters": false
-          },
-          "transformations": [
-            {
-              "id": "labelsToFields",
-              "options": {}
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
             },
-            {
-              "id": "sortBy",
-              "options": {
-                "fields": [
-                  "Value"
-                ],
-                "order": "desc"
-              }
-            }
-          ]
+            "textMode": "value"
+          }
         },
         {
           "id": 4,
-          "type": "table",
-          "title": "Glue Jobs Active Runs",
+          "type": "stat",
+          "title": "Ariadne Task Errors (1h)",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
             "h": 4,
-            "w": 6,
-            "x": 18,
+            "w": 4,
+            "x": 12,
             "y": 0
           },
           "targets": [
             {
-              "expr": "(kube_cronjob_status_active and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})",
-              "refId": "A",
-              "instant": true
+              "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
+              "refId": "A"
             }
           ],
           "fieldConfig": {
             "defaults": {
+              "color": {
+                "mode": "thresholds"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
+                ]
+              },
               "unit": "none",
               "custom": {
-                "filterable": true
+                "displayMode": "auto"
               }
             },
             "overrides": []
           },
           "options": {
-            "showHeader": true,
-            "columnFilters": false
-          },
-          "transformations": [
-            {
-              "id": "labelsToFields",
-              "options": {}
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
             },
-            {
-              "id": "sortBy",
-              "options": {
-                "fields": [
-                  "Value"
-                ],
-                "order": "desc"
-              }
-            }
-          ]
+            "textMode": "value"
+          }
         },
         {
           "id": 5,
-          "type": "table",
-          "title": "Glue Jobs Last Success (hours ago)",
+          "type": "stat",
+          "title": "Ariadne Task Errors (24h)",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 8,
-            "w": 12,
-            "x": 0,
-            "y": 4
+            "h": 4,
+            "w": 4,
+            "x": 16,
+            "y": 0
           },
           "targets": [
             {
-              "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
-              "refId": "A",
-              "instant": true
+              "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))",
+              "refId": "A"
             }
           ],
           "fieldConfig": {
             "defaults": {
-              "unit": "h",
+              "color": {
+                "mode": "thresholds"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "none",
               "custom": {
-                "filterable": true
+                "displayMode": "auto"
               }
             },
             "overrides": []
           },
           "options": {
-            "showHeader": true,
-            "columnFilters": false
-          },
-          "transformations": [
-            {
-              "id": "labelsToFields",
-              "options": {}
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
             },
-            {
-              "id": "sortBy",
-              "options": {
-                "fields": [
-                  "Value"
-                ],
-                "order": "desc"
-              }
-            }
-          ]
+            "textMode": "value"
+          }
         },
         {
           "id": 6,
-          "type": "table",
-          "title": "Glue Jobs Last Schedule (hours ago)",
+          "type": "stat",
+          "title": "Ariadne Task Runs (1h)",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 8,
-            "w": 12,
-            "x": 12,
-            "y": 4
+            "h": 4,
+            "w": 4,
+            "x": 20,
+            "y": 0
           },
           "targets": [
             {
-              "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
-              "refId": "A",
-              "instant": true
+              "expr": "sum(increase(ariadne_task_runs_total[1h]))",
+              "refId": "A"
             }
           ],
           "fieldConfig": {
             "defaults": {
-              "unit": "h",
+              "color": {
+                "mode": "thresholds"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "none",
               "custom": {
-                "filterable": true
+                "displayMode": "auto"
               }
             },
             "overrides": []
           },
           "options": {
-            "showHeader": true,
-            "columnFilters": false
-          },
-          "transformations": [
-            {
-              "id": "labelsToFields",
-              "options": {}
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
             },
-            {
-              "id": "sortBy",
-              "options": {
-                "fields": [
-                  "Value"
-                ],
-                "order": "desc"
-              }
-            }
-          ]
+            "textMode": "value"
+          }
         },
         {
-          "id": 12,
+          "id": 7,
           "type": "timeseries",
           "title": "Ariadne Task Runs vs Errors (1h)",
           "datasource": {
@@ -343,7 +393,7 @@ data:
             "h": 6,
             "w": 24,
             "x": 0,
-            "y": 12
+            "y": 4
           },
           "targets": [
             {
@@ -369,94 +419,68 @@ data:
           }
         },
         {
-          "id": 7,
-          "type": "table",
+          "id": 8,
+          "type": "bargauge",
           "title": "Ariadne Task Errors (24h)",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 6,
+            "h": 8,
             "w": 12,
             "x": 0,
-            "y": 18
+            "y": 10
           },
           "targets": [
             {
               "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[24h]))",
               "refId": "A",
+              "legendFormat": "{{task}}",
               "instant": true
             }
           ],
           "fieldConfig": {
             "defaults": {
               "unit": "none",
-              "custom": {
-                "filterable": true
+              "min": 0,
+              "max": null,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 1
+                  },
+                  {
+                    "color": "orange",
+                    "value": 3
+                  },
+                  {
+                    "color": "red",
+                    "value": 5
+                  }
+                ]
               }
             },
             "overrides": []
           },
           "options": {
-            "showHeader": true,
-            "columnFilters": false
+            "displayMode": "gradient",
+            "orientation": "horizontal",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
           },
           "transformations": [
-            {
-              "id": "labelsToFields",
-              "options": {}
-            },
-            {
-              "id": "sortBy",
-              "options": {
-                "fields": [
-                  "Value"
-                ],
-                "order": "desc"
-              }
-            }
-          ]
-        },
-        {
-          "id": 8,
-          "type": "table",
-          "title": "Ariadne Schedule Last Success (hours ago)",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 6,
-            "w": 12,
-            "x": 12,
-            "y": 18
-          },
-          "targets": [
-            {
-              "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600",
-              "refId": "A",
-              "instant": true
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "h",
-              "custom": {
-                "filterable": true
-              }
-            },
-            "overrides": []
-          },
-          "options": {
-            "showHeader": true,
-            "columnFilters": false
-          },
-          "transformations": [
-            {
-              "id": "labelsToFields",
-              "options": {}
-            },
             {
               "id": "sortBy",
               "options": {
@@ -470,93 +494,67 @@ data:
         },
         {
           "id": 9,
-          "type": "table",
-          "title": "Ariadne Access Requests",
+          "type": "bargauge",
+          "title": "Ariadne Task Success (24h)",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 6,
+            "h": 8,
             "w": 12,
             "x": 12,
-            "y": 24
+            "y": 10
           },
           "targets": [
             {
-              "expr": "ariadne_access_requests_total",
+              "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"ok\"}[24h]))",
               "refId": "A",
+              "legendFormat": "{{task}}",
               "instant": true
             }
           ],
           "fieldConfig": {
             "defaults": {
               "unit": "none",
-              "custom": {
-                "filterable": true
+              "min": 0,
+              "max": null,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "red",
+                    "value": null
+                  },
+                  {
+                    "color": "orange",
+                    "value": 1
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 5
+                  },
+                  {
+                    "color": "green",
+                    "value": 10
+                  }
+                ]
               }
             },
             "overrides": []
           },
           "options": {
-            "showHeader": true,
-            "columnFilters": false
+            "displayMode": "gradient",
+            "orientation": "horizontal",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
           },
           "transformations": [
-            {
-              "id": "labelsToFields",
-              "options": {}
-            },
-            {
-              "id": "sortBy",
-              "options": {
-                "fields": [
-                  "Value"
-                ],
-                "order": "desc"
-              }
-            }
-          ]
-        },
-        {
-          "id": 13,
-          "type": "table",
-          "title": "Ariadne Schedule Last Error (hours ago)",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 6,
-            "w": 12,
-            "x": 0,
-            "y": 24
-          },
-          "targets": [
-            {
-              "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600",
-              "refId": "A",
-              "instant": true
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "h",
-              "custom": {
-                "filterable": true
-              }
-            },
-            "overrides": []
-          },
-          "options": {
-            "showHeader": true,
-            "columnFilters": false
-          },
-          "transformations": [
-            {
-              "id": "labelsToFields",
-              "options": {}
-            },
             {
               "id": "sortBy",
               "options": {
@@ -570,6 +568,376 @@ data:
         },
         {
           "id": 10,
+          "type": "bargauge",
+          "title": "Ariadne Schedule Last Error (hours ago)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 18
+          },
+          "targets": [
+            {
+              "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600",
+              "refId": "A",
+              "legendFormat": "{{task}}",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "h",
+              "min": 0,
+              "max": null,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "red",
+                    "value": null
+                  },
+                  {
+                    "color": "orange",
+                    "value": 1
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 6
+                  },
+                  {
+                    "color": "green",
+                    "value": 24
+                  }
+                ]
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "displayMode": "gradient",
+            "orientation": "horizontal",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          },
+          "transformations": [
+            {
+              "id": "sortBy",
+              "options": {
+                "fields": [
+                  "Value"
+                ],
+                "order": "desc"
+              }
+            }
+          ]
+        },
+        {
+          "id": 11,
+          "type": "bargauge",
+          "title": "Ariadne Schedule Last Success (hours ago)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 18
+          },
+          "targets": [
+            {
+              "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600",
+              "refId": "A",
+              "legendFormat": "{{task}}",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "h",
+              "min": 0,
+              "max": null,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 6
+                  },
+                  {
+                    "color": "orange",
+                    "value": 24
+                  },
+                  {
+                    "color": "red",
+                    "value": 48
+                  }
+                ]
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "displayMode": "gradient",
+            "orientation": "horizontal",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          },
+          "transformations": [
+            {
+              "id": "sortBy",
+              "options": {
+                "fields": [
+                  "Value"
+                ],
+                "order": "desc"
+              }
+            }
+          ]
+        },
+        {
+          "id": 12,
+          "type": "bargauge",
+          "title": "Glue Jobs Last Success (hours ago)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 26
+          },
+          "targets": [
+            {
+              "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
+              "refId": "A",
+              "legendFormat": "{{namespace}}/{{cronjob}}",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "h",
+              "min": 0,
+              "max": null,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 6
+                  },
+                  {
+                    "color": "orange",
+                    "value": 24
+                  },
+                  {
+                    "color": "red",
+                    "value": 48
+                  }
+                ]
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "displayMode": "gradient",
+            "orientation": "horizontal",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          },
+          "transformations": [
+            {
+              "id": "sortBy",
+              "options": {
+                "fields": [
+                  "Value"
+                ],
+                "order": "desc"
+              }
+            }
+          ]
+        },
+        {
+          "id": 13,
+          "type": "bargauge",
+          "title": "Glue Jobs Last Schedule (hours ago)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 26
+          },
+          "targets": [
+            {
+              "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
+              "refId": "A",
+              "legendFormat": "{{namespace}}/{{cronjob}}",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "h",
+              "min": 0,
+              "max": null,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 6
+                  },
+                  {
+                    "color": "orange",
+                    "value": 24
+                  },
+                  {
+                    "color": "red",
+                    "value": 48
+                  }
+                ]
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "displayMode": "gradient",
+            "orientation": "horizontal",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          },
+          "transformations": [
+            {
+              "id": "sortBy",
+              "options": {
+                "fields": [
+                  "Value"
+                ],
+                "order": "desc"
+              }
+            }
+          ]
+        },
+        {
+          "id": 14,
+          "type": "bargauge",
+          "title": "Ariadne Access Requests",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 6,
+            "w": 8,
+            "x": 0,
+            "y": 34
+          },
+          "targets": [
+            {
+              "expr": "ariadne_access_requests_total",
+              "refId": "A",
+              "legendFormat": "{{status}}",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none",
+              "min": 0,
+              "max": null,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 50
+                  },
+                  {
+                    "color": "orange",
+                    "value": 70
+                  },
+                  {
+                    "color": "red",
+                    "value": 85
+                  }
+                ]
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "displayMode": "gradient",
+            "orientation": "horizontal",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          },
+          "transformations": [
+            {
+              "id": "sortBy",
+              "options": {
+                "fields": [
+                  "Value"
+                ],
+                "order": "desc"
+              }
+            }
+          ]
+        },
+        {
+          "id": 15,
           "type": "stat",
           "title": "Ariadne CI Coverage (%)",
           "datasource": {
@@ -577,10 +945,10 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 4,
-            "w": 6,
-            "x": 0,
-            "y": 30
+            "h": 6,
+            "w": 4,
+            "x": 8,
+            "y": 34
           },
           "targets": [
             {
@@ -632,7 +1000,7 @@ data:
           }
         },
         {
-          "id": 11,
+          "id": 16,
           "type": "table",
           "title": "Ariadne CI Tests (latest)",
           "datasource": {
@@ -641,9 +1009,9 @@ data:
           },
           "gridPos": {
             "h": 6,
-            "w": 18,
-            "x": 6,
-            "y": 30
+            "w": 12,
+            "x": 12,
+            "y": 34
           },
           "targets": [
             {

From 5fe70b14710fb08c0b1e352916e6f63016b5ea19 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 11:45:11 -0300
Subject: [PATCH 077/416] grafana: allow email-based oauth user lookup

---
 services/monitoring/helmrelease.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index 304de05c..02bc4821 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -354,6 +354,8 @@ spec:
       GF_AUTH_GENERIC_OAUTH_ROLE_ATTRIBUTE_PATH: "contains(groups, 'admin') && 'Admin' || 'Viewer'"
       GF_AUTH_GENERIC_OAUTH_USE_PKCE: "true"
       GF_AUTH_GENERIC_OAUTH_TLS_SKIP_VERIFY_INSECURE: "false"
+      GF_AUTH_GENERIC_OAUTH_ALLOW_INSECURE_EMAIL_LOOKUP: "true"
+      GF_AUTH_GENERIC_OAUTH_EMAIL_ATTRIBUTE_PATH: "email"
       GF_AUTH_SIGNOUT_REDIRECT_URL: "https://sso.bstein.dev/realms/atlas/protocol/openid-connect/logout?redirect_uri=https://metrics.bstein.dev/"
     grafana.ini:
       server:

From d9630011044cadf3c9582b37647258e36c6b7814 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 12:08:23 -0300
Subject: [PATCH 078/416] monitoring: add grafana user dedupe job

---
 .../monitoring/grafana-user-dedupe-job.yaml   | 51 +++++++++++++++++++
 services/monitoring/kustomization.yaml        |  1 +
 2 files changed, 52 insertions(+)
 create mode 100644 services/monitoring/grafana-user-dedupe-job.yaml

diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/grafana-user-dedupe-job.yaml
new file mode 100644
index 00000000..b633a19d
--- /dev/null
+++ b/services/monitoring/grafana-user-dedupe-job.yaml
@@ -0,0 +1,51 @@
+# services/monitoring/grafana-user-dedupe-job.yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: grafana-user-dedupe
+  namespace: monitoring
+spec:
+  backoffLimit: 1
+  template:
+    spec:
+      restartPolicy: Never
+      containers:
+        - name: dedupe
+          image: alpine:3.20
+          command:
+            - /bin/sh
+            - -c
+          args:
+            - |
+              set -euo pipefail
+              apk add --no-cache sqlite
+              db="/var/lib/grafana/grafana.db"
+              if [ ! -f "$db" ]; then
+                echo "grafana db not found at $db"
+                exit 1
+              fi
+              if [ -z "${GRAFANA_DEDUPE_EMAILS}" ]; then
+                echo "GRAFANA_DEDUPE_EMAILS is required"
+                exit 1
+              fi
+              for email in $(echo "${GRAFANA_DEDUPE_EMAILS}" | tr ',' ' '); do
+                ids="$(sqlite3 "$db" "select id from user where email = '${email}';")"
+                if [ -z "$ids" ]; then
+                  echo "no grafana user found for ${email}"
+                  continue
+                fi
+                echo "deleting grafana users with ids: ${ids}"
+                sqlite3 "$db" "delete from user_auth where user_id in (${ids});"
+                sqlite3 "$db" "delete from user where id in (${ids});"
+              done
+              echo "done"
+          env:
+            - name: GRAFANA_DEDUPE_EMAILS
+              value: brad.stein@gmail.com,brad@bstein.dev
+          volumeMounts:
+            - name: grafana-storage
+              mountPath: /var/lib/grafana
+      volumes:
+        - name: grafana-storage
+          persistentVolumeClaim:
+            claimName: grafana
diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml
index 7d0b01b8..86ab8269 100644
--- a/services/monitoring/kustomization.yaml
+++ b/services/monitoring/kustomization.yaml
@@ -24,6 +24,7 @@ resources:
   - grafana-folders.yaml
   - helmrelease.yaml
   - grafana-org-bootstrap.yaml
+  - grafana-user-dedupe-job.yaml
 
 configMapGenerator:
   - name: postmark-exporter-script

From af789c0d0bdb172917c7f206c1ccfb0e28f32639 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 12:11:28 -0300
Subject: [PATCH 079/416] monitoring: dedupe grafana user via api

---
 .../monitoring/grafana-user-dedupe-job.yaml   | 44 ++++++++++++-------
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/grafana-user-dedupe-job.yaml
index b633a19d..833eb707 100644
--- a/services/monitoring/grafana-user-dedupe-job.yaml
+++ b/services/monitoring/grafana-user-dedupe-job.yaml
@@ -2,8 +2,17 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: grafana-user-dedupe
+  name: grafana-user-dedupe-api
   namespace: monitoring
+  annotations:
+    vault.hashicorp.com/agent-inject: "true"
+    vault.hashicorp.com/role: "monitoring"
+    vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
+    vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
+      {{ with secret "kv/data/atlas/monitoring/grafana-admin" }}
+      export GRAFANA_USER="{{ index .Data.data "admin-user" }}"
+      export GRAFANA_PASSWORD="{{ index .Data.data "admin-password" }}"
+      {{ end }}
 spec:
   backoffLimit: 1
   template:
@@ -18,10 +27,15 @@ spec:
           args:
             - |
               set -euo pipefail
-              apk add --no-cache sqlite
-              db="/var/lib/grafana/grafana.db"
-              if [ ! -f "$db" ]; then
-                echo "grafana db not found at $db"
+              apk add --no-cache curl jq
+              . /vault/secrets/grafana-env.sh
+              grafana_url="${GRAFANA_URL}"
+              if [ -z "${grafana_url}" ]; then
+                echo "GRAFANA_URL is required"
+                exit 1
+              fi
+              if [ -z "${GRAFANA_USER}" ] || [ -z "${GRAFANA_PASSWORD}" ]; then
+                echo "Grafana admin credentials missing"
                 exit 1
               fi
               if [ -z "${GRAFANA_DEDUPE_EMAILS}" ]; then
@@ -29,23 +43,19 @@ spec:
                 exit 1
               fi
               for email in $(echo "${GRAFANA_DEDUPE_EMAILS}" | tr ',' ' '); do
-                ids="$(sqlite3 "$db" "select id from user where email = '${email}';")"
-                if [ -z "$ids" ]; then
+                user_id="$(curl -sf -u "${GRAFANA_USER}:${GRAFANA_PASSWORD}" \
+                  "${grafana_url}/api/users/lookup?loginOrEmail=${email}" | jq -r '.id // empty')"
+                if [ -z "$user_id" ]; then
                   echo "no grafana user found for ${email}"
                   continue
                 fi
-                echo "deleting grafana users with ids: ${ids}"
-                sqlite3 "$db" "delete from user_auth where user_id in (${ids});"
-                sqlite3 "$db" "delete from user where id in (${ids});"
+                echo "deleting grafana user ${user_id} (${email})"
+                curl -sf -X DELETE -u "${GRAFANA_USER}:${GRAFANA_PASSWORD}" \
+                  "${grafana_url}/api/admin/users/${user_id}"
               done
               echo "done"
           env:
+            - name: GRAFANA_URL
+              value: http://grafana
             - name: GRAFANA_DEDUPE_EMAILS
               value: brad.stein@gmail.com,brad@bstein.dev
-          volumeMounts:
-            - name: grafana-storage
-              mountPath: /var/lib/grafana
-      volumes:
-        - name: grafana-storage
-          persistentVolumeClaim:
-            claimName: grafana

From 2f37a4786934c3b631dd2b31d20b8e550c7bf67c Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 12:15:03 -0300
Subject: [PATCH 080/416] monitoring: use python dedupe job

---
 .../monitoring/grafana-user-dedupe-job.yaml   | 51 ++++++++++++++-----
 1 file changed, 37 insertions(+), 14 deletions(-)

diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/grafana-user-dedupe-job.yaml
index 833eb707..f3a1c261 100644
--- a/services/monitoring/grafana-user-dedupe-job.yaml
+++ b/services/monitoring/grafana-user-dedupe-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: grafana-user-dedupe-api
+  name: grafana-user-dedupe-api-v2
   namespace: monitoring
   annotations:
     vault.hashicorp.com/agent-inject: "true"
@@ -20,14 +20,13 @@ spec:
       restartPolicy: Never
       containers:
         - name: dedupe
-          image: alpine:3.20
+          image: python:3.12-slim
           command:
             - /bin/sh
             - -c
           args:
             - |
               set -euo pipefail
-              apk add --no-cache curl jq
               . /vault/secrets/grafana-env.sh
               grafana_url="${GRAFANA_URL}"
               if [ -z "${grafana_url}" ]; then
@@ -42,17 +41,41 @@ spec:
                 echo "GRAFANA_DEDUPE_EMAILS is required"
                 exit 1
               fi
-              for email in $(echo "${GRAFANA_DEDUPE_EMAILS}" | tr ',' ' '); do
-                user_id="$(curl -sf -u "${GRAFANA_USER}:${GRAFANA_PASSWORD}" \
-                  "${grafana_url}/api/users/lookup?loginOrEmail=${email}" | jq -r '.id // empty')"
-                if [ -z "$user_id" ]; then
-                  echo "no grafana user found for ${email}"
-                  continue
-                fi
-                echo "deleting grafana user ${user_id} (${email})"
-                curl -sf -X DELETE -u "${GRAFANA_USER}:${GRAFANA_PASSWORD}" \
-                  "${grafana_url}/api/admin/users/${user_id}"
-              done
+              python - <<'PY'
+              import base64
+              import json
+              import os
+              import urllib.parse
+              import urllib.request
+
+              grafana_url = os.environ["GRAFANA_URL"].rstrip("/")
+              user = os.environ["GRAFANA_USER"]
+              password = os.environ["GRAFANA_PASSWORD"]
+              emails = [e.strip() for e in os.environ["GRAFANA_DEDUPE_EMAILS"].split(",") if e.strip()]
+
+              token = base64.b64encode(f"{user}:{password}".encode("utf-8")).decode("utf-8")
+              headers = {"Authorization": f"Basic {token}"}
+
+              def request(method: str, url: str):
+                  req = urllib.request.Request(url, headers=headers, method=method)
+                  with urllib.request.urlopen(req, timeout=10) as resp:
+                      return resp.read()
+
+              for email in emails:
+                  lookup_url = f"{grafana_url}/api/users/lookup?loginOrEmail={urllib.parse.quote(email)}"
+                  try:
+                      payload = json.loads(request("GET", lookup_url))
+                  except Exception:
+                      print(f"no grafana user found for {email}")
+                      continue
+                  user_id = payload.get("id")
+                  if not user_id:
+                      print(f"no grafana user found for {email}")
+                      continue
+                  print(f"deleting grafana user {user_id} ({email})")
+                  delete_url = f"{grafana_url}/api/admin/users/{user_id}"
+                  request("DELETE", delete_url)
+              PY
               echo "done"
           env:
             - name: GRAFANA_URL

From 10704a22d6e7cb732c391d3ea536e29b0065ba5d Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 12:16:26 -0300
Subject: [PATCH 081/416] monitoring: wire vault sa for dedupe job

---
 services/monitoring/grafana-user-dedupe-job.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/grafana-user-dedupe-job.yaml
index f3a1c261..631c25d0 100644
--- a/services/monitoring/grafana-user-dedupe-job.yaml
+++ b/services/monitoring/grafana-user-dedupe-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: grafana-user-dedupe-api-v2
+  name: grafana-user-dedupe-api-v3
   namespace: monitoring
   annotations:
     vault.hashicorp.com/agent-inject: "true"
@@ -17,6 +17,8 @@ spec:
   backoffLimit: 1
   template:
     spec:
+      serviceAccountName: monitoring-vault-sync
+      automountServiceAccountToken: true
       restartPolicy: Never
       containers:
         - name: dedupe

From e8859e605a6dec278ab82fe4d855ab025d0ba708 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 12:18:57 -0300
Subject: [PATCH 082/416] monitoring: prepopulate vault for dedupe job

---
 .../monitoring/grafana-user-dedupe-job.yaml   | 23 +++++++++++--------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/grafana-user-dedupe-job.yaml
index 631c25d0..3eb001b8 100644
--- a/services/monitoring/grafana-user-dedupe-job.yaml
+++ b/services/monitoring/grafana-user-dedupe-job.yaml
@@ -2,20 +2,23 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: grafana-user-dedupe-api-v3
+  name: grafana-user-dedupe-api-v4
   namespace: monitoring
-  annotations:
-    vault.hashicorp.com/agent-inject: "true"
-    vault.hashicorp.com/role: "monitoring"
-    vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
-    vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
-      {{ with secret "kv/data/atlas/monitoring/grafana-admin" }}
-      export GRAFANA_USER="{{ index .Data.data "admin-user" }}"
-      export GRAFANA_PASSWORD="{{ index .Data.data "admin-password" }}"
-      {{ end }}
 spec:
   backoffLimit: 1
   template:
+    metadata:
+      annotations:
+        vault.hashicorp.com/agent-inject: "true"
+        vault.hashicorp.com/agent-pre-populate: "true"
+        vault.hashicorp.com/agent-pre-populate-only: "true"
+        vault.hashicorp.com/role: "monitoring"
+        vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
+        vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
+          {{ with secret "kv/data/atlas/monitoring/grafana-admin" }}
+          export GRAFANA_USER="{{ index .Data.data "admin-user" }}"
+          export GRAFANA_PASSWORD="{{ index .Data.data "admin-password" }}"
+          {{ end }}
     spec:
       serviceAccountName: monitoring-vault-sync
       automountServiceAccountToken: true

From d89d441486ff8819de1e264907d27ff67f227428 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 12:25:53 -0300
Subject: [PATCH 083/416] monitoring: fix grafana user dedupe job

---
 services/monitoring/grafana-user-dedupe-job.yaml | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/grafana-user-dedupe-job.yaml
index 3eb001b8..e56362b9 100644
--- a/services/monitoring/grafana-user-dedupe-job.yaml
+++ b/services/monitoring/grafana-user-dedupe-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: grafana-user-dedupe-api-v4
+  name: grafana-user-dedupe-api-v5
   namespace: monitoring
 spec:
   backoffLimit: 1
@@ -10,7 +10,6 @@ spec:
     metadata:
       annotations:
         vault.hashicorp.com/agent-inject: "true"
-        vault.hashicorp.com/agent-pre-populate: "true"
         vault.hashicorp.com/agent-pre-populate-only: "true"
         vault.hashicorp.com/role: "monitoring"
         vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
@@ -32,6 +31,16 @@ spec:
           args:
             - |
               set -euo pipefail
+              for _ in $(seq 1 30); do
+                if [ -f /vault/secrets/grafana-env.sh ]; then
+                  break
+                fi
+                sleep 1
+              done
+              if [ ! -f /vault/secrets/grafana-env.sh ]; then
+                echo "Vault secret not available"
+                exit 1
+              fi
               . /vault/secrets/grafana-env.sh
               grafana_url="${GRAFANA_URL}"
               if [ -z "${grafana_url}" ]; then

From 190caf172949a0dc9d73c11f120f2e9c962bc3a0 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 12:30:08 -0300
Subject: [PATCH 084/416] monitoring: harden grafana user dedupe

---
 .../monitoring/grafana-user-dedupe-job.yaml   | 63 ++++++++++++++-----
 1 file changed, 47 insertions(+), 16 deletions(-)

diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/grafana-user-dedupe-job.yaml
index e56362b9..1d1bd090 100644
--- a/services/monitoring/grafana-user-dedupe-job.yaml
+++ b/services/monitoring/grafana-user-dedupe-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: grafana-user-dedupe-api-v5
+  name: grafana-user-dedupe-api-v6
   namespace: monitoring
 spec:
   backoffLimit: 1
@@ -60,35 +60,66 @@ spec:
               import json
               import os
               import urllib.parse
+              import urllib.error
               import urllib.request
 
               grafana_url = os.environ["GRAFANA_URL"].rstrip("/")
               user = os.environ["GRAFANA_USER"]
               password = os.environ["GRAFANA_PASSWORD"]
-              emails = [e.strip() for e in os.environ["GRAFANA_DEDUPE_EMAILS"].split(",") if e.strip()]
+              lookups = [e.strip() for e in os.environ["GRAFANA_DEDUPE_EMAILS"].split(",") if e.strip()]
 
               token = base64.b64encode(f"{user}:{password}".encode("utf-8")).decode("utf-8")
               headers = {"Authorization": f"Basic {token}"}
 
               def request(method: str, url: str):
                   req = urllib.request.Request(url, headers=headers, method=method)
-                  with urllib.request.urlopen(req, timeout=10) as resp:
-                      return resp.read()
-
-              for email in emails:
-                  lookup_url = f"{grafana_url}/api/users/lookup?loginOrEmail={urllib.parse.quote(email)}"
                   try:
-                      payload = json.loads(request("GET", lookup_url))
-                  except Exception:
-                      print(f"no grafana user found for {email}")
+                      with urllib.request.urlopen(req, timeout=10) as resp:
+                          return resp.status, resp.read()
+                  except urllib.error.HTTPError as err:
+                      body = err.read()
+                      return err.code, body
+
+              for _ in range(60):
+                  status, _ = request("GET", f"{grafana_url}/api/health")
+                  if status == 200:
+                      break
+              else:
+                  raise SystemExit("Grafana API did not become ready in time")
+
+              for lookup in lookups:
+                  search_url = f"{grafana_url}/api/users/search?query={urllib.parse.quote(lookup)}"
+                  status, body = request("GET", search_url)
+                  if status != 200:
+                      print(f"search failed for {lookup}: status={status} body={body.decode('utf-8', errors='ignore')}")
                       continue
-                  user_id = payload.get("id")
-                  if not user_id:
-                      print(f"no grafana user found for {email}")
+                  payload = json.loads(body)
+                  users = payload.get("users", [])
+                  matches = [
+                      user
+                      for user in users
+                      if user.get("email", "").lower() == lookup.lower()
+                      or user.get("login", "").lower() == lookup.lower()
+                  ]
+                  if not matches:
+                      print(f"no grafana user found for {lookup}")
                       continue
-                  print(f"deleting grafana user {user_id} ({email})")
-                  delete_url = f"{grafana_url}/api/admin/users/{user_id}"
-                  request("DELETE", delete_url)
+                  for user in matches:
+                      user_id = user.get("id")
+                      if not user_id:
+                          continue
+                      print(f"deleting grafana user {user_id} ({user.get('email')})")
+                      delete_url = f"{grafana_url}/api/admin/users/{user_id}"
+                      del_status, del_body = request("DELETE", delete_url)
+                      if del_status not in (200, 202, 204):
+                          print(
+                              "delete failed for",
+                              user_id,
+                              "status",
+                              del_status,
+                              "body",
+                              del_body.decode("utf-8", errors="ignore"),
+                          )
               PY
               echo "done"
           env:

From 4699ffbf2c278c2fd4efb3a58fde60b71a20a37f Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 12:31:54 -0300
Subject: [PATCH 085/416] monitoring: reschedule grafana user dedupe

---
 .../monitoring/grafana-user-dedupe-job.yaml    | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/grafana-user-dedupe-job.yaml
index 1d1bd090..8ab1a665 100644
--- a/services/monitoring/grafana-user-dedupe-job.yaml
+++ b/services/monitoring/grafana-user-dedupe-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: grafana-user-dedupe-api-v6
+  name: grafana-user-dedupe-api-v7
   namespace: monitoring
 spec:
   backoffLimit: 1
@@ -22,6 +22,20 @@ spec:
       serviceAccountName: monitoring-vault-sync
       automountServiceAccountToken: true
       restartPolicy: Never
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: node-role.kubernetes.io/worker
+                    operator: Exists
+          preferredDuringSchedulingIgnoredDuringExecution:
+            - weight: 100
+              preference:
+                matchExpressions:
+                  - key: kubernetes.io/arch
+                    operator: In
+                    values: ["arm64"]
       containers:
         - name: dedupe
           image: python:3.12-slim
@@ -124,6 +138,6 @@ spec:
               echo "done"
           env:
             - name: GRAFANA_URL
-              value: http://grafana
+              value: http://grafana.monitoring.svc.cluster.local
             - name: GRAFANA_DEDUPE_EMAILS
               value: brad.stein@gmail.com,brad@bstein.dev

From fc87432fdfbd11e73fa68bb0d212a8b2b0a405a7 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 13:37:36 -0300
Subject: [PATCH 086/416] monitoring: refresh jobs dashboards

---
 scripts/dashboards_render_atlas.py            |  330 ++++--
 services/maintenance/ariadne-deployment.yaml  |    8 +-
 .../{atlas-testing.json => atlas-jobs.json}   | 1034 +++++++++-------
 .../monitoring/dashboards/atlas-overview.json |  284 ++++-
 ...sting.yaml => grafana-dashboard-jobs.yaml} | 1040 ++++++++++-------
 .../grafana-dashboard-overview.yaml           |  284 ++++-
 services/monitoring/helmrelease.yaml          |    6 +-
 services/monitoring/kustomization.yaml        |    2 +-
 8 files changed, 1946 insertions(+), 1042 deletions(-)
 rename services/monitoring/dashboards/{atlas-testing.json => atlas-jobs.json} (84%)
 rename services/monitoring/{grafana-dashboard-testing.yaml => grafana-dashboard-jobs.yaml} (84%)

diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index 6eaafb46..1235a0aa 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -337,16 +337,39 @@ GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE}))"
 GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE})"
 GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})"
 ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))'
+ARIADNE_TASK_ERRORS_1H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[1h]))'
+ARIADNE_TASK_ERRORS_30D = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[30d]))'
 ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))'
 ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))'
 ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))'
 ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))'
 ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))'
+ARIADNE_TASK_ATTEMPTS_1H = 'sum(increase(ariadne_task_runs_total[1h]))'
+ARIADNE_TASK_FAILURES_1H = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))'
 ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600"
 ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600"
 ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
 ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}'
 ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}'
+ARIADNE_TEST_SUCCESS_RATE = (
+    "100 * "
+    'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[1h])) '
+    "/ clamp_min("
+    'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[1h])), 1)'
+)
+ARIADNE_TEST_FAILURES_24H = (
+    'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))'
+)
+ONEOFF_JOB_OWNER = (
+    'label_replace(kube_job_owner{owner_kind="CronJob"}, "owner_name", "$1", "job_name", "(.*)")'
+)
+ONEOFF_JOB_PODS = f'(kube_pod_owner{{owner_kind="Job"}} unless on(namespace, owner_name) {ONEOFF_JOB_OWNER})'
+ONEOFF_JOB_POD_AGE_HOURS = (
+    '((time() - kube_pod_start_time{pod!=""}) / 3600) '
+    f'* on(namespace,pod) group_left(owner_name) {ONEOFF_JOB_PODS} '
+    '* on(namespace,pod) group_left(phase) '
+    'max by (namespace,pod,phase) (kube_pod_status_phase{phase=~"Running|Succeeded"})'
+)
 GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
 GPU_NODE_REGEX = "|".join(GPU_NODES)
 TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
@@ -798,6 +821,15 @@ def build_overview():
             {"color": "red", "value": 3},
         ],
     }
+    age_thresholds = {
+        "mode": "absolute",
+        "steps": [
+            {"color": "green", "value": None},
+            {"color": "yellow", "value": 6},
+            {"color": "orange", "value": 24},
+            {"color": "red", "value": 48},
+        ],
+    }
 
     row1_stats = [
         {
@@ -1000,7 +1032,7 @@ def build_overview():
             30,
             "Mail Sent (1d)",
             'max(postmark_outbound_sent{window="1d"})',
-            {"h": 2, "w": 5, "x": 0, "y": 8},
+            {"h": 3, "w": 5, "x": 0, "y": 8},
             unit="none",
             links=link_to("atlas-mail"),
         )
@@ -1011,7 +1043,7 @@ def build_overview():
             "type": "stat",
             "title": "Mail Bounces (1d)",
             "datasource": PROM_DS,
-            "gridPos": {"h": 2, "w": 5, "x": 10, "y": 8},
+            "gridPos": {"h": 3, "w": 5, "x": 10, "y": 8},
             "targets": [
                 {
                     "expr": 'max(postmark_outbound_bounce_rate{window="1d"})',
@@ -1057,7 +1089,7 @@ def build_overview():
             32,
             "Mail Success Rate (1d)",
             'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)',
-            {"h": 2, "w": 5, "x": 5, "y": 8},
+            {"h": 3, "w": 5, "x": 5, "y": 8},
             unit="percent",
             thresholds=mail_success_thresholds,
             decimals=1,
@@ -1069,7 +1101,7 @@ def build_overview():
             33,
             "Mail Limit Used (30d)",
             "max(postmark_sending_limit_used_percent)",
-            {"h": 2, "w": 5, "x": 15, "y": 8},
+            {"h": 3, "w": 5, "x": 15, "y": 8},
             unit="percent",
             thresholds=mail_limit_thresholds,
             decimals=1,
@@ -1089,13 +1121,76 @@ def build_overview():
                 panel_id,
                 title,
                 expr,
-                {"h": 6, "w": 6, "x": 6 * idx, "y": 10},
+                {"h": 5, "w": 6, "x": 6 * idx, "y": 11},
                 unit=unit,
                 thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
                 links=link_to("atlas-storage"),
             )
         )
 
+    panels.append(
+        bargauge_panel(
+            40,
+            "One-off Job Pods (age hours)",
+            ONEOFF_JOB_POD_AGE_HOURS,
+            {"h": 6, "w": 4, "x": 0, "y": 16},
+            unit="h",
+            instant=True,
+            legend="{{namespace}}/{{pod}}",
+            thresholds=age_thresholds,
+            limit=8,
+        )
+    )
+    panels.append(
+        {
+            "id": 41,
+            "type": "timeseries",
+            "title": "Ariadne Attempts vs Failures (1h)",
+            "datasource": PROM_DS,
+            "gridPos": {"h": 6, "w": 8, "x": 4, "y": 16},
+            "targets": [
+                {"expr": ARIADNE_TASK_ATTEMPTS_1H, "refId": "A", "legendFormat": "Attempts"},
+                {"expr": ARIADNE_TASK_FAILURES_1H, "refId": "B", "legendFormat": "Failures"},
+            ],
+            "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []},
+            "options": {
+                "legend": {"displayMode": "table", "placement": "right"},
+                "tooltip": {"mode": "multi"},
+            },
+        }
+    )
+    panels.append(
+        timeseries_panel(
+            42,
+            "Ariadne Test Success Rate",
+            ARIADNE_TEST_SUCCESS_RATE,
+            {"h": 6, "w": 8, "x": 12, "y": 16},
+            unit="percent",
+            legend=None,
+            legend_display="list",
+        )
+    )
+    panels.append(
+        bargauge_panel(
+            43,
+            "Tests with Failures (24h)",
+            ARIADNE_TEST_FAILURES_24H,
+            {"h": 6, "w": 4, "x": 20, "y": 16},
+            unit="none",
+            instant=True,
+            legend="{{result}}",
+            thresholds={
+                "mode": "absolute",
+                "steps": [
+                    {"color": "green", "value": None},
+                    {"color": "yellow", "value": 1},
+                    {"color": "orange", "value": 5},
+                    {"color": "red", "value": 10},
+                ],
+            },
+        )
+    )
+
     cpu_scope = "$namespace_scope_cpu"
     gpu_scope = "$namespace_scope_gpu"
     ram_scope = "$namespace_scope_ram"
@@ -1105,7 +1200,7 @@ def build_overview():
             11,
             "Namespace CPU Share",
             namespace_cpu_share_expr(cpu_scope),
-            {"h": 9, "w": 8, "x": 0, "y": 16},
+            {"h": 9, "w": 8, "x": 0, "y": 22},
             links=namespace_scope_links("namespace_scope_cpu"),
             description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
         )
@@ -1115,7 +1210,7 @@ def build_overview():
             12,
             "Namespace GPU Share",
             namespace_gpu_share_expr(gpu_scope),
-            {"h": 9, "w": 8, "x": 8, "y": 16},
+            {"h": 9, "w": 8, "x": 8, "y": 22},
             links=namespace_scope_links("namespace_scope_gpu"),
             description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
         )
@@ -1125,7 +1220,7 @@ def build_overview():
             13,
             "Namespace RAM Share",
             namespace_ram_share_expr(ram_scope),
-            {"h": 9, "w": 8, "x": 16, "y": 16},
+            {"h": 9, "w": 8, "x": 16, "y": 22},
             links=namespace_scope_links("namespace_scope_ram"),
             description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
         )
@@ -1137,7 +1232,7 @@ def build_overview():
             14,
             "Worker Node CPU",
             node_cpu_expr(worker_filter),
-            {"h": 12, "w": 12, "x": 0, "y": 32},
+            {"h": 12, "w": 12, "x": 0, "y": 38},
             unit="percent",
             legend="{{node}}",
             legend_calcs=["last"],
@@ -1151,7 +1246,7 @@ def build_overview():
             15,
             "Worker Node RAM",
             node_mem_expr(worker_filter),
-            {"h": 12, "w": 12, "x": 12, "y": 32},
+            {"h": 12, "w": 12, "x": 12, "y": 38},
             unit="percent",
             legend="{{node}}",
             legend_calcs=["last"],
@@ -1166,7 +1261,7 @@ def build_overview():
             16,
             "Control plane CPU",
             node_cpu_expr(CONTROL_ALL_REGEX),
-            {"h": 10, "w": 12, "x": 0, "y": 44},
+            {"h": 10, "w": 12, "x": 0, "y": 50},
             unit="percent",
             legend="{{node}}",
             legend_display="table",
@@ -1178,7 +1273,7 @@ def build_overview():
             17,
             "Control plane RAM",
             node_mem_expr(CONTROL_ALL_REGEX),
-            {"h": 10, "w": 12, "x": 12, "y": 44},
+            {"h": 10, "w": 12, "x": 12, "y": 50},
             unit="percent",
             legend="{{node}}",
             legend_display="table",
@@ -1191,7 +1286,7 @@ def build_overview():
             28,
             "Node Pod Share",
             '(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100',
-            {"h": 10, "w": 12, "x": 0, "y": 54},
+            {"h": 10, "w": 12, "x": 0, "y": 60},
         )
     )
     panels.append(
@@ -1199,7 +1294,7 @@ def build_overview():
             29,
             "Top Nodes by Pod Count",
             'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))',
-            {"h": 10, "w": 12, "x": 12, "y": 54},
+            {"h": 10, "w": 12, "x": 12, "y": 60},
             unit="none",
             limit=12,
             decimals=0,
@@ -1221,7 +1316,7 @@ def build_overview():
             18,
             "Cluster Ingress Throughput",
             NET_INGRESS_EXPR,
-            {"h": 7, "w": 8, "x": 0, "y": 25},
+            {"h": 7, "w": 8, "x": 0, "y": 31},
             unit="Bps",
             legend="Ingress (Traefik)",
             legend_display="list",
@@ -1234,7 +1329,7 @@ def build_overview():
             19,
             "Cluster Egress Throughput",
             NET_EGRESS_EXPR,
-            {"h": 7, "w": 8, "x": 8, "y": 25},
+            {"h": 7, "w": 8, "x": 8, "y": 31},
             unit="Bps",
             legend="Egress (Traefik)",
             legend_display="list",
@@ -1247,7 +1342,7 @@ def build_overview():
             20,
             "Intra-Cluster Throughput",
             NET_INTERNAL_EXPR,
-            {"h": 7, "w": 8, "x": 16, "y": 25},
+            {"h": 7, "w": 8, "x": 16, "y": 31},
             unit="Bps",
             legend="Internal traffic",
             legend_display="list",
@@ -1261,7 +1356,7 @@ def build_overview():
             21,
             "Root Filesystem Usage",
             root_usage_expr(),
-            {"h": 16, "w": 12, "x": 0, "y": 64},
+            {"h": 16, "w": 12, "x": 0, "y": 70},
             unit="percent",
             legend="{{node}}",
             legend_calcs=["last"],
@@ -1276,7 +1371,7 @@ def build_overview():
             22,
             "Nodes Closest to Full Root Disks",
             f"topk(12, {root_usage_expr()})",
-            {"h": 16, "w": 12, "x": 12, "y": 64},
+            {"h": 16, "w": 12, "x": 12, "y": 70},
             unit="percent",
             thresholds=PERCENT_THRESHOLDS,
             links=link_to("atlas-storage"),
@@ -2171,7 +2266,7 @@ def build_mail_dashboard():
     }
 
 
-def build_testing_dashboard():
+def build_jobs_dashboard():
     panels = []
     age_thresholds = {
         "mode": "absolute",
@@ -2192,12 +2287,65 @@ def build_testing_dashboard():
         ],
     }
 
+    task_error_thresholds = {
+        "mode": "absolute",
+        "steps": [
+            {"color": "green", "value": None},
+            {"color": "yellow", "value": 1},
+            {"color": "orange", "value": 3},
+            {"color": "red", "value": 5},
+        ],
+    }
+
+    panels.append(
+        bargauge_panel(
+            1,
+            "Ariadne Task Errors (24h)",
+            ARIADNE_TASK_ERRORS_24H,
+            {"h": 7, "w": 6, "x": 0, "y": 0},
+            unit="none",
+            instant=True,
+            legend="{{task}}",
+            thresholds=task_error_thresholds,
+        )
+    )
+    panels.append(
+        {
+            "id": 2,
+            "type": "timeseries",
+            "title": "Ariadne Attempts vs Failures (1h)",
+            "datasource": PROM_DS,
+            "gridPos": {"h": 7, "w": 12, "x": 6, "y": 0},
+            "targets": [
+                {"expr": ARIADNE_TASK_ATTEMPTS_1H, "refId": "A", "legendFormat": "Attempts"},
+                {"expr": ARIADNE_TASK_FAILURES_1H, "refId": "B", "legendFormat": "Failures"},
+            ],
+            "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []},
+            "options": {
+                "legend": {"displayMode": "table", "placement": "right"},
+                "tooltip": {"mode": "multi"},
+            },
+        }
+    )
+    panels.append(
+        bargauge_panel(
+            3,
+            "One-off Job Pods (age hours)",
+            ONEOFF_JOB_POD_AGE_HOURS,
+            {"h": 7, "w": 6, "x": 18, "y": 0},
+            unit="h",
+            instant=True,
+            legend="{{namespace}}/{{pod}}",
+            thresholds=age_thresholds,
+            limit=12,
+        )
+    )
     panels.append(
         stat_panel(
-            1,
+            4,
             "Glue Jobs Stale (>36h)",
             GLUE_STALE_COUNT,
-            {"h": 4, "w": 6, "x": 0, "y": 0},
+            {"h": 4, "w": 4, "x": 0, "y": 7},
             unit="none",
             thresholds={
                 "mode": "absolute",
@@ -2212,99 +2360,47 @@ def build_testing_dashboard():
     )
     panels.append(
         stat_panel(
-            2,
+            5,
             "Glue Jobs Missing Success",
             GLUE_MISSING_COUNT,
-            {"h": 4, "w": 4, "x": 4, "y": 0},
-            unit="none",
-        )
-    )
-    panels.append(
-        stat_panel(
-            3,
-            "Glue Jobs Suspended",
-            GLUE_SUSPENDED_COUNT,
-            {"h": 4, "w": 4, "x": 8, "y": 0},
-            unit="none",
-        )
-    )
-    panels.append(
-        stat_panel(
-            4,
-            "Ariadne Task Errors (1h)",
-            ARIADNE_TASK_ERRORS_1H_TOTAL,
-            {"h": 4, "w": 4, "x": 12, "y": 0},
-            unit="none",
-        )
-    )
-    panels.append(
-        stat_panel(
-            5,
-            "Ariadne Task Errors (24h)",
-            ARIADNE_TASK_ERRORS_24H_TOTAL,
-            {"h": 4, "w": 4, "x": 16, "y": 0},
+            {"h": 4, "w": 4, "x": 4, "y": 7},
             unit="none",
         )
     )
     panels.append(
         stat_panel(
             6,
-            "Ariadne Task Runs (1h)",
-            ARIADNE_TASK_RUNS_1H_TOTAL,
-            {"h": 4, "w": 4, "x": 20, "y": 0},
+            "Glue Jobs Suspended",
+            GLUE_SUSPENDED_COUNT,
+            {"h": 4, "w": 4, "x": 8, "y": 7},
             unit="none",
         )
     )
     panels.append(
-        timeseries_panel(
+        stat_panel(
             7,
-            "Ariadne Task Runs vs Errors (1h)",
-            ARIADNE_TASK_RUNS_BY_STATUS_1H,
-            {"h": 6, "w": 24, "x": 0, "y": 4},
+            "Ariadne Task Errors (1h)",
+            ARIADNE_TASK_ERRORS_1H_TOTAL,
+            {"h": 4, "w": 4, "x": 12, "y": 7},
             unit="none",
-            legend="{{status}}",
-            legend_display="table",
-            legend_placement="right",
         )
     )
     panels.append(
-        bargauge_panel(
+        stat_panel(
             8,
             "Ariadne Task Errors (24h)",
-            ARIADNE_TASK_ERRORS_24H,
-            {"h": 8, "w": 12, "x": 0, "y": 10},
+            ARIADNE_TASK_ERRORS_24H_TOTAL,
+            {"h": 4, "w": 4, "x": 16, "y": 7},
             unit="none",
-            instant=True,
-            legend="{{task}}",
-            thresholds={
-                "mode": "absolute",
-                "steps": [
-                    {"color": "green", "value": None},
-                    {"color": "yellow", "value": 1},
-                    {"color": "orange", "value": 3},
-                    {"color": "red", "value": 5},
-                ],
-            },
         )
     )
     panels.append(
-        bargauge_panel(
+        stat_panel(
             9,
-            "Ariadne Task Success (24h)",
-            ARIADNE_TASK_SUCCESS_24H,
-            {"h": 8, "w": 12, "x": 12, "y": 10},
+            "Ariadne Task Runs (1h)",
+            ARIADNE_TASK_RUNS_1H_TOTAL,
+            {"h": 4, "w": 4, "x": 20, "y": 7},
             unit="none",
-            instant=True,
-            legend="{{task}}",
-            thresholds={
-                "mode": "absolute",
-                "steps": [
-                    {"color": "red", "value": None},
-                    {"color": "orange", "value": 1},
-                    {"color": "yellow", "value": 5},
-                    {"color": "green", "value": 10},
-                ],
-            },
         )
     )
     panels.append(
@@ -2312,7 +2408,7 @@ def build_testing_dashboard():
             10,
             "Ariadne Schedule Last Error (hours ago)",
             ARIADNE_SCHEDULE_LAST_ERROR_HOURS,
-            {"h": 8, "w": 12, "x": 0, "y": 18},
+            {"h": 8, "w": 12, "x": 0, "y": 11},
             unit="h",
             instant=True,
             legend="{{task}}",
@@ -2324,7 +2420,7 @@ def build_testing_dashboard():
             11,
             "Ariadne Schedule Last Success (hours ago)",
             ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS,
-            {"h": 8, "w": 12, "x": 12, "y": 18},
+            {"h": 8, "w": 12, "x": 12, "y": 11},
             unit="h",
             instant=True,
             legend="{{task}}",
@@ -2336,7 +2432,7 @@ def build_testing_dashboard():
             12,
             "Glue Jobs Last Success (hours ago)",
             GLUE_LAST_SUCCESS_AGE_HOURS,
-            {"h": 8, "w": 12, "x": 0, "y": 26},
+            {"h": 8, "w": 12, "x": 0, "y": 19},
             unit="h",
             instant=True,
             legend="{{namespace}}/{{cronjob}}",
@@ -2348,7 +2444,7 @@ def build_testing_dashboard():
             13,
             "Glue Jobs Last Schedule (hours ago)",
             GLUE_LAST_SCHEDULE_AGE_HOURS,
-            {"h": 8, "w": 12, "x": 12, "y": 26},
+            {"h": 8, "w": 12, "x": 12, "y": 19},
             unit="h",
             instant=True,
             legend="{{namespace}}/{{cronjob}}",
@@ -2358,9 +2454,33 @@ def build_testing_dashboard():
     panels.append(
         bargauge_panel(
             14,
+            "Ariadne Task Errors (1h)",
+            ARIADNE_TASK_ERRORS_1H,
+            {"h": 8, "w": 12, "x": 0, "y": 27},
+            unit="none",
+            instant=True,
+            legend="{{task}}",
+            thresholds=task_error_thresholds,
+        )
+    )
+    panels.append(
+        bargauge_panel(
+            15,
+            "Ariadne Task Errors (30d)",
+            ARIADNE_TASK_ERRORS_30D,
+            {"h": 8, "w": 12, "x": 12, "y": 27},
+            unit="none",
+            instant=True,
+            legend="{{task}}",
+            thresholds=task_error_thresholds,
+        )
+    )
+    panels.append(
+        bargauge_panel(
+            16,
             "Ariadne Access Requests",
             ARIADNE_ACCESS_REQUESTS,
-            {"h": 6, "w": 8, "x": 0, "y": 34},
+            {"h": 6, "w": 8, "x": 0, "y": 35},
             unit="none",
             instant=True,
             legend="{{status}}",
@@ -2368,10 +2488,10 @@ def build_testing_dashboard():
     )
     panels.append(
         stat_panel(
-            15,
+            17,
             "Ariadne CI Coverage (%)",
             ARIADNE_CI_COVERAGE,
-            {"h": 6, "w": 4, "x": 8, "y": 34},
+            {"h": 6, "w": 4, "x": 8, "y": 35},
             unit="percent",
             decimals=1,
             instant=True,
@@ -2380,10 +2500,10 @@ def build_testing_dashboard():
     )
     panels.append(
         table_panel(
-            16,
+            18,
             "Ariadne CI Tests (latest)",
             ARIADNE_CI_TESTS,
-            {"h": 6, "w": 12, "x": 12, "y": 34},
+            {"h": 6, "w": 12, "x": 12, "y": 35},
             unit="none",
             transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}],
             instant=True,
@@ -2391,8 +2511,8 @@ def build_testing_dashboard():
     )
 
     return {
-        "uid": "atlas-testing",
-        "title": "Atlas Testing",
+        "uid": "atlas-jobs",
+        "title": "Atlas Jobs",
         "folderUid": PRIVATE_FOLDER,
         "editable": True,
         "panels": panels,
@@ -2400,7 +2520,7 @@ def build_testing_dashboard():
         "annotations": {"list": []},
         "schemaVersion": 39,
         "style": "dark",
-        "tags": ["atlas", "testing"],
+        "tags": ["atlas", "jobs", "glue"],
     }
 
 
@@ -2497,9 +2617,9 @@ DASHBOARDS = {
         "builder": build_mail_dashboard,
         "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-mail.yaml",
     },
-    "atlas-testing": {
-        "builder": build_testing_dashboard,
-        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-testing.yaml",
+    "atlas-jobs": {
+        "builder": build_jobs_dashboard,
+        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-jobs.yaml",
     },
     "atlas-gpu": {
         "builder": build_gpu_dashboard,
diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml
index 069f3885..01e940cf 100644
--- a/services/maintenance/ariadne-deployment.yaml
+++ b/services/maintenance/ariadne-deployment.yaml
@@ -270,7 +270,7 @@ spec:
             - name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE
               value: "30 4 * * *"
             - name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC
-              value: "*/15 * * * *"
+              value: "0 * * * *"
             - name: ARIADNE_SCHEDULE_WGER_USER_SYNC
               value: "0 5 * * *"
             - name: ARIADNE_SCHEDULE_WGER_ADMIN
@@ -286,11 +286,11 @@ spec:
             - name: ARIADNE_SCHEDULE_IMAGE_SWEEPER
               value: "30 4 * * 0"
             - name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH
-              value: "*/15 * * * *"
+              value: "0 * * * *"
             - name: ARIADNE_SCHEDULE_VAULT_OIDC
-              value: "*/15 * * * *"
+              value: "0 * * * *"
             - name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME
-              value: "*/1 * * * *"
+              value: "*/5 * * * *"
             - name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE
               value: "*/30 * * * *"
             - name: ARIADNE_SCHEDULE_COMMS_RESET_ROOM
diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-jobs.json
similarity index 84%
rename from services/monitoring/dashboards/atlas-testing.json
rename to services/monitoring/dashboards/atlas-jobs.json
index 420abf26..76e21f01 100644
--- a/services/monitoring/dashboards/atlas-testing.json
+++ b/services/monitoring/dashboards/atlas-jobs.json
@@ -1,416 +1,11 @@
 {
-  "uid": "atlas-testing",
-  "title": "Atlas Testing",
+  "uid": "atlas-jobs",
+  "title": "Atlas Jobs",
   "folderUid": "atlas-internal",
   "editable": true,
   "panels": [
     {
       "id": 1,
-      "type": "stat",
-      "title": "Glue Jobs Stale (>36h)",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 4,
-        "w": 6,
-        "x": 0,
-        "y": 0
-      },
-      "targets": [
-        {
-          "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))",
-          "refId": "A"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "yellow",
-                "value": 1
-              },
-              {
-                "color": "orange",
-                "value": 2
-              },
-              {
-                "color": "red",
-                "value": 3
-              }
-            ]
-          },
-          "unit": "none",
-          "custom": {
-            "displayMode": "auto"
-          }
-        },
-        "overrides": []
-      },
-      "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "justifyMode": "center",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "textMode": "value"
-      }
-    },
-    {
-      "id": 2,
-      "type": "stat",
-      "title": "Glue Jobs Missing Success",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 4,
-        "w": 4,
-        "x": 4,
-        "y": 0
-      },
-      "targets": [
-        {
-          "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))",
-          "refId": "A"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "rgba(115, 115, 115, 1)",
-                "value": null
-              },
-              {
-                "color": "green",
-                "value": 1
-              }
-            ]
-          },
-          "unit": "none",
-          "custom": {
-            "displayMode": "auto"
-          }
-        },
-        "overrides": []
-      },
-      "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "justifyMode": "center",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "textMode": "value"
-      }
-    },
-    {
-      "id": 3,
-      "type": "stat",
-      "title": "Glue Jobs Suspended",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 4,
-        "w": 4,
-        "x": 8,
-        "y": 0
-      },
-      "targets": [
-        {
-          "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)",
-          "refId": "A"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "rgba(115, 115, 115, 1)",
-                "value": null
-              },
-              {
-                "color": "green",
-                "value": 1
-              }
-            ]
-          },
-          "unit": "none",
-          "custom": {
-            "displayMode": "auto"
-          }
-        },
-        "overrides": []
-      },
-      "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "justifyMode": "center",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "textMode": "value"
-      }
-    },
-    {
-      "id": 4,
-      "type": "stat",
-      "title": "Ariadne Task Errors (1h)",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 4,
-        "w": 4,
-        "x": 12,
-        "y": 0
-      },
-      "targets": [
-        {
-          "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
-          "refId": "A"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "rgba(115, 115, 115, 1)",
-                "value": null
-              },
-              {
-                "color": "green",
-                "value": 1
-              }
-            ]
-          },
-          "unit": "none",
-          "custom": {
-            "displayMode": "auto"
-          }
-        },
-        "overrides": []
-      },
-      "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "justifyMode": "center",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "textMode": "value"
-      }
-    },
-    {
-      "id": 5,
-      "type": "stat",
-      "title": "Ariadne Task Errors (24h)",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 4,
-        "w": 4,
-        "x": 16,
-        "y": 0
-      },
-      "targets": [
-        {
-          "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))",
-          "refId": "A"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "rgba(115, 115, 115, 1)",
-                "value": null
-              },
-              {
-                "color": "green",
-                "value": 1
-              }
-            ]
-          },
-          "unit": "none",
-          "custom": {
-            "displayMode": "auto"
-          }
-        },
-        "overrides": []
-      },
-      "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "justifyMode": "center",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "textMode": "value"
-      }
-    },
-    {
-      "id": 6,
-      "type": "stat",
-      "title": "Ariadne Task Runs (1h)",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 4,
-        "w": 4,
-        "x": 20,
-        "y": 0
-      },
-      "targets": [
-        {
-          "expr": "sum(increase(ariadne_task_runs_total[1h]))",
-          "refId": "A"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "rgba(115, 115, 115, 1)",
-                "value": null
-              },
-              {
-                "color": "green",
-                "value": 1
-              }
-            ]
-          },
-          "unit": "none",
-          "custom": {
-            "displayMode": "auto"
-          }
-        },
-        "overrides": []
-      },
-      "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "justifyMode": "center",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "textMode": "value"
-      }
-    },
-    {
-      "id": 7,
-      "type": "timeseries",
-      "title": "Ariadne Task Runs vs Errors (1h)",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 6,
-        "w": 24,
-        "x": 0,
-        "y": 4
-      },
-      "targets": [
-        {
-          "expr": "sum by (status) (increase(ariadne_task_runs_total[1h]))",
-          "refId": "A",
-          "legendFormat": "{{status}}"
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "unit": "none"
-        },
-        "overrides": []
-      },
-      "options": {
-        "legend": {
-          "displayMode": "table",
-          "placement": "right"
-        },
-        "tooltip": {
-          "mode": "multi"
-        }
-      }
-    },
-    {
-      "id": 8,
       "type": "bargauge",
       "title": "Ariadne Task Errors (24h)",
       "datasource": {
@@ -418,10 +13,10 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 8,
-        "w": 12,
+        "h": 7,
+        "w": 6,
         "x": 0,
-        "y": 10
+        "y": 0
       },
       "targets": [
         {
@@ -484,50 +79,92 @@
       ]
     },
     {
-      "id": 9,
-      "type": "bargauge",
-      "title": "Ariadne Task Success (24h)",
+      "id": 2,
+      "type": "timeseries",
+      "title": "Ariadne Attempts vs Failures (1h)",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 8,
+        "h": 7,
         "w": 12,
-        "x": 12,
-        "y": 10
+        "x": 6,
+        "y": 0
       },
       "targets": [
         {
-          "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"ok\"}[24h]))",
+          "expr": "sum(increase(ariadne_task_runs_total[1h]))",
           "refId": "A",
-          "legendFormat": "{{task}}",
+          "legendFormat": "Attempts"
+        },
+        {
+          "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
+          "refId": "B",
+          "legendFormat": "Failures"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "none"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "right"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      }
+    },
+    {
+      "id": 3,
+      "type": "bargauge",
+      "title": "One-off Job Pods (age hours)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 7,
+        "w": 6,
+        "x": 18,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})",
+          "refId": "A",
+          "legendFormat": "{{namespace}}/{{pod}}",
           "instant": true
         }
       ],
       "fieldConfig": {
         "defaults": {
-          "unit": "none",
+          "unit": "h",
           "min": 0,
           "max": null,
           "thresholds": {
             "mode": "absolute",
             "steps": [
               {
-                "color": "red",
+                "color": "green",
                 "value": null
               },
-              {
-                "color": "orange",
-                "value": 1
-              },
               {
                 "color": "yellow",
-                "value": 5
+                "value": 6
               },
               {
-                "color": "green",
-                "value": 10
+                "color": "orange",
+                "value": 24
+              },
+              {
+                "color": "red",
+                "value": 48
               }
             ]
           }
@@ -554,9 +191,383 @@
             ],
             "order": "desc"
           }
+        },
+        {
+          "id": "limit",
+          "options": {
+            "limit": 12
+          }
         }
       ]
     },
+    {
+      "id": 4,
+      "type": "stat",
+      "title": "Glue Jobs Stale (>36h)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 0,
+        "y": 7
+      },
+      "targets": [
+        {
+          "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 1
+              },
+              {
+                "color": "orange",
+                "value": 2
+              },
+              {
+                "color": "red",
+                "value": 3
+              }
+            ]
+          },
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto"
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 5,
+      "type": "stat",
+      "title": "Glue Jobs Missing Success",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 4,
+        "y": 7
+      },
+      "targets": [
+        {
+          "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto"
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 6,
+      "type": "stat",
+      "title": "Glue Jobs Suspended",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 8,
+        "y": 7
+      },
+      "targets": [
+        {
+          "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto"
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 7,
+      "type": "stat",
+      "title": "Ariadne Task Errors (1h)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 12,
+        "y": 7
+      },
+      "targets": [
+        {
+          "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto"
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 8,
+      "type": "stat",
+      "title": "Ariadne Task Errors (24h)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 16,
+        "y": 7
+      },
+      "targets": [
+        {
+          "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto"
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
+    {
+      "id": 9,
+      "type": "stat",
+      "title": "Ariadne Task Runs (1h)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 20,
+        "y": 7
+      },
+      "targets": [
+        {
+          "expr": "sum(increase(ariadne_task_runs_total[1h]))",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto"
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      }
+    },
     {
       "id": 10,
       "type": "bargauge",
@@ -569,7 +580,7 @@
         "h": 8,
         "w": 12,
         "x": 0,
-        "y": 18
+        "y": 11
       },
       "targets": [
         {
@@ -643,7 +654,7 @@
         "h": 8,
         "w": 12,
         "x": 12,
-        "y": 18
+        "y": 11
       },
       "targets": [
         {
@@ -717,7 +728,7 @@
         "h": 8,
         "w": 12,
         "x": 0,
-        "y": 26
+        "y": 19
       },
       "targets": [
         {
@@ -791,7 +802,7 @@
         "h": 8,
         "w": 12,
         "x": 12,
-        "y": 26
+        "y": 19
       },
       "targets": [
         {
@@ -856,6 +867,154 @@
     {
       "id": 14,
       "type": "bargauge",
+      "title": "Ariadne Task Errors (1h)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 27
+      },
+      "targets": [
+        {
+          "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
+          "refId": "A",
+          "legendFormat": "{{task}}",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "none",
+          "min": 0,
+          "max": null,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 1
+              },
+              {
+                "color": "orange",
+                "value": 3
+              },
+              {
+                "color": "red",
+                "value": 5
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "displayMode": "gradient",
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
+      },
+      "transformations": [
+        {
+          "id": "sortBy",
+          "options": {
+            "fields": [
+              "Value"
+            ],
+            "order": "desc"
+          }
+        }
+      ]
+    },
+    {
+      "id": 15,
+      "type": "bargauge",
+      "title": "Ariadne Task Errors (30d)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 27
+      },
+      "targets": [
+        {
+          "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d]))",
+          "refId": "A",
+          "legendFormat": "{{task}}",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "none",
+          "min": 0,
+          "max": null,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 1
+              },
+              {
+                "color": "orange",
+                "value": 3
+              },
+              {
+                "color": "red",
+                "value": 5
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "displayMode": "gradient",
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
+      },
+      "transformations": [
+        {
+          "id": "sortBy",
+          "options": {
+            "fields": [
+              "Value"
+            ],
+            "order": "desc"
+          }
+        }
+      ]
+    },
+    {
+      "id": 16,
+      "type": "bargauge",
       "title": "Ariadne Access Requests",
       "datasource": {
         "type": "prometheus",
@@ -865,7 +1024,7 @@
         "h": 6,
         "w": 8,
         "x": 0,
-        "y": 34
+        "y": 35
       },
       "targets": [
         {
@@ -928,7 +1087,7 @@
       ]
     },
     {
-      "id": 15,
+      "id": 17,
       "type": "stat",
       "title": "Ariadne CI Coverage (%)",
       "datasource": {
@@ -939,7 +1098,7 @@
         "h": 6,
         "w": 4,
         "x": 8,
-        "y": 34
+        "y": 35
       },
       "targets": [
         {
@@ -991,7 +1150,7 @@
       }
     },
     {
-      "id": 16,
+      "id": 18,
       "type": "table",
       "title": "Ariadne CI Tests (latest)",
       "datasource": {
@@ -1002,7 +1161,7 @@
         "h": 6,
         "w": 12,
         "x": 12,
-        "y": 34
+        "y": 35
       },
       "targets": [
         {
@@ -1052,6 +1211,7 @@
   "style": "dark",
   "tags": [
     "atlas",
-    "testing"
+    "jobs",
+    "glue"
   ]
 }
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index c5f30d1f..c3ff327d 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -795,7 +795,7 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 2,
+        "h": 3,
         "w": 5,
         "x": 0,
         "y": 8
@@ -862,7 +862,7 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 2,
+        "h": 3,
         "w": 5,
         "x": 10,
         "y": 8
@@ -967,7 +967,7 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 2,
+        "h": 3,
         "w": 5,
         "x": 5,
         "y": 8
@@ -1043,7 +1043,7 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 2,
+        "h": 3,
         "w": 5,
         "x": 15,
         "y": 8
@@ -1119,10 +1119,10 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 6,
+        "h": 5,
         "w": 6,
         "x": 0,
-        "y": 10
+        "y": 11
       },
       "targets": [
         {
@@ -1194,10 +1194,10 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 6,
+        "h": 5,
         "w": 6,
         "x": 6,
-        "y": 10
+        "y": 11
       },
       "targets": [
         {
@@ -1269,10 +1269,10 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 6,
+        "h": 5,
         "w": 6,
         "x": 12,
-        "y": 10
+        "y": 11
       },
       "targets": [
         {
@@ -1336,10 +1336,10 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 6,
+        "h": 5,
         "w": 6,
         "x": 18,
-        "y": 10
+        "y": 11
       },
       "targets": [
         {
@@ -1394,6 +1394,238 @@
         }
       ]
     },
+    {
+      "id": 40,
+      "type": "bargauge",
+      "title": "One-off Job Pods (age hours)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 4,
+        "x": 0,
+        "y": 16
+      },
+      "targets": [
+        {
+          "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})",
+          "refId": "A",
+          "legendFormat": "{{namespace}}/{{pod}}",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "h",
+          "min": 0,
+          "max": null,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 6
+              },
+              {
+                "color": "orange",
+                "value": 24
+              },
+              {
+                "color": "red",
+                "value": 48
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "displayMode": "gradient",
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
+      },
+      "transformations": [
+        {
+          "id": "sortBy",
+          "options": {
+            "fields": [
+              "Value"
+            ],
+            "order": "desc"
+          }
+        },
+        {
+          "id": "limit",
+          "options": {
+            "limit": 8
+          }
+        }
+      ]
+    },
+    {
+      "id": 41,
+      "type": "timeseries",
+      "title": "Ariadne Attempts vs Failures (1h)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 8,
+        "x": 4,
+        "y": 16
+      },
+      "targets": [
+        {
+          "expr": "sum(increase(ariadne_task_runs_total[1h]))",
+          "refId": "A",
+          "legendFormat": "Attempts"
+        },
+        {
+          "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
+          "refId": "B",
+          "legendFormat": "Failures"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "none"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "right"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      }
+    },
+    {
+      "id": 42,
+      "type": "timeseries",
+      "title": "Ariadne Test Success Rate",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 8,
+        "x": 12,
+        "y": 16
+      },
+      "targets": [
+        {
+          "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "list",
+          "placement": "bottom"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      }
+    },
+    {
+      "id": 43,
+      "type": "bargauge",
+      "title": "Tests with Failures (24h)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 4,
+        "x": 20,
+        "y": 16
+      },
+      "targets": [
+        {
+          "expr": "sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h]))",
+          "refId": "A",
+          "legendFormat": "{{result}}",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "none",
+          "min": 0,
+          "max": null,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 1
+              },
+              {
+                "color": "orange",
+                "value": 5
+              },
+              {
+                "color": "red",
+                "value": 10
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "displayMode": "gradient",
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
+      },
+      "transformations": [
+        {
+          "id": "sortBy",
+          "options": {
+            "fields": [
+              "Value"
+            ],
+            "order": "desc"
+          }
+        }
+      ]
+    },
     {
       "id": 11,
       "type": "piechart",
@@ -1406,7 +1638,7 @@
         "h": 9,
         "w": 8,
         "x": 0,
-        "y": 16
+        "y": 22
       },
       "targets": [
         {
@@ -1475,7 +1707,7 @@
         "h": 9,
         "w": 8,
         "x": 8,
-        "y": 16
+        "y": 22
       },
       "targets": [
         {
@@ -1544,7 +1776,7 @@
         "h": 9,
         "w": 8,
         "x": 16,
-        "y": 16
+        "y": 22
       },
       "targets": [
         {
@@ -1613,7 +1845,7 @@
         "h": 12,
         "w": 12,
         "x": 0,
-        "y": 32
+        "y": 38
       },
       "targets": [
         {
@@ -1660,7 +1892,7 @@
         "h": 12,
         "w": 12,
         "x": 12,
-        "y": 32
+        "y": 38
       },
       "targets": [
         {
@@ -1707,7 +1939,7 @@
         "h": 10,
         "w": 12,
         "x": 0,
-        "y": 44
+        "y": 50
       },
       "targets": [
         {
@@ -1744,7 +1976,7 @@
         "h": 10,
         "w": 12,
         "x": 12,
-        "y": 44
+        "y": 50
       },
       "targets": [
         {
@@ -1781,7 +2013,7 @@
         "h": 10,
         "w": 12,
         "x": 0,
-        "y": 54
+        "y": 60
       },
       "targets": [
         {
@@ -1832,7 +2064,7 @@
         "h": 10,
         "w": 12,
         "x": 12,
-        "y": 54
+        "y": 60
       },
       "targets": [
         {
@@ -1913,7 +2145,7 @@
         "h": 7,
         "w": 8,
         "x": 0,
-        "y": 25
+        "y": 31
       },
       "targets": [
         {
@@ -1957,7 +2189,7 @@
         "h": 7,
         "w": 8,
         "x": 8,
-        "y": 25
+        "y": 31
       },
       "targets": [
         {
@@ -2001,7 +2233,7 @@
         "h": 7,
         "w": 8,
         "x": 16,
-        "y": 25
+        "y": 31
       },
       "targets": [
         {
@@ -2045,7 +2277,7 @@
         "h": 16,
         "w": 12,
         "x": 0,
-        "y": 64
+        "y": 70
       },
       "targets": [
         {
@@ -2093,7 +2325,7 @@
         "h": 16,
         "w": 12,
         "x": 12,
-        "y": 64
+        "y": 70
       },
       "targets": [
         {
diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-jobs.yaml
similarity index 84%
rename from services/monitoring/grafana-dashboard-testing.yaml
rename to services/monitoring/grafana-dashboard-jobs.yaml
index 52b28367..19e0d4eb 100644
--- a/services/monitoring/grafana-dashboard-testing.yaml
+++ b/services/monitoring/grafana-dashboard-jobs.yaml
@@ -1,425 +1,20 @@
-# services/monitoring/grafana-dashboard-testing.yaml
+# services/monitoring/grafana-dashboard-jobs.yaml
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: grafana-dashboard-testing
+  name: grafana-dashboard-jobs
   labels:
     grafana_dashboard: "1"
 data:
-  atlas-testing.json: |
+  atlas-jobs.json: |
     {
-      "uid": "atlas-testing",
-      "title": "Atlas Testing",
+      "uid": "atlas-jobs",
+      "title": "Atlas Jobs",
       "folderUid": "atlas-internal",
       "editable": true,
       "panels": [
         {
           "id": 1,
-          "type": "stat",
-          "title": "Glue Jobs Stale (>36h)",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 4,
-            "w": 6,
-            "x": 0,
-            "y": 0
-          },
-          "targets": [
-            {
-              "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))",
-              "refId": "A"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "color": {
-                "mode": "thresholds"
-              },
-              "mappings": [],
-              "thresholds": {
-                "mode": "absolute",
-                "steps": [
-                  {
-                    "color": "green",
-                    "value": null
-                  },
-                  {
-                    "color": "yellow",
-                    "value": 1
-                  },
-                  {
-                    "color": "orange",
-                    "value": 2
-                  },
-                  {
-                    "color": "red",
-                    "value": 3
-                  }
-                ]
-              },
-              "unit": "none",
-              "custom": {
-                "displayMode": "auto"
-              }
-            },
-            "overrides": []
-          },
-          "options": {
-            "colorMode": "value",
-            "graphMode": "area",
-            "justifyMode": "center",
-            "reduceOptions": {
-              "calcs": [
-                "lastNotNull"
-              ],
-              "fields": "",
-              "values": false
-            },
-            "textMode": "value"
-          }
-        },
-        {
-          "id": 2,
-          "type": "stat",
-          "title": "Glue Jobs Missing Success",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 4,
-            "w": 4,
-            "x": 4,
-            "y": 0
-          },
-          "targets": [
-            {
-              "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))",
-              "refId": "A"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "color": {
-                "mode": "thresholds"
-              },
-              "mappings": [],
-              "thresholds": {
-                "mode": "absolute",
-                "steps": [
-                  {
-                    "color": "rgba(115, 115, 115, 1)",
-                    "value": null
-                  },
-                  {
-                    "color": "green",
-                    "value": 1
-                  }
-                ]
-              },
-              "unit": "none",
-              "custom": {
-                "displayMode": "auto"
-              }
-            },
-            "overrides": []
-          },
-          "options": {
-            "colorMode": "value",
-            "graphMode": "area",
-            "justifyMode": "center",
-            "reduceOptions": {
-              "calcs": [
-                "lastNotNull"
-              ],
-              "fields": "",
-              "values": false
-            },
-            "textMode": "value"
-          }
-        },
-        {
-          "id": 3,
-          "type": "stat",
-          "title": "Glue Jobs Suspended",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 4,
-            "w": 4,
-            "x": 8,
-            "y": 0
-          },
-          "targets": [
-            {
-              "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)",
-              "refId": "A"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "color": {
-                "mode": "thresholds"
-              },
-              "mappings": [],
-              "thresholds": {
-                "mode": "absolute",
-                "steps": [
-                  {
-                    "color": "rgba(115, 115, 115, 1)",
-                    "value": null
-                  },
-                  {
-                    "color": "green",
-                    "value": 1
-                  }
-                ]
-              },
-              "unit": "none",
-              "custom": {
-                "displayMode": "auto"
-              }
-            },
-            "overrides": []
-          },
-          "options": {
-            "colorMode": "value",
-            "graphMode": "area",
-            "justifyMode": "center",
-            "reduceOptions": {
-              "calcs": [
-                "lastNotNull"
-              ],
-              "fields": "",
-              "values": false
-            },
-            "textMode": "value"
-          }
-        },
-        {
-          "id": 4,
-          "type": "stat",
-          "title": "Ariadne Task Errors (1h)",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 4,
-            "w": 4,
-            "x": 12,
-            "y": 0
-          },
-          "targets": [
-            {
-              "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
-              "refId": "A"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "color": {
-                "mode": "thresholds"
-              },
-              "mappings": [],
-              "thresholds": {
-                "mode": "absolute",
-                "steps": [
-                  {
-                    "color": "rgba(115, 115, 115, 1)",
-                    "value": null
-                  },
-                  {
-                    "color": "green",
-                    "value": 1
-                  }
-                ]
-              },
-              "unit": "none",
-              "custom": {
-                "displayMode": "auto"
-              }
-            },
-            "overrides": []
-          },
-          "options": {
-            "colorMode": "value",
-            "graphMode": "area",
-            "justifyMode": "center",
-            "reduceOptions": {
-              "calcs": [
-                "lastNotNull"
-              ],
-              "fields": "",
-              "values": false
-            },
-            "textMode": "value"
-          }
-        },
-        {
-          "id": 5,
-          "type": "stat",
-          "title": "Ariadne Task Errors (24h)",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 4,
-            "w": 4,
-            "x": 16,
-            "y": 0
-          },
-          "targets": [
-            {
-              "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))",
-              "refId": "A"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "color": {
-                "mode": "thresholds"
-              },
-              "mappings": [],
-              "thresholds": {
-                "mode": "absolute",
-                "steps": [
-                  {
-                    "color": "rgba(115, 115, 115, 1)",
-                    "value": null
-                  },
-                  {
-                    "color": "green",
-                    "value": 1
-                  }
-                ]
-              },
-              "unit": "none",
-              "custom": {
-                "displayMode": "auto"
-              }
-            },
-            "overrides": []
-          },
-          "options": {
-            "colorMode": "value",
-            "graphMode": "area",
-            "justifyMode": "center",
-            "reduceOptions": {
-              "calcs": [
-                "lastNotNull"
-              ],
-              "fields": "",
-              "values": false
-            },
-            "textMode": "value"
-          }
-        },
-        {
-          "id": 6,
-          "type": "stat",
-          "title": "Ariadne Task Runs (1h)",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 4,
-            "w": 4,
-            "x": 20,
-            "y": 0
-          },
-          "targets": [
-            {
-              "expr": "sum(increase(ariadne_task_runs_total[1h]))",
-              "refId": "A"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "color": {
-                "mode": "thresholds"
-              },
-              "mappings": [],
-              "thresholds": {
-                "mode": "absolute",
-                "steps": [
-                  {
-                    "color": "rgba(115, 115, 115, 1)",
-                    "value": null
-                  },
-                  {
-                    "color": "green",
-                    "value": 1
-                  }
-                ]
-              },
-              "unit": "none",
-              "custom": {
-                "displayMode": "auto"
-              }
-            },
-            "overrides": []
-          },
-          "options": {
-            "colorMode": "value",
-            "graphMode": "area",
-            "justifyMode": "center",
-            "reduceOptions": {
-              "calcs": [
-                "lastNotNull"
-              ],
-              "fields": "",
-              "values": false
-            },
-            "textMode": "value"
-          }
-        },
-        {
-          "id": 7,
-          "type": "timeseries",
-          "title": "Ariadne Task Runs vs Errors (1h)",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 6,
-            "w": 24,
-            "x": 0,
-            "y": 4
-          },
-          "targets": [
-            {
-              "expr": "sum by (status) (increase(ariadne_task_runs_total[1h]))",
-              "refId": "A",
-              "legendFormat": "{{status}}"
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "none"
-            },
-            "overrides": []
-          },
-          "options": {
-            "legend": {
-              "displayMode": "table",
-              "placement": "right"
-            },
-            "tooltip": {
-              "mode": "multi"
-            }
-          }
-        },
-        {
-          "id": 8,
           "type": "bargauge",
           "title": "Ariadne Task Errors (24h)",
           "datasource": {
@@ -427,10 +22,10 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 8,
-            "w": 12,
+            "h": 7,
+            "w": 6,
             "x": 0,
-            "y": 10
+            "y": 0
           },
           "targets": [
             {
@@ -493,50 +88,92 @@ data:
           ]
         },
         {
-          "id": 9,
-          "type": "bargauge",
-          "title": "Ariadne Task Success (24h)",
+          "id": 2,
+          "type": "timeseries",
+          "title": "Ariadne Attempts vs Failures (1h)",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 8,
+            "h": 7,
             "w": 12,
-            "x": 12,
-            "y": 10
+            "x": 6,
+            "y": 0
           },
           "targets": [
             {
-              "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"ok\"}[24h]))",
+              "expr": "sum(increase(ariadne_task_runs_total[1h]))",
               "refId": "A",
-              "legendFormat": "{{task}}",
+              "legendFormat": "Attempts"
+            },
+            {
+              "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
+              "refId": "B",
+              "legendFormat": "Failures"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "right"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 3,
+          "type": "bargauge",
+          "title": "One-off Job Pods (age hours)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 7,
+            "w": 6,
+            "x": 18,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})",
+              "refId": "A",
+              "legendFormat": "{{namespace}}/{{pod}}",
               "instant": true
             }
           ],
           "fieldConfig": {
             "defaults": {
-              "unit": "none",
+              "unit": "h",
               "min": 0,
               "max": null,
               "thresholds": {
                 "mode": "absolute",
                 "steps": [
                   {
-                    "color": "red",
+                    "color": "green",
                     "value": null
                   },
-                  {
-                    "color": "orange",
-                    "value": 1
-                  },
                   {
                     "color": "yellow",
-                    "value": 5
+                    "value": 6
                   },
                   {
-                    "color": "green",
-                    "value": 10
+                    "color": "orange",
+                    "value": 24
+                  },
+                  {
+                    "color": "red",
+                    "value": 48
                   }
                 ]
               }
@@ -563,9 +200,383 @@ data:
                 ],
                 "order": "desc"
               }
+            },
+            {
+              "id": "limit",
+              "options": {
+                "limit": 12
+              }
             }
           ]
         },
+        {
+          "id": 4,
+          "type": "stat",
+          "title": "Glue Jobs Stale (>36h)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 4,
+            "w": 4,
+            "x": 0,
+            "y": 7
+          },
+          "targets": [
+            {
+              "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "thresholds"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 1
+                  },
+                  {
+                    "color": "orange",
+                    "value": 2
+                  },
+                  {
+                    "color": "red",
+                    "value": 3
+                  }
+                ]
+              },
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto"
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 5,
+          "type": "stat",
+          "title": "Glue Jobs Missing Success",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 4,
+            "w": 4,
+            "x": 4,
+            "y": 7
+          },
+          "targets": [
+            {
+              "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "thresholds"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto"
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 6,
+          "type": "stat",
+          "title": "Glue Jobs Suspended",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 4,
+            "w": 4,
+            "x": 8,
+            "y": 7
+          },
+          "targets": [
+            {
+              "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "thresholds"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto"
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 7,
+          "type": "stat",
+          "title": "Ariadne Task Errors (1h)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 4,
+            "w": 4,
+            "x": 12,
+            "y": 7
+          },
+          "targets": [
+            {
+              "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "thresholds"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto"
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 8,
+          "type": "stat",
+          "title": "Ariadne Task Errors (24h)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 4,
+            "w": 4,
+            "x": 16,
+            "y": 7
+          },
+          "targets": [
+            {
+              "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "thresholds"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto"
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
+        {
+          "id": 9,
+          "type": "stat",
+          "title": "Ariadne Task Runs (1h)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 4,
+            "w": 4,
+            "x": 20,
+            "y": 7
+          },
+          "targets": [
+            {
+              "expr": "sum(increase(ariadne_task_runs_total[1h]))",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "thresholds"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto"
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          }
+        },
         {
           "id": 10,
           "type": "bargauge",
@@ -578,7 +589,7 @@ data:
             "h": 8,
             "w": 12,
             "x": 0,
-            "y": 18
+            "y": 11
           },
           "targets": [
             {
@@ -652,7 +663,7 @@ data:
             "h": 8,
             "w": 12,
             "x": 12,
-            "y": 18
+            "y": 11
           },
           "targets": [
             {
@@ -726,7 +737,7 @@ data:
             "h": 8,
             "w": 12,
             "x": 0,
-            "y": 26
+            "y": 19
           },
           "targets": [
             {
@@ -800,7 +811,7 @@ data:
             "h": 8,
             "w": 12,
             "x": 12,
-            "y": 26
+            "y": 19
           },
           "targets": [
             {
@@ -865,6 +876,154 @@ data:
         {
           "id": 14,
           "type": "bargauge",
+          "title": "Ariadne Task Errors (1h)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 27
+          },
+          "targets": [
+            {
+              "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
+              "refId": "A",
+              "legendFormat": "{{task}}",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none",
+              "min": 0,
+              "max": null,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 1
+                  },
+                  {
+                    "color": "orange",
+                    "value": 3
+                  },
+                  {
+                    "color": "red",
+                    "value": 5
+                  }
+                ]
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "displayMode": "gradient",
+            "orientation": "horizontal",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          },
+          "transformations": [
+            {
+              "id": "sortBy",
+              "options": {
+                "fields": [
+                  "Value"
+                ],
+                "order": "desc"
+              }
+            }
+          ]
+        },
+        {
+          "id": 15,
+          "type": "bargauge",
+          "title": "Ariadne Task Errors (30d)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 27
+          },
+          "targets": [
+            {
+              "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d]))",
+              "refId": "A",
+              "legendFormat": "{{task}}",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none",
+              "min": 0,
+              "max": null,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 1
+                  },
+                  {
+                    "color": "orange",
+                    "value": 3
+                  },
+                  {
+                    "color": "red",
+                    "value": 5
+                  }
+                ]
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "displayMode": "gradient",
+            "orientation": "horizontal",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          },
+          "transformations": [
+            {
+              "id": "sortBy",
+              "options": {
+                "fields": [
+                  "Value"
+                ],
+                "order": "desc"
+              }
+            }
+          ]
+        },
+        {
+          "id": 16,
+          "type": "bargauge",
           "title": "Ariadne Access Requests",
           "datasource": {
             "type": "prometheus",
@@ -874,7 +1033,7 @@ data:
             "h": 6,
             "w": 8,
             "x": 0,
-            "y": 34
+            "y": 35
           },
           "targets": [
             {
@@ -937,7 +1096,7 @@ data:
           ]
         },
         {
-          "id": 15,
+          "id": 17,
           "type": "stat",
           "title": "Ariadne CI Coverage (%)",
           "datasource": {
@@ -948,7 +1107,7 @@ data:
             "h": 6,
             "w": 4,
             "x": 8,
-            "y": 34
+            "y": 35
           },
           "targets": [
             {
@@ -1000,7 +1159,7 @@ data:
           }
         },
         {
-          "id": 16,
+          "id": 18,
           "type": "table",
           "title": "Ariadne CI Tests (latest)",
           "datasource": {
@@ -1011,7 +1170,7 @@ data:
             "h": 6,
             "w": 12,
             "x": 12,
-            "y": 34
+            "y": 35
           },
           "targets": [
             {
@@ -1061,6 +1220,7 @@ data:
       "style": "dark",
       "tags": [
         "atlas",
-        "testing"
+        "jobs",
+        "glue"
       ]
     }
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 8ad75238..45969ccf 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -804,7 +804,7 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 2,
+            "h": 3,
             "w": 5,
             "x": 0,
             "y": 8
@@ -871,7 +871,7 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 2,
+            "h": 3,
             "w": 5,
             "x": 10,
             "y": 8
@@ -976,7 +976,7 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 2,
+            "h": 3,
             "w": 5,
             "x": 5,
             "y": 8
@@ -1052,7 +1052,7 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 2,
+            "h": 3,
             "w": 5,
             "x": 15,
             "y": 8
@@ -1128,10 +1128,10 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 6,
+            "h": 5,
             "w": 6,
             "x": 0,
-            "y": 10
+            "y": 11
           },
           "targets": [
             {
@@ -1203,10 +1203,10 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 6,
+            "h": 5,
             "w": 6,
             "x": 6,
-            "y": 10
+            "y": 11
           },
           "targets": [
             {
@@ -1278,10 +1278,10 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 6,
+            "h": 5,
             "w": 6,
             "x": 12,
-            "y": 10
+            "y": 11
           },
           "targets": [
             {
@@ -1345,10 +1345,10 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 6,
+            "h": 5,
             "w": 6,
             "x": 18,
-            "y": 10
+            "y": 11
           },
           "targets": [
             {
@@ -1403,6 +1403,238 @@ data:
             }
           ]
         },
+        {
+          "id": 40,
+          "type": "bargauge",
+          "title": "One-off Job Pods (age hours)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 6,
+            "w": 4,
+            "x": 0,
+            "y": 16
+          },
+          "targets": [
+            {
+              "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})",
+              "refId": "A",
+              "legendFormat": "{{namespace}}/{{pod}}",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "h",
+              "min": 0,
+              "max": null,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 6
+                  },
+                  {
+                    "color": "orange",
+                    "value": 24
+                  },
+                  {
+                    "color": "red",
+                    "value": 48
+                  }
+                ]
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "displayMode": "gradient",
+            "orientation": "horizontal",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          },
+          "transformations": [
+            {
+              "id": "sortBy",
+              "options": {
+                "fields": [
+                  "Value"
+                ],
+                "order": "desc"
+              }
+            },
+            {
+              "id": "limit",
+              "options": {
+                "limit": 8
+              }
+            }
+          ]
+        },
+        {
+          "id": 41,
+          "type": "timeseries",
+          "title": "Ariadne Attempts vs Failures (1h)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 6,
+            "w": 8,
+            "x": 4,
+            "y": 16
+          },
+          "targets": [
+            {
+              "expr": "sum(increase(ariadne_task_runs_total[1h]))",
+              "refId": "A",
+              "legendFormat": "Attempts"
+            },
+            {
+              "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
+              "refId": "B",
+              "legendFormat": "Failures"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "right"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 42,
+          "type": "timeseries",
+          "title": "Ariadne Test Success Rate",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 6,
+            "w": 8,
+            "x": 12,
+            "y": 16
+          },
+          "targets": [
+            {
+              "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "list",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 43,
+          "type": "bargauge",
+          "title": "Tests with Failures (24h)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 6,
+            "w": 4,
+            "x": 20,
+            "y": 16
+          },
+          "targets": [
+            {
+              "expr": "sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h]))",
+              "refId": "A",
+              "legendFormat": "{{result}}",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none",
+              "min": 0,
+              "max": null,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 1
+                  },
+                  {
+                    "color": "orange",
+                    "value": 5
+                  },
+                  {
+                    "color": "red",
+                    "value": 10
+                  }
+                ]
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "displayMode": "gradient",
+            "orientation": "horizontal",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          },
+          "transformations": [
+            {
+              "id": "sortBy",
+              "options": {
+                "fields": [
+                  "Value"
+                ],
+                "order": "desc"
+              }
+            }
+          ]
+        },
         {
           "id": 11,
           "type": "piechart",
@@ -1415,7 +1647,7 @@ data:
             "h": 9,
             "w": 8,
             "x": 0,
-            "y": 16
+            "y": 22
           },
           "targets": [
             {
@@ -1484,7 +1716,7 @@ data:
             "h": 9,
             "w": 8,
             "x": 8,
-            "y": 16
+            "y": 22
           },
           "targets": [
             {
@@ -1553,7 +1785,7 @@ data:
             "h": 9,
             "w": 8,
             "x": 16,
-            "y": 16
+            "y": 22
           },
           "targets": [
             {
@@ -1622,7 +1854,7 @@ data:
             "h": 12,
             "w": 12,
             "x": 0,
-            "y": 32
+            "y": 38
           },
           "targets": [
             {
@@ -1669,7 +1901,7 @@ data:
             "h": 12,
             "w": 12,
             "x": 12,
-            "y": 32
+            "y": 38
           },
           "targets": [
             {
@@ -1716,7 +1948,7 @@ data:
             "h": 10,
             "w": 12,
             "x": 0,
-            "y": 44
+            "y": 50
           },
           "targets": [
             {
@@ -1753,7 +1985,7 @@ data:
             "h": 10,
             "w": 12,
             "x": 12,
-            "y": 44
+            "y": 50
           },
           "targets": [
             {
@@ -1790,7 +2022,7 @@ data:
             "h": 10,
             "w": 12,
             "x": 0,
-            "y": 54
+            "y": 60
           },
           "targets": [
             {
@@ -1841,7 +2073,7 @@ data:
             "h": 10,
             "w": 12,
             "x": 12,
-            "y": 54
+            "y": 60
           },
           "targets": [
             {
@@ -1922,7 +2154,7 @@ data:
             "h": 7,
             "w": 8,
             "x": 0,
-            "y": 25
+            "y": 31
           },
           "targets": [
             {
@@ -1966,7 +2198,7 @@ data:
             "h": 7,
             "w": 8,
             "x": 8,
-            "y": 25
+            "y": 31
           },
           "targets": [
             {
@@ -2010,7 +2242,7 @@ data:
             "h": 7,
             "w": 8,
             "x": 16,
-            "y": 25
+            "y": 31
           },
           "targets": [
             {
@@ -2054,7 +2286,7 @@ data:
             "h": 16,
             "w": 12,
             "x": 0,
-            "y": 64
+            "y": 70
           },
           "targets": [
             {
@@ -2102,7 +2334,7 @@ data:
             "h": 16,
             "w": 12,
             "x": 12,
-            "y": 64
+            "y": 70
           },
           "targets": [
             {
diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index 02bc4821..ac24f8a0 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -471,14 +471,14 @@ spec:
             editable: true
             options:
               path: /var/lib/grafana/dashboards/mail
-          - name: testing
+          - name: jobs
             orgId: 1
             folder: Atlas Internal
             type: file
             disableDeletion: false
             editable: true
             options:
-              path: /var/lib/grafana/dashboards/testing
+              path: /var/lib/grafana/dashboards/jobs
     dashboardsConfigMaps:
       overview: grafana-dashboard-overview
       overview-public: grafana-dashboard-overview
@@ -488,7 +488,7 @@ spec:
       gpu: grafana-dashboard-gpu
       network: grafana-dashboard-network
       mail: grafana-dashboard-mail
-      testing: grafana-dashboard-testing
+      jobs: grafana-dashboard-jobs
     extraConfigmapMounts:
       - name: grafana-folders
         mountPath: /etc/grafana/provisioning/folders
diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml
index 86ab8269..59530390 100644
--- a/services/monitoring/kustomization.yaml
+++ b/services/monitoring/kustomization.yaml
@@ -14,7 +14,7 @@ resources:
   - grafana-dashboard-network.yaml
   - grafana-dashboard-gpu.yaml
   - grafana-dashboard-mail.yaml
-  - grafana-dashboard-testing.yaml
+  - grafana-dashboard-jobs.yaml
   - dcgm-exporter.yaml
   - jetson-tegrastats-exporter.yaml
   - postmark-exporter-service.yaml

From a8e646f716d7548362e59bb61015370a004eddef Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Wed, 21 Jan 2026 16:40:09 +0000
Subject: [PATCH 087/416] chore(maintenance): automated image update

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 05f3be2f..6cb2acd4 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -25,7 +25,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-16 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-17 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From 2fd87aea45446dcb57cd8a1d371bd2574ffccf5b Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 14:30:55 -0300
Subject: [PATCH 088/416] monitoring: refine jobs/overview panels

---
 scripts/dashboards_render_atlas.py            | 162 ++++++++++++------
 .../monitoring/dashboards/atlas-jobs.json     | 119 ++++++++-----
 .../monitoring/dashboards/atlas-nodes.json    |   4 +-
 .../monitoring/dashboards/atlas-overview.json | 135 +++++++++------
 .../monitoring/dashboards/atlas-pods.json     |   2 +-
 .../monitoring/grafana-dashboard-jobs.yaml    | 119 ++++++++-----
 .../monitoring/grafana-dashboard-nodes.yaml   |   4 +-
 .../grafana-dashboard-overview.yaml           | 135 +++++++++------
 .../monitoring/grafana-dashboard-pods.yaml    |   2 +-
 9 files changed, 446 insertions(+), 236 deletions(-)

diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index 1235a0aa..3d581c70 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -70,6 +70,7 @@ WORKER_NODES = [
     "titan-13",
     "titan-14",
     "titan-15",
+    "titan-16",
     "titan-17",
     "titan-18",
     "titan-19",
@@ -333,9 +334,10 @@ GLUE_STALE = f"({GLUE_LAST_SUCCESS_AGE} > bool {GLUE_STALE_WINDOW_SEC})"
 GLUE_MISSING = f"({GLUE_JOBS} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time)"
 GLUE_STALE_ACTIVE = f"({GLUE_STALE} unless on(namespace,cronjob) {GLUE_SUSPENDED})"
 GLUE_MISSING_ACTIVE = f"({GLUE_MISSING} unless on(namespace,cronjob) {GLUE_SUSPENDED})"
-GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE}))"
-GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE})"
-GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})"
+GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE})) or on() vector(0)"
+GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE}) or on() vector(0)"
+GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED}) or on() vector(0)"
+ARIADNE_TASK_ERRORS_RANGE = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[$__range]))'
 ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))'
 ARIADNE_TASK_ERRORS_1H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[1h]))'
 ARIADNE_TASK_ERRORS_30D = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[30d]))'
@@ -344,10 +346,19 @@ ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_to
 ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))'
 ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))'
 ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))'
-ARIADNE_TASK_ATTEMPTS_1H = 'sum(increase(ariadne_task_runs_total[1h]))'
-ARIADNE_TASK_FAILURES_1H = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))'
+ARIADNE_TASK_ATTEMPTS_SERIES = 'sum(increase(ariadne_task_runs_total[$__interval]))'
+ARIADNE_TASK_FAILURES_SERIES = 'sum(increase(ariadne_task_runs_total{status="error"}[$__interval]))'
+ARIADNE_TASK_WARNINGS_SERIES = (
+    'sum(increase(ariadne_task_runs_total{status!~"ok|error"}[$__interval])) or on() vector(0)'
+)
 ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600"
 ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600"
+ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS = (
+    "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600"
+)
+ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = (
+    "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600"
+)
 ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
 ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}'
 ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}'
@@ -370,6 +381,8 @@ ONEOFF_JOB_POD_AGE_HOURS = (
     '* on(namespace,pod) group_left(phase) '
     'max by (namespace,pod,phase) (kube_pod_status_phase{phase=~"Running|Succeeded"})'
 )
+GLUE_LAST_SUCCESS_RANGE_HOURS = f"(time() - max_over_time({GLUE_LAST_SUCCESS}[$__range])) / 3600"
+GLUE_LAST_SCHEDULE_RANGE_HOURS = f"(time() - max_over_time({GLUE_LAST_SCHEDULE}[$__range])) / 3600"
 GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
 GPU_NODE_REGEX = "|".join(GPU_NODES)
 TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
@@ -1032,7 +1045,7 @@ def build_overview():
             30,
             "Mail Sent (1d)",
             'max(postmark_outbound_sent{window="1d"})',
-            {"h": 3, "w": 5, "x": 0, "y": 8},
+            {"h": 3, "w": 6, "x": 0, "y": 8},
             unit="none",
             links=link_to("atlas-mail"),
         )
@@ -1043,7 +1056,7 @@ def build_overview():
             "type": "stat",
             "title": "Mail Bounces (1d)",
             "datasource": PROM_DS,
-            "gridPos": {"h": 3, "w": 5, "x": 10, "y": 8},
+            "gridPos": {"h": 3, "w": 6, "x": 12, "y": 8},
             "targets": [
                 {
                     "expr": 'max(postmark_outbound_bounce_rate{window="1d"})',
@@ -1089,7 +1102,7 @@ def build_overview():
             32,
             "Mail Success Rate (1d)",
             'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)',
-            {"h": 3, "w": 5, "x": 5, "y": 8},
+            {"h": 3, "w": 6, "x": 6, "y": 8},
             unit="percent",
             thresholds=mail_success_thresholds,
             decimals=1,
@@ -1101,7 +1114,7 @@ def build_overview():
             33,
             "Mail Limit Used (30d)",
             "max(postmark_sending_limit_used_percent)",
-            {"h": 3, "w": 5, "x": 15, "y": 8},
+            {"h": 3, "w": 6, "x": 18, "y": 8},
             unit="percent",
             thresholds=mail_limit_thresholds,
             decimals=1,
@@ -1121,7 +1134,7 @@ def build_overview():
                 panel_id,
                 title,
                 expr,
-                {"h": 5, "w": 6, "x": 6 * idx, "y": 11},
+                {"h": 3, "w": 6, "x": 6 * idx, "y": 11},
                 unit=unit,
                 thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
                 links=link_to("atlas-storage"),
@@ -1133,26 +1146,44 @@ def build_overview():
             40,
             "One-off Job Pods (age hours)",
             ONEOFF_JOB_POD_AGE_HOURS,
-            {"h": 6, "w": 4, "x": 0, "y": 16},
+            {"h": 6, "w": 6, "x": 0, "y": 14},
             unit="h",
             instant=True,
             legend="{{namespace}}/{{pod}}",
             thresholds=age_thresholds,
             limit=8,
+            decimals=2,
         )
     )
     panels.append(
         {
             "id": 41,
             "type": "timeseries",
-            "title": "Ariadne Attempts vs Failures (1h)",
+            "title": "Ariadne Attempts / Warnings / Failures",
             "datasource": PROM_DS,
-            "gridPos": {"h": 6, "w": 8, "x": 4, "y": 16},
+            "gridPos": {"h": 6, "w": 6, "x": 6, "y": 14},
             "targets": [
-                {"expr": ARIADNE_TASK_ATTEMPTS_1H, "refId": "A", "legendFormat": "Attempts"},
-                {"expr": ARIADNE_TASK_FAILURES_1H, "refId": "B", "legendFormat": "Failures"},
+                {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"},
+                {"expr": ARIADNE_TASK_WARNINGS_SERIES, "refId": "B", "legendFormat": "Warnings"},
+                {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "C", "legendFormat": "Failures"},
             ],
-            "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []},
+            "fieldConfig": {
+                "defaults": {"unit": "none"},
+                "overrides": [
+                    {
+                        "matcher": {"id": "byName", "options": "Warnings"},
+                        "properties": [
+                            {"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}}
+                        ],
+                    },
+                    {
+                        "matcher": {"id": "byName", "options": "Failures"},
+                        "properties": [
+                            {"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}
+                        ],
+                    },
+                ],
+            },
             "options": {
                 "legend": {"displayMode": "table", "placement": "right"},
                 "tooltip": {"mode": "multi"},
@@ -1164,7 +1195,7 @@ def build_overview():
             42,
             "Ariadne Test Success Rate",
             ARIADNE_TEST_SUCCESS_RATE,
-            {"h": 6, "w": 8, "x": 12, "y": 16},
+            {"h": 6, "w": 6, "x": 12, "y": 14},
             unit="percent",
             legend=None,
             legend_display="list",
@@ -1175,7 +1206,7 @@ def build_overview():
             43,
             "Tests with Failures (24h)",
             ARIADNE_TEST_FAILURES_24H,
-            {"h": 6, "w": 4, "x": 20, "y": 16},
+            {"h": 6, "w": 6, "x": 18, "y": 14},
             unit="none",
             instant=True,
             legend="{{result}}",
@@ -1200,7 +1231,7 @@ def build_overview():
             11,
             "Namespace CPU Share",
             namespace_cpu_share_expr(cpu_scope),
-            {"h": 9, "w": 8, "x": 0, "y": 22},
+            {"h": 9, "w": 8, "x": 0, "y": 20},
             links=namespace_scope_links("namespace_scope_cpu"),
             description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
         )
@@ -1210,7 +1241,7 @@ def build_overview():
             12,
             "Namespace GPU Share",
             namespace_gpu_share_expr(gpu_scope),
-            {"h": 9, "w": 8, "x": 8, "y": 22},
+            {"h": 9, "w": 8, "x": 8, "y": 20},
             links=namespace_scope_links("namespace_scope_gpu"),
             description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
         )
@@ -1220,7 +1251,7 @@ def build_overview():
             13,
             "Namespace RAM Share",
             namespace_ram_share_expr(ram_scope),
-            {"h": 9, "w": 8, "x": 16, "y": 22},
+            {"h": 9, "w": 8, "x": 16, "y": 20},
             links=namespace_scope_links("namespace_scope_ram"),
             description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
         )
@@ -1232,7 +1263,7 @@ def build_overview():
             14,
             "Worker Node CPU",
             node_cpu_expr(worker_filter),
-            {"h": 12, "w": 12, "x": 0, "y": 38},
+            {"h": 12, "w": 12, "x": 0, "y": 36},
             unit="percent",
             legend="{{node}}",
             legend_calcs=["last"],
@@ -1246,7 +1277,7 @@ def build_overview():
             15,
             "Worker Node RAM",
             node_mem_expr(worker_filter),
-            {"h": 12, "w": 12, "x": 12, "y": 38},
+            {"h": 12, "w": 12, "x": 12, "y": 36},
             unit="percent",
             legend="{{node}}",
             legend_calcs=["last"],
@@ -1261,7 +1292,7 @@ def build_overview():
             16,
             "Control plane CPU",
             node_cpu_expr(CONTROL_ALL_REGEX),
-            {"h": 10, "w": 12, "x": 0, "y": 50},
+            {"h": 10, "w": 12, "x": 0, "y": 48},
             unit="percent",
             legend="{{node}}",
             legend_display="table",
@@ -1273,7 +1304,7 @@ def build_overview():
             17,
             "Control plane RAM",
             node_mem_expr(CONTROL_ALL_REGEX),
-            {"h": 10, "w": 12, "x": 12, "y": 50},
+            {"h": 10, "w": 12, "x": 12, "y": 48},
             unit="percent",
             legend="{{node}}",
             legend_display="table",
@@ -1286,7 +1317,7 @@ def build_overview():
             28,
             "Node Pod Share",
             '(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100',
-            {"h": 10, "w": 12, "x": 0, "y": 60},
+            {"h": 10, "w": 12, "x": 0, "y": 58},
         )
     )
     panels.append(
@@ -1294,7 +1325,7 @@ def build_overview():
             29,
             "Top Nodes by Pod Count",
             'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))',
-            {"h": 10, "w": 12, "x": 12, "y": 60},
+            {"h": 10, "w": 12, "x": 12, "y": 58},
             unit="none",
             limit=12,
             decimals=0,
@@ -1316,7 +1347,7 @@ def build_overview():
             18,
             "Cluster Ingress Throughput",
             NET_INGRESS_EXPR,
-            {"h": 7, "w": 8, "x": 0, "y": 31},
+            {"h": 7, "w": 8, "x": 0, "y": 29},
             unit="Bps",
             legend="Ingress (Traefik)",
             legend_display="list",
@@ -1329,7 +1360,7 @@ def build_overview():
             19,
             "Cluster Egress Throughput",
             NET_EGRESS_EXPR,
-            {"h": 7, "w": 8, "x": 8, "y": 31},
+            {"h": 7, "w": 8, "x": 8, "y": 29},
             unit="Bps",
             legend="Egress (Traefik)",
             legend_display="list",
@@ -1342,7 +1373,7 @@ def build_overview():
             20,
             "Intra-Cluster Throughput",
             NET_INTERNAL_EXPR,
-            {"h": 7, "w": 8, "x": 16, "y": 31},
+            {"h": 7, "w": 8, "x": 16, "y": 29},
             unit="Bps",
             legend="Internal traffic",
             legend_display="list",
@@ -1356,7 +1387,7 @@ def build_overview():
             21,
             "Root Filesystem Usage",
             root_usage_expr(),
-            {"h": 16, "w": 12, "x": 0, "y": 70},
+            {"h": 16, "w": 12, "x": 0, "y": 68},
             unit="percent",
             legend="{{node}}",
             legend_calcs=["last"],
@@ -1371,7 +1402,7 @@ def build_overview():
             22,
             "Nodes Closest to Full Root Disks",
             f"topk(12, {root_usage_expr()})",
-            {"h": 16, "w": 12, "x": 12, "y": 70},
+            {"h": 16, "w": 12, "x": 12, "y": 68},
             unit="percent",
             thresholds=PERCENT_THRESHOLDS,
             links=link_to("atlas-storage"),
@@ -2300,9 +2331,9 @@ def build_jobs_dashboard():
     panels.append(
         bargauge_panel(
             1,
-            "Ariadne Task Errors (24h)",
-            ARIADNE_TASK_ERRORS_24H,
-            {"h": 7, "w": 6, "x": 0, "y": 0},
+            "Ariadne Task Errors (range)",
+            ARIADNE_TASK_ERRORS_RANGE,
+            {"h": 7, "w": 8, "x": 0, "y": 0},
             unit="none",
             instant=True,
             legend="{{task}}",
@@ -2313,14 +2344,31 @@ def build_jobs_dashboard():
         {
             "id": 2,
             "type": "timeseries",
-            "title": "Ariadne Attempts vs Failures (1h)",
+            "title": "Ariadne Attempts / Warnings / Failures",
             "datasource": PROM_DS,
-            "gridPos": {"h": 7, "w": 12, "x": 6, "y": 0},
+            "gridPos": {"h": 7, "w": 8, "x": 8, "y": 0},
             "targets": [
-                {"expr": ARIADNE_TASK_ATTEMPTS_1H, "refId": "A", "legendFormat": "Attempts"},
-                {"expr": ARIADNE_TASK_FAILURES_1H, "refId": "B", "legendFormat": "Failures"},
+                {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"},
+                {"expr": ARIADNE_TASK_WARNINGS_SERIES, "refId": "B", "legendFormat": "Warnings"},
+                {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "C", "legendFormat": "Failures"},
             ],
-            "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []},
+            "fieldConfig": {
+                "defaults": {"unit": "none"},
+                "overrides": [
+                    {
+                        "matcher": {"id": "byName", "options": "Warnings"},
+                        "properties": [
+                            {"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}}
+                        ],
+                    },
+                    {
+                        "matcher": {"id": "byName", "options": "Failures"},
+                        "properties": [
+                            {"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}
+                        ],
+                    },
+                ],
+            },
             "options": {
                 "legend": {"displayMode": "table", "placement": "right"},
                 "tooltip": {"mode": "multi"},
@@ -2332,12 +2380,13 @@ def build_jobs_dashboard():
             3,
             "One-off Job Pods (age hours)",
             ONEOFF_JOB_POD_AGE_HOURS,
-            {"h": 7, "w": 6, "x": 18, "y": 0},
+            {"h": 7, "w": 8, "x": 16, "y": 0},
             unit="h",
             instant=True,
             legend="{{namespace}}/{{pod}}",
             thresholds=age_thresholds,
             limit=12,
+            decimals=2,
         )
     )
     panels.append(
@@ -2407,48 +2456,53 @@ def build_jobs_dashboard():
         bargauge_panel(
             10,
             "Ariadne Schedule Last Error (hours ago)",
-            ARIADNE_SCHEDULE_LAST_ERROR_HOURS,
-            {"h": 8, "w": 12, "x": 0, "y": 11},
+            ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS,
+            {"h": 6, "w": 12, "x": 0, "y": 17},
             unit="h",
             instant=True,
             legend="{{task}}",
             thresholds=recent_error_thresholds,
+            sort_order="asc",
+            decimals=2,
         )
     )
     panels.append(
         bargauge_panel(
             11,
             "Ariadne Schedule Last Success (hours ago)",
-            ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS,
-            {"h": 8, "w": 12, "x": 12, "y": 11},
+            ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS,
+            {"h": 6, "w": 12, "x": 12, "y": 17},
             unit="h",
             instant=True,
             legend="{{task}}",
             thresholds=age_thresholds,
+            decimals=2,
         )
     )
     panels.append(
         bargauge_panel(
             12,
             "Glue Jobs Last Success (hours ago)",
-            GLUE_LAST_SUCCESS_AGE_HOURS,
-            {"h": 8, "w": 12, "x": 0, "y": 19},
+            GLUE_LAST_SUCCESS_RANGE_HOURS,
+            {"h": 6, "w": 12, "x": 0, "y": 23},
             unit="h",
             instant=True,
             legend="{{namespace}}/{{cronjob}}",
             thresholds=age_thresholds,
+            decimals=2,
         )
     )
     panels.append(
         bargauge_panel(
             13,
             "Glue Jobs Last Schedule (hours ago)",
-            GLUE_LAST_SCHEDULE_AGE_HOURS,
-            {"h": 8, "w": 12, "x": 12, "y": 19},
+            GLUE_LAST_SCHEDULE_RANGE_HOURS,
+            {"h": 6, "w": 12, "x": 12, "y": 23},
             unit="h",
             instant=True,
             legend="{{namespace}}/{{cronjob}}",
             thresholds=age_thresholds,
+            decimals=2,
         )
     )
     panels.append(
@@ -2456,7 +2510,7 @@ def build_jobs_dashboard():
             14,
             "Ariadne Task Errors (1h)",
             ARIADNE_TASK_ERRORS_1H,
-            {"h": 8, "w": 12, "x": 0, "y": 27},
+            {"h": 6, "w": 12, "x": 0, "y": 29},
             unit="none",
             instant=True,
             legend="{{task}}",
@@ -2468,7 +2522,7 @@ def build_jobs_dashboard():
             15,
             "Ariadne Task Errors (30d)",
             ARIADNE_TASK_ERRORS_30D,
-            {"h": 8, "w": 12, "x": 12, "y": 27},
+            {"h": 6, "w": 12, "x": 12, "y": 29},
             unit="none",
             instant=True,
             legend="{{task}}",
@@ -2480,7 +2534,7 @@ def build_jobs_dashboard():
             16,
             "Ariadne Access Requests",
             ARIADNE_ACCESS_REQUESTS,
-            {"h": 6, "w": 8, "x": 0, "y": 35},
+            {"h": 6, "w": 8, "x": 0, "y": 11},
             unit="none",
             instant=True,
             legend="{{status}}",
@@ -2491,7 +2545,7 @@ def build_jobs_dashboard():
             17,
             "Ariadne CI Coverage (%)",
             ARIADNE_CI_COVERAGE,
-            {"h": 6, "w": 4, "x": 8, "y": 35},
+            {"h": 6, "w": 4, "x": 8, "y": 11},
             unit="percent",
             decimals=1,
             instant=True,
@@ -2503,7 +2557,7 @@ def build_jobs_dashboard():
             18,
             "Ariadne CI Tests (latest)",
             ARIADNE_CI_TESTS,
-            {"h": 6, "w": 12, "x": 12, "y": 35},
+            {"h": 6, "w": 12, "x": 12, "y": 11},
             unit="none",
             transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}],
             instant=True,
diff --git a/services/monitoring/dashboards/atlas-jobs.json b/services/monitoring/dashboards/atlas-jobs.json
index 76e21f01..c70e9c0f 100644
--- a/services/monitoring/dashboards/atlas-jobs.json
+++ b/services/monitoring/dashboards/atlas-jobs.json
@@ -7,20 +7,20 @@
     {
       "id": 1,
       "type": "bargauge",
-      "title": "Ariadne Task Errors (24h)",
+      "title": "Ariadne Task Errors (range)",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
       },
       "gridPos": {
         "h": 7,
-        "w": 6,
+        "w": 8,
         "x": 0,
         "y": 0
       },
       "targets": [
         {
-          "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[24h]))",
+          "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range]))",
           "refId": "A",
           "legendFormat": "{{task}}",
           "instant": true
@@ -81,26 +81,31 @@
     {
       "id": 2,
       "type": "timeseries",
-      "title": "Ariadne Attempts vs Failures (1h)",
+      "title": "Ariadne Attempts / Warnings / Failures",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
       },
       "gridPos": {
         "h": 7,
-        "w": 12,
-        "x": 6,
+        "w": 8,
+        "x": 8,
         "y": 0
       },
       "targets": [
         {
-          "expr": "sum(increase(ariadne_task_runs_total[1h]))",
+          "expr": "sum(increase(ariadne_task_runs_total[$__interval]))",
           "refId": "A",
           "legendFormat": "Attempts"
         },
         {
-          "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
+          "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)",
           "refId": "B",
+          "legendFormat": "Warnings"
+        },
+        {
+          "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))",
+          "refId": "C",
           "legendFormat": "Failures"
         }
       ],
@@ -108,7 +113,38 @@
         "defaults": {
           "unit": "none"
         },
-        "overrides": []
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "Warnings"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "yellow"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "Failures"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "red"
+                }
+              }
+            ]
+          }
+        ]
       },
       "options": {
         "legend": {
@@ -130,8 +166,8 @@
       },
       "gridPos": {
         "h": 7,
-        "w": 6,
-        "x": 18,
+        "w": 8,
+        "x": 16,
         "y": 0
       },
       "targets": [
@@ -167,7 +203,8 @@
                 "value": 48
               }
             ]
-          }
+          },
+          "decimals": 2
         },
         "overrides": []
       },
@@ -216,7 +253,7 @@
       },
       "targets": [
         {
-          "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))",
+          "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)",
           "refId": "A"
         }
       ],
@@ -284,7 +321,7 @@
       },
       "targets": [
         {
-          "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))",
+          "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)",
           "refId": "A"
         }
       ],
@@ -344,7 +381,7 @@
       },
       "targets": [
         {
-          "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)",
+          "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)",
           "refId": "A"
         }
       ],
@@ -577,14 +614,14 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 8,
+        "h": 6,
         "w": 12,
         "x": 0,
-        "y": 11
+        "y": 17
       },
       "targets": [
         {
-          "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600",
+          "expr": "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600",
           "refId": "A",
           "legendFormat": "{{task}}",
           "instant": true
@@ -615,7 +652,8 @@
                 "value": 24
               }
             ]
-          }
+          },
+          "decimals": 2
         },
         "overrides": []
       },
@@ -637,7 +675,7 @@
             "fields": [
               "Value"
             ],
-            "order": "desc"
+            "order": "asc"
           }
         }
       ]
@@ -651,14 +689,14 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 8,
+        "h": 6,
         "w": 12,
         "x": 12,
-        "y": 11
+        "y": 17
       },
       "targets": [
         {
-          "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600",
+          "expr": "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600",
           "refId": "A",
           "legendFormat": "{{task}}",
           "instant": true
@@ -689,7 +727,8 @@
                 "value": 48
               }
             ]
-          }
+          },
+          "decimals": 2
         },
         "overrides": []
       },
@@ -725,14 +764,14 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 8,
+        "h": 6,
         "w": 12,
         "x": 0,
-        "y": 19
+        "y": 23
       },
       "targets": [
         {
-          "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
+          "expr": "(time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600",
           "refId": "A",
           "legendFormat": "{{namespace}}/{{cronjob}}",
           "instant": true
@@ -763,7 +802,8 @@
                 "value": 48
               }
             ]
-          }
+          },
+          "decimals": 2
         },
         "overrides": []
       },
@@ -799,14 +839,14 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 8,
+        "h": 6,
         "w": 12,
         "x": 12,
-        "y": 19
+        "y": 23
       },
       "targets": [
         {
-          "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
+          "expr": "(time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600",
           "refId": "A",
           "legendFormat": "{{namespace}}/{{cronjob}}",
           "instant": true
@@ -837,7 +877,8 @@
                 "value": 48
               }
             ]
-          }
+          },
+          "decimals": 2
         },
         "overrides": []
       },
@@ -873,10 +914,10 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 8,
+        "h": 6,
         "w": 12,
         "x": 0,
-        "y": 27
+        "y": 29
       },
       "targets": [
         {
@@ -947,10 +988,10 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 8,
+        "h": 6,
         "w": 12,
         "x": 12,
-        "y": 27
+        "y": 29
       },
       "targets": [
         {
@@ -1024,7 +1065,7 @@
         "h": 6,
         "w": 8,
         "x": 0,
-        "y": 35
+        "y": 11
       },
       "targets": [
         {
@@ -1098,7 +1139,7 @@
         "h": 6,
         "w": 4,
         "x": 8,
-        "y": 35
+        "y": 11
       },
       "targets": [
         {
@@ -1161,7 +1202,7 @@
         "h": 6,
         "w": 12,
         "x": 12,
-        "y": 35
+        "y": 11
       },
       "targets": [
         {
diff --git a/services/monitoring/dashboards/atlas-nodes.json b/services/monitoring/dashboards/atlas-nodes.json
index 2d60042b..ea595792 100644
--- a/services/monitoring/dashboards/atlas-nodes.json
+++ b/services/monitoring/dashboards/atlas-nodes.json
@@ -20,7 +20,7 @@
       },
       "targets": [
         {
-          "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})",
+          "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})",
           "refId": "A"
         }
       ],
@@ -46,7 +46,7 @@
           "unit": "none",
           "custom": {
             "displayMode": "auto",
-            "valueSuffix": "/19"
+            "valueSuffix": "/20"
           }
         },
         "overrides": []
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index c3ff327d..5acc2a3a 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -449,14 +449,14 @@
       },
       "targets": [
         {
-          "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})",
+          "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})",
           "refId": "A"
         }
       ],
       "fieldConfig": {
         "defaults": {
           "min": 0,
-          "max": 19,
+          "max": 20,
           "thresholds": {
             "mode": "absolute",
             "steps": [
@@ -466,15 +466,15 @@
               },
               {
                 "color": "orange",
-                "value": 17
-              },
-              {
-                "color": "yellow",
                 "value": 18
               },
               {
-                "color": "green",
+                "color": "yellow",
                 "value": 19
+              },
+              {
+                "color": "green",
+                "value": 20
               }
             ]
           }
@@ -796,7 +796,7 @@
       },
       "gridPos": {
         "h": 3,
-        "w": 5,
+        "w": 6,
         "x": 0,
         "y": 8
       },
@@ -863,8 +863,8 @@
       },
       "gridPos": {
         "h": 3,
-        "w": 5,
-        "x": 10,
+        "w": 6,
+        "x": 12,
         "y": 8
       },
       "targets": [
@@ -968,8 +968,8 @@
       },
       "gridPos": {
         "h": 3,
-        "w": 5,
-        "x": 5,
+        "w": 6,
+        "x": 6,
         "y": 8
       },
       "targets": [
@@ -1044,8 +1044,8 @@
       },
       "gridPos": {
         "h": 3,
-        "w": 5,
-        "x": 15,
+        "w": 6,
+        "x": 18,
         "y": 8
       },
       "targets": [
@@ -1119,7 +1119,7 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 5,
+        "h": 3,
         "w": 6,
         "x": 0,
         "y": 11
@@ -1194,7 +1194,7 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 5,
+        "h": 3,
         "w": 6,
         "x": 6,
         "y": 11
@@ -1269,7 +1269,7 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 5,
+        "h": 3,
         "w": 6,
         "x": 12,
         "y": 11
@@ -1336,7 +1336,7 @@
         "uid": "atlas-vm"
       },
       "gridPos": {
-        "h": 5,
+        "h": 3,
         "w": 6,
         "x": 18,
         "y": 11
@@ -1404,9 +1404,9 @@
       },
       "gridPos": {
         "h": 6,
-        "w": 4,
+        "w": 6,
         "x": 0,
-        "y": 16
+        "y": 14
       },
       "targets": [
         {
@@ -1441,7 +1441,8 @@
                 "value": 48
               }
             ]
-          }
+          },
+          "decimals": 2
         },
         "overrides": []
       },
@@ -1477,26 +1478,31 @@
     {
       "id": 41,
       "type": "timeseries",
-      "title": "Ariadne Attempts vs Failures (1h)",
+      "title": "Ariadne Attempts / Warnings / Failures",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
       },
       "gridPos": {
         "h": 6,
-        "w": 8,
-        "x": 4,
-        "y": 16
+        "w": 6,
+        "x": 6,
+        "y": 14
       },
       "targets": [
         {
-          "expr": "sum(increase(ariadne_task_runs_total[1h]))",
+          "expr": "sum(increase(ariadne_task_runs_total[$__interval]))",
           "refId": "A",
           "legendFormat": "Attempts"
         },
         {
-          "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
+          "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)",
           "refId": "B",
+          "legendFormat": "Warnings"
+        },
+        {
+          "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))",
+          "refId": "C",
           "legendFormat": "Failures"
         }
       ],
@@ -1504,7 +1510,38 @@
         "defaults": {
           "unit": "none"
         },
-        "overrides": []
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "Warnings"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "yellow"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "Failures"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "red"
+                }
+              }
+            ]
+          }
+        ]
       },
       "options": {
         "legend": {
@@ -1526,9 +1563,9 @@
       },
       "gridPos": {
         "h": 6,
-        "w": 8,
+        "w": 6,
         "x": 12,
-        "y": 16
+        "y": 14
       },
       "targets": [
         {
@@ -1562,9 +1599,9 @@
       },
       "gridPos": {
         "h": 6,
-        "w": 4,
-        "x": 20,
-        "y": 16
+        "w": 6,
+        "x": 18,
+        "y": 14
       },
       "targets": [
         {
@@ -1638,7 +1675,7 @@
         "h": 9,
         "w": 8,
         "x": 0,
-        "y": 22
+        "y": 20
       },
       "targets": [
         {
@@ -1707,7 +1744,7 @@
         "h": 9,
         "w": 8,
         "x": 8,
-        "y": 22
+        "y": 20
       },
       "targets": [
         {
@@ -1776,7 +1813,7 @@
         "h": 9,
         "w": 8,
         "x": 16,
-        "y": 22
+        "y": 20
       },
       "targets": [
         {
@@ -1845,11 +1882,11 @@
         "h": 12,
         "w": 12,
         "x": 0,
-        "y": 38
+        "y": 36
       },
       "targets": [
         {
-          "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
+          "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
           "refId": "A",
           "legendFormat": "{{node}}"
         }
@@ -1892,11 +1929,11 @@
         "h": 12,
         "w": 12,
         "x": 12,
-        "y": 38
+        "y": 36
       },
       "targets": [
         {
-          "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
+          "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
           "refId": "A",
           "legendFormat": "{{node}}"
         }
@@ -1939,7 +1976,7 @@
         "h": 10,
         "w": 12,
         "x": 0,
-        "y": 50
+        "y": 48
       },
       "targets": [
         {
@@ -1976,7 +2013,7 @@
         "h": 10,
         "w": 12,
         "x": 12,
-        "y": 50
+        "y": 48
       },
       "targets": [
         {
@@ -2013,7 +2050,7 @@
         "h": 10,
         "w": 12,
         "x": 0,
-        "y": 60
+        "y": 58
       },
       "targets": [
         {
@@ -2064,7 +2101,7 @@
         "h": 10,
         "w": 12,
         "x": 12,
-        "y": 60
+        "y": 58
       },
       "targets": [
         {
@@ -2145,7 +2182,7 @@
         "h": 7,
         "w": 8,
         "x": 0,
-        "y": 31
+        "y": 29
       },
       "targets": [
         {
@@ -2189,7 +2226,7 @@
         "h": 7,
         "w": 8,
         "x": 8,
-        "y": 31
+        "y": 29
       },
       "targets": [
         {
@@ -2233,7 +2270,7 @@
         "h": 7,
         "w": 8,
         "x": 16,
-        "y": 31
+        "y": 29
       },
       "targets": [
         {
@@ -2277,7 +2314,7 @@
         "h": 16,
         "w": 12,
         "x": 0,
-        "y": 70
+        "y": 68
       },
       "targets": [
         {
@@ -2325,7 +2362,7 @@
         "h": 16,
         "w": 12,
         "x": 12,
-        "y": 70
+        "y": 68
       },
       "targets": [
         {
diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json
index adab84bb..e36aa1fd 100644
--- a/services/monitoring/dashboards/atlas-pods.json
+++ b/services/monitoring/dashboards/atlas-pods.json
@@ -520,7 +520,7 @@
       },
       "targets": [
         {
-          "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by 
(node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)))))",
+          "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by 
(node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) 
(kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))",
           "refId": "A",
           "instant": true,
           "format": "table"
diff --git a/services/monitoring/grafana-dashboard-jobs.yaml b/services/monitoring/grafana-dashboard-jobs.yaml
index 19e0d4eb..36c12520 100644
--- a/services/monitoring/grafana-dashboard-jobs.yaml
+++ b/services/monitoring/grafana-dashboard-jobs.yaml
@@ -16,20 +16,20 @@ data:
         {
           "id": 1,
           "type": "bargauge",
-          "title": "Ariadne Task Errors (24h)",
+          "title": "Ariadne Task Errors (range)",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
             "h": 7,
-            "w": 6,
+            "w": 8,
             "x": 0,
             "y": 0
           },
           "targets": [
             {
-              "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[24h]))",
+              "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range]))",
               "refId": "A",
               "legendFormat": "{{task}}",
               "instant": true
@@ -90,26 +90,31 @@ data:
         {
           "id": 2,
           "type": "timeseries",
-          "title": "Ariadne Attempts vs Failures (1h)",
+          "title": "Ariadne Attempts / Warnings / Failures",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
             "h": 7,
-            "w": 12,
-            "x": 6,
+            "w": 8,
+            "x": 8,
             "y": 0
           },
           "targets": [
             {
-              "expr": "sum(increase(ariadne_task_runs_total[1h]))",
+              "expr": "sum(increase(ariadne_task_runs_total[$__interval]))",
               "refId": "A",
               "legendFormat": "Attempts"
             },
             {
-              "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
+              "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)",
               "refId": "B",
+              "legendFormat": "Warnings"
+            },
+            {
+              "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))",
+              "refId": "C",
               "legendFormat": "Failures"
             }
           ],
@@ -117,7 +122,38 @@ data:
             "defaults": {
               "unit": "none"
             },
-            "overrides": []
+            "overrides": [
+              {
+                "matcher": {
+                  "id": "byName",
+                  "options": "Warnings"
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "yellow"
+                    }
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byName",
+                  "options": "Failures"
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "red"
+                    }
+                  }
+                ]
+              }
+            ]
           },
           "options": {
             "legend": {
@@ -139,8 +175,8 @@ data:
           },
           "gridPos": {
             "h": 7,
-            "w": 6,
-            "x": 18,
+            "w": 8,
+            "x": 16,
             "y": 0
           },
           "targets": [
@@ -176,7 +212,8 @@ data:
                     "value": 48
                   }
                 ]
-              }
+              },
+              "decimals": 2
             },
             "overrides": []
           },
@@ -225,7 +262,7 @@ data:
           },
           "targets": [
             {
-              "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))",
+              "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)",
               "refId": "A"
             }
           ],
@@ -293,7 +330,7 @@ data:
           },
           "targets": [
             {
-              "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))",
+              "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)",
               "refId": "A"
             }
           ],
@@ -353,7 +390,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)",
+              "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)",
               "refId": "A"
             }
           ],
@@ -586,14 +623,14 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 8,
+            "h": 6,
             "w": 12,
             "x": 0,
-            "y": 11
+            "y": 17
           },
           "targets": [
             {
-              "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600",
+              "expr": "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600",
               "refId": "A",
               "legendFormat": "{{task}}",
               "instant": true
@@ -624,7 +661,8 @@ data:
                     "value": 24
                   }
                 ]
-              }
+              },
+              "decimals": 2
             },
             "overrides": []
           },
@@ -646,7 +684,7 @@ data:
                 "fields": [
                   "Value"
                 ],
-                "order": "desc"
+                "order": "asc"
               }
             }
           ]
@@ -660,14 +698,14 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 8,
+            "h": 6,
             "w": 12,
             "x": 12,
-            "y": 11
+            "y": 17
           },
           "targets": [
             {
-              "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600",
+              "expr": "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600",
               "refId": "A",
               "legendFormat": "{{task}}",
               "instant": true
@@ -698,7 +736,8 @@ data:
                     "value": 48
                   }
                 ]
-              }
+              },
+              "decimals": 2
             },
             "overrides": []
           },
@@ -734,14 +773,14 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 8,
+            "h": 6,
             "w": 12,
             "x": 0,
-            "y": 19
+            "y": 23
           },
           "targets": [
             {
-              "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
+              "expr": "(time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600",
               "refId": "A",
               "legendFormat": "{{namespace}}/{{cronjob}}",
               "instant": true
@@ -772,7 +811,8 @@ data:
                     "value": 48
                   }
                 ]
-              }
+              },
+              "decimals": 2
             },
             "overrides": []
           },
@@ -808,14 +848,14 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 8,
+            "h": 6,
             "w": 12,
             "x": 12,
-            "y": 19
+            "y": 23
           },
           "targets": [
             {
-              "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
+              "expr": "(time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600",
               "refId": "A",
               "legendFormat": "{{namespace}}/{{cronjob}}",
               "instant": true
@@ -846,7 +886,8 @@ data:
                     "value": 48
                   }
                 ]
-              }
+              },
+              "decimals": 2
             },
             "overrides": []
           },
@@ -882,10 +923,10 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 8,
+            "h": 6,
             "w": 12,
             "x": 0,
-            "y": 27
+            "y": 29
           },
           "targets": [
             {
@@ -956,10 +997,10 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 8,
+            "h": 6,
             "w": 12,
             "x": 12,
-            "y": 27
+            "y": 29
           },
           "targets": [
             {
@@ -1033,7 +1074,7 @@ data:
             "h": 6,
             "w": 8,
             "x": 0,
-            "y": 35
+            "y": 11
           },
           "targets": [
             {
@@ -1107,7 +1148,7 @@ data:
             "h": 6,
             "w": 4,
             "x": 8,
-            "y": 35
+            "y": 11
           },
           "targets": [
             {
@@ -1170,7 +1211,7 @@ data:
             "h": 6,
             "w": 12,
             "x": 12,
-            "y": 35
+            "y": 11
           },
           "targets": [
             {
diff --git a/services/monitoring/grafana-dashboard-nodes.yaml b/services/monitoring/grafana-dashboard-nodes.yaml
index f0f1982d..98123b96 100644
--- a/services/monitoring/grafana-dashboard-nodes.yaml
+++ b/services/monitoring/grafana-dashboard-nodes.yaml
@@ -29,7 +29,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})",
+              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})",
               "refId": "A"
             }
           ],
@@ -55,7 +55,7 @@ data:
               "unit": "none",
               "custom": {
                 "displayMode": "auto",
-                "valueSuffix": "/19"
+                "valueSuffix": "/20"
               }
             },
             "overrides": []
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 45969ccf..55196e8f 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -458,14 +458,14 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})",
+              "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})",
               "refId": "A"
             }
           ],
           "fieldConfig": {
             "defaults": {
               "min": 0,
-              "max": 19,
+              "max": 20,
               "thresholds": {
                 "mode": "absolute",
                 "steps": [
@@ -475,15 +475,15 @@ data:
                   },
                   {
                     "color": "orange",
-                    "value": 17
-                  },
-                  {
-                    "color": "yellow",
                     "value": 18
                   },
                   {
-                    "color": "green",
+                    "color": "yellow",
                     "value": 19
+                  },
+                  {
+                    "color": "green",
+                    "value": 20
                   }
                 ]
               }
@@ -805,7 +805,7 @@ data:
           },
           "gridPos": {
             "h": 3,
-            "w": 5,
+            "w": 6,
             "x": 0,
             "y": 8
           },
@@ -872,8 +872,8 @@ data:
           },
           "gridPos": {
             "h": 3,
-            "w": 5,
-            "x": 10,
+            "w": 6,
+            "x": 12,
             "y": 8
           },
           "targets": [
@@ -977,8 +977,8 @@ data:
           },
           "gridPos": {
             "h": 3,
-            "w": 5,
-            "x": 5,
+            "w": 6,
+            "x": 6,
             "y": 8
           },
           "targets": [
@@ -1053,8 +1053,8 @@ data:
           },
           "gridPos": {
             "h": 3,
-            "w": 5,
-            "x": 15,
+            "w": 6,
+            "x": 18,
             "y": 8
           },
           "targets": [
@@ -1128,7 +1128,7 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 5,
+            "h": 3,
             "w": 6,
             "x": 0,
             "y": 11
@@ -1203,7 +1203,7 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 5,
+            "h": 3,
             "w": 6,
             "x": 6,
             "y": 11
@@ -1278,7 +1278,7 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 5,
+            "h": 3,
             "w": 6,
             "x": 12,
             "y": 11
@@ -1345,7 +1345,7 @@ data:
             "uid": "atlas-vm"
           },
           "gridPos": {
-            "h": 5,
+            "h": 3,
             "w": 6,
             "x": 18,
             "y": 11
@@ -1413,9 +1413,9 @@ data:
           },
           "gridPos": {
             "h": 6,
-            "w": 4,
+            "w": 6,
             "x": 0,
-            "y": 16
+            "y": 14
           },
           "targets": [
             {
@@ -1450,7 +1450,8 @@ data:
                     "value": 48
                   }
                 ]
-              }
+              },
+              "decimals": 2
             },
             "overrides": []
           },
@@ -1486,26 +1487,31 @@ data:
         {
           "id": 41,
           "type": "timeseries",
-          "title": "Ariadne Attempts vs Failures (1h)",
+          "title": "Ariadne Attempts / Warnings / Failures",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
           },
           "gridPos": {
             "h": 6,
-            "w": 8,
-            "x": 4,
-            "y": 16
+            "w": 6,
+            "x": 6,
+            "y": 14
           },
           "targets": [
             {
-              "expr": "sum(increase(ariadne_task_runs_total[1h]))",
+              "expr": "sum(increase(ariadne_task_runs_total[$__interval]))",
               "refId": "A",
               "legendFormat": "Attempts"
             },
             {
-              "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
+              "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)",
               "refId": "B",
+              "legendFormat": "Warnings"
+            },
+            {
+              "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))",
+              "refId": "C",
               "legendFormat": "Failures"
             }
           ],
@@ -1513,7 +1519,38 @@ data:
             "defaults": {
               "unit": "none"
             },
-            "overrides": []
+            "overrides": [
+              {
+                "matcher": {
+                  "id": "byName",
+                  "options": "Warnings"
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "yellow"
+                    }
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byName",
+                  "options": "Failures"
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "red"
+                    }
+                  }
+                ]
+              }
+            ]
           },
           "options": {
             "legend": {
@@ -1535,9 +1572,9 @@ data:
           },
           "gridPos": {
             "h": 6,
-            "w": 8,
+            "w": 6,
             "x": 12,
-            "y": 16
+            "y": 14
           },
           "targets": [
             {
@@ -1571,9 +1608,9 @@ data:
           },
           "gridPos": {
             "h": 6,
-            "w": 4,
-            "x": 20,
-            "y": 16
+            "w": 6,
+            "x": 18,
+            "y": 14
           },
           "targets": [
             {
@@ -1647,7 +1684,7 @@ data:
             "h": 9,
             "w": 8,
             "x": 0,
-            "y": 22
+            "y": 20
           },
           "targets": [
             {
@@ -1716,7 +1753,7 @@ data:
             "h": 9,
             "w": 8,
             "x": 8,
-            "y": 22
+            "y": 20
           },
           "targets": [
             {
@@ -1785,7 +1822,7 @@ data:
             "h": 9,
             "w": 8,
             "x": 16,
-            "y": 22
+            "y": 20
           },
           "targets": [
             {
@@ -1854,11 +1891,11 @@ data:
             "h": 12,
             "w": 12,
             "x": 0,
-            "y": 38
+            "y": 36
           },
           "targets": [
             {
-              "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
+              "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
               "refId": "A",
               "legendFormat": "{{node}}"
             }
@@ -1901,11 +1938,11 @@ data:
             "h": 12,
             "w": 12,
             "x": 12,
-            "y": 38
+            "y": 36
           },
           "targets": [
             {
-              "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
+              "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
               "refId": "A",
               "legendFormat": "{{node}}"
             }
@@ -1948,7 +1985,7 @@ data:
             "h": 10,
             "w": 12,
             "x": 0,
-            "y": 50
+            "y": 48
           },
           "targets": [
             {
@@ -1985,7 +2022,7 @@ data:
             "h": 10,
             "w": 12,
             "x": 12,
-            "y": 50
+            "y": 48
           },
           "targets": [
             {
@@ -2022,7 +2059,7 @@ data:
             "h": 10,
             "w": 12,
             "x": 0,
-            "y": 60
+            "y": 58
           },
           "targets": [
             {
@@ -2073,7 +2110,7 @@ data:
             "h": 10,
             "w": 12,
             "x": 12,
-            "y": 60
+            "y": 58
           },
           "targets": [
             {
@@ -2154,7 +2191,7 @@ data:
             "h": 7,
             "w": 8,
             "x": 0,
-            "y": 31
+            "y": 29
           },
           "targets": [
             {
@@ -2198,7 +2235,7 @@ data:
             "h": 7,
             "w": 8,
             "x": 8,
-            "y": 31
+            "y": 29
           },
           "targets": [
             {
@@ -2242,7 +2279,7 @@ data:
             "h": 7,
             "w": 8,
             "x": 16,
-            "y": 31
+            "y": 29
           },
           "targets": [
             {
@@ -2286,7 +2323,7 @@ data:
             "h": 16,
             "w": 12,
             "x": 0,
-            "y": 70
+            "y": 68
           },
           "targets": [
             {
@@ -2334,7 +2371,7 @@ data:
             "h": 16,
             "w": 12,
             "x": 12,
-            "y": 70
+            "y": 68
           },
           "targets": [
             {
diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml
index f537d4ca..62730238 100644
--- a/services/monitoring/grafana-dashboard-pods.yaml
+++ b/services/monitoring/grafana-dashboard-pods.yaml
@@ -529,7 +529,7 @@ data:
           },
           "targets": [
             {
-              "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by 
(node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)))))",
+              "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by 
(node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) 
(kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))",
               "refId": "A",
               "instant": true,
               "format": "table"

From 9db260e482a8249dd86b54efb71029ee0d8f4d69 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 15:01:02 -0300
Subject: [PATCH 089/416] monitoring: tighten jobs/overview ordering

---
 scripts/dashboards_render_atlas.py            | 18 +++++++++-
 .../monitoring/dashboards/atlas-jobs.json     |  2 +-
 .../monitoring/dashboards/atlas-overview.json | 36 +++++++++++++++++--
 .../monitoring/grafana-dashboard-jobs.yaml    |  2 +-
 .../grafana-dashboard-overview.yaml           | 36 +++++++++++++++++--
 5 files changed, 87 insertions(+), 7 deletions(-)

diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index 3d581c70..c3f36550 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -560,6 +560,7 @@ def timeseries_panel(
     grid,
     *,
     unit="none",
+    max_value=None,
     legend=None,
     legend_display="table",
     legend_placement="bottom",
@@ -584,6 +585,8 @@ def timeseries_panel(
             "tooltip": {"mode": "multi"},
         },
     }
+    if max_value is not None:
+        panel["fieldConfig"]["defaults"]["max"] = max_value
     if legend:
         panel["targets"][0]["legendFormat"] = legend
     if legend_calcs:
@@ -742,6 +745,7 @@ def bargauge_panel(
     thresholds=None,
     decimals=None,
     instant=False,
+    overrides=None,
 ):
     """Return a bar gauge panel with label-aware reduction."""
     panel = {
@@ -786,6 +790,8 @@ def bargauge_panel(
             },
         },
     }
+    if overrides:
+        panel["fieldConfig"]["overrides"].extend(overrides)
     if decimals is not None:
         panel["fieldConfig"]["defaults"]["decimals"] = decimals
     if links:
@@ -1197,6 +1203,7 @@ def build_overview():
             ARIADNE_TEST_SUCCESS_RATE,
             {"h": 6, "w": 6, "x": 12, "y": 14},
             unit="percent",
+            max_value=100,
             legend=None,
             legend_display="list",
         )
@@ -1210,6 +1217,16 @@ def build_overview():
             unit="none",
             instant=True,
             legend="{{result}}",
+            overrides=[
+                {
+                    "matcher": {"id": "byName", "options": "error"},
+                    "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}}],
+                },
+                {
+                    "matcher": {"id": "byName", "options": "failed"},
+                    "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}],
+                },
+            ],
             thresholds={
                 "mode": "absolute",
                 "steps": [
@@ -2462,7 +2479,6 @@ def build_jobs_dashboard():
             instant=True,
             legend="{{task}}",
             thresholds=recent_error_thresholds,
-            sort_order="asc",
             decimals=2,
         )
     )
diff --git a/services/monitoring/dashboards/atlas-jobs.json b/services/monitoring/dashboards/atlas-jobs.json
index c70e9c0f..810b3b35 100644
--- a/services/monitoring/dashboards/atlas-jobs.json
+++ b/services/monitoring/dashboards/atlas-jobs.json
@@ -675,7 +675,7 @@
             "fields": [
               "Value"
             ],
-            "order": "asc"
+            "order": "desc"
           }
         }
       ]
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 5acc2a3a..3feb5311 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -1575,7 +1575,8 @@
       ],
       "fieldConfig": {
         "defaults": {
-          "unit": "percent"
+          "unit": "percent",
+          "max": 100
         },
         "overrides": []
       },
@@ -1638,7 +1639,38 @@
             ]
           }
         },
-        "overrides": []
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "error"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "yellow"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "failed"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "red"
+                }
+              }
+            ]
+          }
+        ]
       },
       "options": {
         "displayMode": "gradient",
diff --git a/services/monitoring/grafana-dashboard-jobs.yaml b/services/monitoring/grafana-dashboard-jobs.yaml
index 36c12520..279d959f 100644
--- a/services/monitoring/grafana-dashboard-jobs.yaml
+++ b/services/monitoring/grafana-dashboard-jobs.yaml
@@ -684,7 +684,7 @@ data:
                 "fields": [
                   "Value"
                 ],
-                "order": "asc"
+                "order": "desc"
               }
             }
           ]
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 55196e8f..66b6da0a 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -1584,7 +1584,8 @@ data:
           ],
           "fieldConfig": {
             "defaults": {
-              "unit": "percent"
+              "unit": "percent",
+              "max": 100
             },
             "overrides": []
           },
@@ -1647,7 +1648,38 @@ data:
                 ]
               }
             },
-            "overrides": []
+            "overrides": [
+              {
+                "matcher": {
+                  "id": "byName",
+                  "options": "error"
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "yellow"
+                    }
+                  }
+                ]
+              },
+              {
+                "matcher": {
+                  "id": "byName",
+                  "options": "failed"
+                },
+                "properties": [
+                  {
+                    "id": "color",
+                    "value": {
+                      "mode": "fixed",
+                      "fixedColor": "red"
+                    }
+                  }
+                ]
+              }
+            ]
           },
           "options": {
             "displayMode": "gradient",

From e0308b89fdbaa3eff67389e8ed00e630be8fe325 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 21 Jan 2026 15:12:53 -0300
Subject: [PATCH 090/416] monitoring: enforce sorted job lists

---
 scripts/dashboards_render_atlas.py            | 24 ++++++++------
 .../monitoring/dashboards/atlas-jobs.json     | 31 ++++++++-----------
 .../monitoring/dashboards/atlas-overview.json | 21 +++++--------
 .../monitoring/dashboards/atlas-pods.json     |  2 +-
 .../monitoring/grafana-dashboard-jobs.yaml    | 31 ++++++++-----------
 .../grafana-dashboard-overview.yaml           | 21 +++++--------
 .../monitoring/grafana-dashboard-pods.yaml    |  2 +-
 7 files changed, 58 insertions(+), 74 deletions(-)

diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index c3f36550..1f284895 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -748,6 +748,12 @@ def bargauge_panel(
     overrides=None,
 ):
     """Return a bar gauge panel with label-aware reduction."""
+    cleaned_expr = expr.strip()
+    if not cleaned_expr.startswith(("sort(", "sort_desc(")):
+        if sort_order == "desc":
+            expr = f"sort_desc({expr})"
+        elif sort_order == "asc":
+            expr = f"sort({expr})"
     panel = {
         "id": panel_id,
         "type": "bargauge",
@@ -1165,21 +1171,20 @@ def build_overview():
         {
             "id": 41,
             "type": "timeseries",
-            "title": "Ariadne Attempts / Warnings / Failures",
+            "title": "Ariadne Attempts / Failures",
             "datasource": PROM_DS,
             "gridPos": {"h": 6, "w": 6, "x": 6, "y": 14},
             "targets": [
                 {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"},
-                {"expr": ARIADNE_TASK_WARNINGS_SERIES, "refId": "B", "legendFormat": "Warnings"},
-                {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "C", "legendFormat": "Failures"},
+                {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"},
             ],
             "fieldConfig": {
                 "defaults": {"unit": "none"},
                 "overrides": [
                     {
-                        "matcher": {"id": "byName", "options": "Warnings"},
+                        "matcher": {"id": "byName", "options": "Attempts"},
                         "properties": [
-                            {"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}}
+                            {"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}}
                         ],
                     },
                     {
@@ -2361,21 +2366,20 @@ def build_jobs_dashboard():
         {
             "id": 2,
             "type": "timeseries",
-            "title": "Ariadne Attempts / Warnings / Failures",
+            "title": "Ariadne Attempts / Failures",
             "datasource": PROM_DS,
             "gridPos": {"h": 7, "w": 8, "x": 8, "y": 0},
             "targets": [
                 {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"},
-                {"expr": ARIADNE_TASK_WARNINGS_SERIES, "refId": "B", "legendFormat": "Warnings"},
-                {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "C", "legendFormat": "Failures"},
+                {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"},
             ],
             "fieldConfig": {
                 "defaults": {"unit": "none"},
                 "overrides": [
                     {
-                        "matcher": {"id": "byName", "options": "Warnings"},
+                        "matcher": {"id": "byName", "options": "Attempts"},
                         "properties": [
-                            {"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}}
+                            {"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}}
                         ],
                     },
                     {
diff --git a/services/monitoring/dashboards/atlas-jobs.json b/services/monitoring/dashboards/atlas-jobs.json
index 810b3b35..37b888d8 100644
--- a/services/monitoring/dashboards/atlas-jobs.json
+++ b/services/monitoring/dashboards/atlas-jobs.json
@@ -20,7 +20,7 @@
       },
       "targets": [
         {
-          "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range]))",
+          "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))",
           "refId": "A",
           "legendFormat": "{{task}}",
           "instant": true
@@ -81,7 +81,7 @@
     {
       "id": 2,
       "type": "timeseries",
-      "title": "Ariadne Attempts / Warnings / Failures",
+      "title": "Ariadne Attempts / Failures",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -98,14 +98,9 @@
           "refId": "A",
           "legendFormat": "Attempts"
         },
-        {
-          "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)",
-          "refId": "B",
-          "legendFormat": "Warnings"
-        },
         {
           "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))",
-          "refId": "C",
+          "refId": "B",
           "legendFormat": "Failures"
         }
       ],
@@ -117,14 +112,14 @@
           {
             "matcher": {
               "id": "byName",
-              "options": "Warnings"
+              "options": "Attempts"
             },
             "properties": [
               {
                 "id": "color",
                 "value": {
                   "mode": "fixed",
-                  "fixedColor": "yellow"
+                  "fixedColor": "green"
                 }
               }
             ]
@@ -172,7 +167,7 @@
       },
       "targets": [
         {
-          "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})",
+          "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))",
           "refId": "A",
           "legendFormat": "{{namespace}}/{{pod}}",
           "instant": true
@@ -621,7 +616,7 @@
       },
       "targets": [
         {
-          "expr": "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600",
+          "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)",
           "refId": "A",
           "legendFormat": "{{task}}",
           "instant": true
@@ -696,7 +691,7 @@
       },
       "targets": [
         {
-          "expr": "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600",
+          "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)",
           "refId": "A",
           "legendFormat": "{{task}}",
           "instant": true
@@ -771,7 +766,7 @@
       },
       "targets": [
         {
-          "expr": "(time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600",
+          "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)",
           "refId": "A",
           "legendFormat": "{{namespace}}/{{cronjob}}",
           "instant": true
@@ -846,7 +841,7 @@
       },
       "targets": [
         {
-          "expr": "(time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600",
+          "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)",
           "refId": "A",
           "legendFormat": "{{namespace}}/{{cronjob}}",
           "instant": true
@@ -921,7 +916,7 @@
       },
       "targets": [
         {
-          "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
+          "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))",
           "refId": "A",
           "legendFormat": "{{task}}",
           "instant": true
@@ -995,7 +990,7 @@
       },
       "targets": [
         {
-          "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d]))",
+          "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))",
           "refId": "A",
           "legendFormat": "{{task}}",
           "instant": true
@@ -1069,7 +1064,7 @@
       },
       "targets": [
         {
-          "expr": "ariadne_access_requests_total",
+          "expr": "sort_desc(ariadne_access_requests_total)",
           "refId": "A",
           "legendFormat": "{{status}}",
           "instant": true
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 3feb5311..78744dac 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -1410,7 +1410,7 @@
       },
       "targets": [
         {
-          "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})",
+          "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))",
           "refId": "A",
           "legendFormat": "{{namespace}}/{{pod}}",
           "instant": true
@@ -1478,7 +1478,7 @@
     {
       "id": 41,
       "type": "timeseries",
-      "title": "Ariadne Attempts / Warnings / Failures",
+      "title": "Ariadne Attempts / Failures",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -1495,14 +1495,9 @@
           "refId": "A",
           "legendFormat": "Attempts"
         },
-        {
-          "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)",
-          "refId": "B",
-          "legendFormat": "Warnings"
-        },
         {
           "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))",
-          "refId": "C",
+          "refId": "B",
           "legendFormat": "Failures"
         }
       ],
@@ -1514,14 +1509,14 @@
           {
             "matcher": {
               "id": "byName",
-              "options": "Warnings"
+              "options": "Attempts"
             },
             "properties": [
               {
                 "id": "color",
                 "value": {
                   "mode": "fixed",
-                  "fixedColor": "yellow"
+                  "fixedColor": "green"
                 }
               }
             ]
@@ -1606,7 +1601,7 @@
       },
       "targets": [
         {
-          "expr": "sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h]))",
+          "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))",
           "refId": "A",
           "legendFormat": "{{result}}",
           "instant": true
@@ -2137,7 +2132,7 @@
       },
       "targets": [
         {
-          "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
+          "expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))",
           "refId": "A",
           "legendFormat": "{{node}}",
           "instant": true
@@ -2398,7 +2393,7 @@
       },
       "targets": [
         {
-          "expr": "topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+          "expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
           "refId": "A",
           "legendFormat": "{{node}}"
         }
diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json
index e36aa1fd..0c8104c9 100644
--- a/services/monitoring/dashboards/atlas-pods.json
+++ b/services/monitoring/dashboards/atlas-pods.json
@@ -439,7 +439,7 @@
       },
       "targets": [
         {
-          "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
+          "expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))",
           "refId": "A",
           "legendFormat": "{{node}}",
           "instant": true
diff --git a/services/monitoring/grafana-dashboard-jobs.yaml b/services/monitoring/grafana-dashboard-jobs.yaml
index 279d959f..b16c9cbb 100644
--- a/services/monitoring/grafana-dashboard-jobs.yaml
+++ b/services/monitoring/grafana-dashboard-jobs.yaml
@@ -29,7 +29,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range]))",
+              "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))",
               "refId": "A",
               "legendFormat": "{{task}}",
               "instant": true
@@ -90,7 +90,7 @@ data:
         {
           "id": 2,
           "type": "timeseries",
-          "title": "Ariadne Attempts / Warnings / Failures",
+          "title": "Ariadne Attempts / Failures",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -107,14 +107,9 @@ data:
               "refId": "A",
               "legendFormat": "Attempts"
             },
-            {
-              "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)",
-              "refId": "B",
-              "legendFormat": "Warnings"
-            },
             {
               "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))",
-              "refId": "C",
+              "refId": "B",
               "legendFormat": "Failures"
             }
           ],
@@ -126,14 +121,14 @@ data:
               {
                 "matcher": {
                   "id": "byName",
-                  "options": "Warnings"
+                  "options": "Attempts"
                 },
                 "properties": [
                   {
                     "id": "color",
                     "value": {
                       "mode": "fixed",
-                      "fixedColor": "yellow"
+                      "fixedColor": "green"
                     }
                   }
                 ]
@@ -181,7 +176,7 @@ data:
           },
           "targets": [
             {
-              "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})",
+              "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))",
               "refId": "A",
               "legendFormat": "{{namespace}}/{{pod}}",
               "instant": true
@@ -630,7 +625,7 @@ data:
           },
           "targets": [
             {
-              "expr": "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600",
+              "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)",
               "refId": "A",
               "legendFormat": "{{task}}",
               "instant": true
@@ -705,7 +700,7 @@ data:
           },
           "targets": [
             {
-              "expr": "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600",
+              "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)",
               "refId": "A",
               "legendFormat": "{{task}}",
               "instant": true
@@ -780,7 +775,7 @@ data:
           },
           "targets": [
             {
-              "expr": "(time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600",
+              "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)",
               "refId": "A",
               "legendFormat": "{{namespace}}/{{cronjob}}",
               "instant": true
@@ -855,7 +850,7 @@ data:
           },
           "targets": [
             {
-              "expr": "(time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600",
+              "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)",
               "refId": "A",
               "legendFormat": "{{namespace}}/{{cronjob}}",
               "instant": true
@@ -930,7 +925,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
+              "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))",
               "refId": "A",
               "legendFormat": "{{task}}",
               "instant": true
@@ -1004,7 +999,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d]))",
+              "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))",
               "refId": "A",
               "legendFormat": "{{task}}",
               "instant": true
@@ -1078,7 +1073,7 @@ data:
           },
           "targets": [
             {
-              "expr": "ariadne_access_requests_total",
+              "expr": "sort_desc(ariadne_access_requests_total)",
               "refId": "A",
               "legendFormat": "{{status}}",
               "instant": true
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 66b6da0a..fa19911f 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -1419,7 +1419,7 @@ data:
           },
           "targets": [
             {
-              "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})",
+              "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))",
               "refId": "A",
               "legendFormat": "{{namespace}}/{{pod}}",
               "instant": true
@@ -1487,7 +1487,7 @@ data:
         {
           "id": 41,
           "type": "timeseries",
-          "title": "Ariadne Attempts / Warnings / Failures",
+          "title": "Ariadne Attempts / Failures",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -1504,14 +1504,9 @@ data:
               "refId": "A",
               "legendFormat": "Attempts"
             },
-            {
-              "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)",
-              "refId": "B",
-              "legendFormat": "Warnings"
-            },
             {
               "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))",
-              "refId": "C",
+              "refId": "B",
               "legendFormat": "Failures"
             }
           ],
@@ -1523,14 +1518,14 @@ data:
               {
                 "matcher": {
                   "id": "byName",
-                  "options": "Warnings"
+                  "options": "Attempts"
                 },
                 "properties": [
                   {
                     "id": "color",
                     "value": {
                       "mode": "fixed",
-                      "fixedColor": "yellow"
+                      "fixedColor": "green"
                     }
                   }
                 ]
@@ -1615,7 +1610,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h]))",
+              "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))",
               "refId": "A",
               "legendFormat": "{{result}}",
               "instant": true
@@ -2146,7 +2141,7 @@ data:
           },
           "targets": [
             {
-              "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
+              "expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))",
               "refId": "A",
               "legendFormat": "{{node}}",
               "instant": true
@@ -2407,7 +2402,7 @@ data:
           },
           "targets": [
             {
-              "expr": "topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
+              "expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
               "refId": "A",
               "legendFormat": "{{node}}"
             }
diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml
index 62730238..1461eac6 100644
--- a/services/monitoring/grafana-dashboard-pods.yaml
+++ b/services/monitoring/grafana-dashboard-pods.yaml
@@ -448,7 +448,7 @@ data:
           },
           "targets": [
             {
-              "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
+              "expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))",
               "refId": "A",
               "legendFormat": "{{node}}",
               "instant": true

From 03ad3374e1f2dd77c77f2174a14a1864fff7f172 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Wed, 21 Jan 2026 20:04:15 +0000
Subject: [PATCH 091/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 78f5e685..e43f30ed 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-108 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-109 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-108 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 42ac893378a03b2cba453d04b7894f9ce07fe411 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Wed, 21 Jan 2026 20:05:15 +0000
Subject: [PATCH 092/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index e43f30ed..ee57a11e 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-109 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-108 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-109 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 9a5421f5f956b4f3ed9e5ab510e794d979b738e1 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Wed, 21 Jan 2026 20:33:18 +0000
Subject: [PATCH 093/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index ee57a11e..60180873 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-109 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-111 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-109 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From b748f6de2bb40c73cd4d16c266e9a1a13113c2b9 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Wed, 21 Jan 2026 20:34:18 +0000
Subject: [PATCH 094/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 60180873..87cb6350 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-111 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-109 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-111 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 88e834cbe82db5006fa714335036c3a02d48beaf Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Wed, 21 Jan 2026 22:05:29 +0000
Subject: [PATCH 095/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 87cb6350..9d4896b3 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-111 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-112 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-111 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From ea9a59d02daa14fc1edef4d321cdc0447f62cc3e Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Wed, 21 Jan 2026 22:07:29 +0000
Subject: [PATCH 096/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 9d4896b3..8ba3cb03 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-112 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-111 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-112 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 922510ec4a97aeb198c9f5773f7d9836d437beb5 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Wed, 21 Jan 2026 22:23:44 +0000
Subject: [PATCH 097/416] chore(maintenance): automated image update

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 6cb2acd4..9b78f342 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -25,7 +25,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-17 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-18 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From 89b476eac3315914ec026d672b42dbf357b5b469 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Wed, 21 Jan 2026 22:30:31 +0000
Subject: [PATCH 098/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 8ba3cb03..36decfa0 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-112 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-113 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-112 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 089548e2e3f1d64c3f0a96cf27e4865f05158a4a Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Wed, 21 Jan 2026 22:32:31 +0000
Subject: [PATCH 099/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 36decfa0..9aa6d820 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-113 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-112 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-113 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From ea3666f2c3a067d0d75c2a6005e0f818d6692ea6 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Wed, 21 Jan 2026 22:52:46 +0000
Subject: [PATCH 100/416] chore(maintenance): automated image update

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 9b78f342..6c5ff2ec 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -25,7 +25,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-18 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-19 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From 9261415175c10b00f712918908af9b38d0a6b9ea Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Wed, 21 Jan 2026 22:55:34 +0000
Subject: [PATCH 101/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 9aa6d820..52341a72 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-113 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-114 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-113 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From dbe54c795f651cb3344e531b0d605e52a3f74369 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Wed, 21 Jan 2026 22:56:34 +0000
Subject: [PATCH 102/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 52341a72..e133abec 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-114 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-113 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-114 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 78300e028ee2b32a45557af035dcc764d2cd5e7d Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Wed, 21 Jan 2026 23:24:37 +0000
Subject: [PATCH 103/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index e133abec..7e381abc 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-114 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-114 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-115 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From d033e4a0e310ac8889d7f0d17414aeea4052b2fc Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Wed, 21 Jan 2026 23:24:40 +0000
Subject: [PATCH 104/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 7e381abc..58688918 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-114 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-115 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-115 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 7d399fd01bb15441cc996e7967ac9eb14df56f7a Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Wed, 21 Jan 2026 23:47:39 +0000
Subject: [PATCH 105/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 58688918..6f195140 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-115 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-116 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-115 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 0a6fa06fca6300933fb5bedd08f01d1385c8302e Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Wed, 21 Jan 2026 23:48:39 +0000
Subject: [PATCH 106/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 6f195140..94ccbced 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-116 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-115 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-116 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From fb66469e576c7ba7426aff5af3e9c7a170120579 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 00:16:42 +0000
Subject: [PATCH 107/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 94ccbced..e0137923 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-116 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-117 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-116 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 1cc241a8ea6d749cca2761b67bd9ec6cac6a6dc4 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 00:17:42 +0000
Subject: [PATCH 108/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index e0137923..d4a8429f 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-117 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-116 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-117 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From f8aa085326d979317cf01631a8ac8aa336a5c18b Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 00:48:58 +0000
Subject: [PATCH 109/416] chore(maintenance): automated image update

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 6c5ff2ec..84759a4d 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -25,7 +25,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-19 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-20 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From 7e3404b53853e16e0c0b098029dc005735c1c607 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 00:59:59 +0000
Subject: [PATCH 110/416] chore(maintenance): automated image update

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 84759a4d..1f1c7316 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -25,7 +25,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-20 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-21 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From 58c2af9e853c69cbaa131a16c3d91edb4954f6c4 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 05:37:20 +0000
Subject: [PATCH 111/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index d4a8429f..db933330 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-117 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-118 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-117 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From c78b156cf5a68a725b0a50818e004942336036de Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 05:38:20 +0000
Subject: [PATCH 112/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index db933330..8e945e01 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-118 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-117 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-118 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 01e3b85321c1ab506066c47ab51fb23bd8682506 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 05:40:21 +0000
Subject: [PATCH 113/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 8e945e01..bf79e8bb 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-118 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-118 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From c4ad908edf0e6e393ea570988b530f7af42820a2 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 05:41:20 +0000
Subject: [PATCH 114/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index bf79e8bb..192ad7e3 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-118 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From ee4af80e15c11573a3bf22a5a15843a25cf96045 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Thu, 22 Jan 2026 03:15:19 -0300
Subject: [PATCH 115/416] jenkins: use shared harbor creds when present

---
 services/jenkins/deployment.yaml                   | 6 ++++++
 services/vault/scripts/vault_k8s_auth_configure.sh | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml
index 0b62ee09..0dc76afd 100644
--- a/services/jenkins/deployment.yaml
+++ b/services/jenkins/deployment.yaml
@@ -34,6 +34,12 @@ spec:
           HARBOR_ROBOT_USERNAME={{ .Data.data.username }}
           HARBOR_ROBOT_PASSWORD={{ .Data.data.password }}
           {{ end }}
+          {{ with secret "kv/data/atlas/shared/harbor-pull" }}
+          {{- if and .Data.data.username .Data.data.password }}
+          HARBOR_ROBOT_USERNAME={{ .Data.data.username }}
+          HARBOR_ROBOT_PASSWORD={{ .Data.data.password }}
+          {{- end }}
+          {{ end }}
           {{ with secret "kv/data/atlas/jenkins/gitea-pat" }}
           GITEA_PAT_USERNAME={{ .Data.data.username }}
           GITEA_PAT_TOKEN={{ .Data.data.token }}
diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh
index bc03cf4c..00fa567c 100644
--- a/services/vault/scripts/vault_k8s_auth_configure.sh
+++ b/services/vault/scripts/vault_k8s_auth_configure.sh
@@ -219,7 +219,7 @@ write_policy_and_role "nextcloud" "nextcloud" "nextcloud-vault" \
 write_policy_and_role "comms" "comms" "comms-vault,atlasbot" \
   "comms/* shared/chat-ai-keys-runtime shared/harbor-pull" ""
 write_policy_and_role "jenkins" "jenkins" "jenkins" \
-  "jenkins/*" ""
+  "jenkins/* shared/harbor-pull" ""
 write_policy_and_role "monitoring" "monitoring" "monitoring-vault-sync" \
   "monitoring/* shared/postmark-relay shared/harbor-pull" ""
 write_policy_and_role "logging" "logging" "logging-vault-sync" \

From 096bb329e69a47ffcf11946cd49a92ec9873626d Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Thu, 22 Jan 2026 04:45:24 -0300
Subject: [PATCH 116/416] jenkins: sync harbor pull secret from vault

---
 services/jenkins/kustomization.yaml           |  3 ++
 services/jenkins/secretproviderclass.yaml     | 21 ++++++++++++
 services/jenkins/vault-serviceaccount.yaml    |  6 ++++
 services/jenkins/vault-sync-deployment.yaml   | 34 +++++++++++++++++++
 .../vault/scripts/vault_k8s_auth_configure.sh |  2 +-
 5 files changed, 65 insertions(+), 1 deletion(-)
 create mode 100644 services/jenkins/secretproviderclass.yaml
 create mode 100644 services/jenkins/vault-serviceaccount.yaml
 create mode 100644 services/jenkins/vault-sync-deployment.yaml

diff --git a/services/jenkins/kustomization.yaml b/services/jenkins/kustomization.yaml
index aab859ab..df519685 100644
--- a/services/jenkins/kustomization.yaml
+++ b/services/jenkins/kustomization.yaml
@@ -5,11 +5,14 @@ namespace: jenkins
 resources:
   - namespace.yaml
   - serviceaccount.yaml
+  - vault-serviceaccount.yaml
   - pvc.yaml
   - cache-pvc.yaml
   - plugins-pvc.yaml
   - configmap-jcasc.yaml
   - configmap-plugins.yaml
+  - secretproviderclass.yaml
+  - vault-sync-deployment.yaml
   - deployment.yaml
   - service.yaml
   - ingress.yaml
diff --git a/services/jenkins/secretproviderclass.yaml b/services/jenkins/secretproviderclass.yaml
new file mode 100644
index 00000000..a9d9dd50
--- /dev/null
+++ b/services/jenkins/secretproviderclass.yaml
@@ -0,0 +1,21 @@
+# services/jenkins/secretproviderclass.yaml
+apiVersion: secrets-store.csi.x-k8s.io/v1
+kind: SecretProviderClass
+metadata:
+  name: jenkins-vault
+  namespace: jenkins
+spec:
+  provider: vault
+  parameters:
+    vaultAddress: "http://vault.vault.svc.cluster.local:8200"
+    roleName: "jenkins"
+    objects: |
+      - objectName: "harbor-pull__dockerconfigjson"
+        secretPath: "kv/data/atlas/shared/harbor-pull"
+        secretKey: "dockerconfigjson"
+  secretObjects:
+    - secretName: harbor-bstein-robot
+      type: kubernetes.io/dockerconfigjson
+      data:
+        - objectName: harbor-pull__dockerconfigjson
+          key: .dockerconfigjson
diff --git a/services/jenkins/vault-serviceaccount.yaml b/services/jenkins/vault-serviceaccount.yaml
new file mode 100644
index 00000000..8d314003
--- /dev/null
+++ b/services/jenkins/vault-serviceaccount.yaml
@@ -0,0 +1,6 @@
+# services/jenkins/vault-serviceaccount.yaml
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: jenkins-vault-sync
+  namespace: jenkins
diff --git a/services/jenkins/vault-sync-deployment.yaml b/services/jenkins/vault-sync-deployment.yaml
new file mode 100644
index 00000000..6de64f9e
--- /dev/null
+++ b/services/jenkins/vault-sync-deployment.yaml
@@ -0,0 +1,34 @@
+# services/jenkins/vault-sync-deployment.yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: jenkins-vault-sync
+  namespace: jenkins
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: jenkins-vault-sync
+  template:
+    metadata:
+      labels:
+        app: jenkins-vault-sync
+    spec:
+      serviceAccountName: jenkins-vault-sync
+      containers:
+        - name: sync
+          image: alpine:3.20
+          command: ["/bin/sh", "-c"]
+          args:
+            - "sleep infinity"
+          volumeMounts:
+            - name: vault-secrets
+              mountPath: /vault/secrets
+              readOnly: true
+      volumes:
+        - name: vault-secrets
+          csi:
+            driver: secrets-store.csi.k8s.io
+            readOnly: true
+            volumeAttributes:
+              secretProviderClass: jenkins-vault
diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh
index 00fa567c..a956e0e5 100644
--- a/services/vault/scripts/vault_k8s_auth_configure.sh
+++ b/services/vault/scripts/vault_k8s_auth_configure.sh
@@ -218,7 +218,7 @@ write_policy_and_role "nextcloud" "nextcloud" "nextcloud-vault" \
   "nextcloud/* shared/keycloak-admin shared/postmark-relay" ""
 write_policy_and_role "comms" "comms" "comms-vault,atlasbot" \
   "comms/* shared/chat-ai-keys-runtime shared/harbor-pull" ""
-write_policy_and_role "jenkins" "jenkins" "jenkins" \
+write_policy_and_role "jenkins" "jenkins" "jenkins,jenkins-vault-sync" \
   "jenkins/* shared/harbor-pull" ""
 write_policy_and_role "monitoring" "monitoring" "monitoring-vault-sync" \
   "monitoring/* shared/postmark-relay shared/harbor-pull" ""

From daf8be2d43fb4468ebf9c715760a3c43a7ad16c6 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Thu, 22 Jan 2026 04:47:50 -0300
Subject: [PATCH 117/416] vault: unsuspend k8s auth config cronjob

---
 services/vault/k8s-auth-config-cronjob.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/vault/k8s-auth-config-cronjob.yaml b/services/vault/k8s-auth-config-cronjob.yaml
index e7cca14e..43da16b4 100644
--- a/services/vault/k8s-auth-config-cronjob.yaml
+++ b/services/vault/k8s-auth-config-cronjob.yaml
@@ -8,7 +8,7 @@ metadata:
     atlas.bstein.dev/glue: "true"
 spec:
   schedule: "*/15 * * * *"
-  suspend: true
+  suspend: false
   concurrencyPolicy: Forbid
   successfulJobsHistoryLimit: 1
   failedJobsHistoryLimit: 3

From abb39d43281b9fdebf723a381f09d12dd9f59b82 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Thu, 22 Jan 2026 10:56:27 -0300
Subject: [PATCH 118/416] jenkins: pin vault sync to worker nodes

---
 services/jenkins/vault-sync-deployment.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/services/jenkins/vault-sync-deployment.yaml b/services/jenkins/vault-sync-deployment.yaml
index 6de64f9e..6abcacef 100644
--- a/services/jenkins/vault-sync-deployment.yaml
+++ b/services/jenkins/vault-sync-deployment.yaml
@@ -15,6 +15,9 @@ spec:
         app: jenkins-vault-sync
     spec:
       serviceAccountName: jenkins-vault-sync
+      nodeSelector:
+        kubernetes.io/arch: arm64
+        node-role.kubernetes.io/worker: "true"
       containers:
         - name: sync
           image: alpine:3.20

From c985a45113c4b6ca63d2814584b97cae1bacbe6f Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Thu, 22 Jan 2026 12:41:58 -0300
Subject: [PATCH 119/416] keycloak: allow harbor direct grants

---
 .../harbor-oidc-secret-ensure-job.yaml        |  2 +-
 .../scripts/harbor_oidc_secret_ensure.sh      | 37 ++++++++++++++++++-
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/services/keycloak/harbor-oidc-secret-ensure-job.yaml b/services/keycloak/harbor-oidc-secret-ensure-job.yaml
index 8eac50d1..87de4632 100644
--- a/services/keycloak/harbor-oidc-secret-ensure-job.yaml
+++ b/services/keycloak/harbor-oidc-secret-ensure-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: harbor-oidc-secret-ensure-9
+  name: harbor-oidc-secret-ensure-10
   namespace: sso
 spec:
   backoffLimit: 0
diff --git a/services/keycloak/scripts/harbor_oidc_secret_ensure.sh b/services/keycloak/scripts/harbor_oidc_secret_ensure.sh
index 7187d343..c70caa28 100755
--- a/services/keycloak/scripts/harbor_oidc_secret_ensure.sh
+++ b/services/keycloak/scripts/harbor_oidc_secret_ensure.sh
@@ -29,7 +29,7 @@ CLIENT_QUERY="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
 CLIENT_ID="$(echo "$CLIENT_QUERY" | jq -r '.[0].id' 2>/dev/null || true)"
 
 if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then
-  create_payload='{"clientId":"harbor","enabled":true,"protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":false,"serviceAccountsEnabled":false,"redirectUris":["https://registry.bstein.dev/c/oidc/callback"],"webOrigins":["https://registry.bstein.dev"],"rootUrl":"https://registry.bstein.dev","baseUrl":"/"}'
+  create_payload='{"clientId":"harbor","enabled":true,"protocol":"openid-connect","publicClient":false,"standardFlowEnabled":true,"implicitFlowEnabled":false,"directAccessGrantsEnabled":true,"serviceAccountsEnabled":false,"redirectUris":["https://registry.bstein.dev/c/oidc/callback"],"webOrigins":["https://registry.bstein.dev"],"rootUrl":"https://registry.bstein.dev","baseUrl":"/"}'
   status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \
     -H "Authorization: Bearer ${ACCESS_TOKEN}" \
     -H 'Content-Type: application/json' \
@@ -49,6 +49,21 @@ if [ -z "$CLIENT_ID" ] || [ "$CLIENT_ID" = "null" ]; then
   exit 1
 fi
 
+CLIENT_CONFIG="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
+  "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}" || true)"
+if [ -n "$CLIENT_CONFIG" ]; then
+  updated_config="$(echo "$CLIENT_CONFIG" | jq '.directAccessGrantsEnabled=true')"
+  status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \
+    -H "Authorization: Bearer ${ACCESS_TOKEN}" \
+    -H 'Content-Type: application/json' \
+    -d "${updated_config}" \
+    "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}")"
+  if [ "$status" != "200" ] && [ "$status" != "204" ]; then
+    echo "Keycloak client update failed (status ${status})" >&2
+    exit 1
+  fi
+fi
+
 SCOPE_ID="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
   "$KC_URL/admin/realms/atlas/client-scopes?search=groups" | jq -r '.[] | select(.name=="groups") | .id' 2>/dev/null | head -n1 || true)"
 if [ -z "$SCOPE_ID" ] || [ "$SCOPE_ID" = "null" ]; then
@@ -77,6 +92,26 @@ if ! echo "$DEFAULT_SCOPES" | jq -e '.[] | select(.name=="groups")' >/dev/null 2
   fi
 fi
 
+OFFLINE_SCOPE_ID="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
+  "$KC_URL/admin/realms/atlas/client-scopes?search=offline_access" | jq -r '.[] | select(.name=="offline_access") | .id' 2>/dev/null | head -n1 || true)"
+if [ -n "$OFFLINE_SCOPE_ID" ] && [ "$OFFLINE_SCOPE_ID" != "null" ]; then
+  if ! echo "$DEFAULT_SCOPES" | jq -e '.[] | select(.name=="offline_access")' >/dev/null 2>&1 \
+    && ! echo "$OPTIONAL_SCOPES" | jq -e '.[] | select(.name=="offline_access")' >/dev/null 2>&1; then
+    status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \
+      -H "Authorization: Bearer ${ACCESS_TOKEN}" \
+      "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes/${OFFLINE_SCOPE_ID}")"
+    if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then
+      status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \
+        -H "Authorization: Bearer ${ACCESS_TOKEN}" \
+        "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/optional-client-scopes/${OFFLINE_SCOPE_ID}")"
+      if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then
+        echo "Failed to attach offline_access scope to harbor (status ${status})" >&2
+        exit 1
+      fi
+    fi
+  fi
+fi
+
 CLIENT_SECRET="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
   "$KC_URL/admin/realms/atlas/clients/${CLIENT_ID}/client-secret" | jq -r '.value' 2>/dev/null || true)"
 if [ -z "$CLIENT_SECRET" ] || [ "$CLIENT_SECRET" = "null" ]; then

From 1fc431af766846ea5ce4e23d5b4907fbfb5b4bf0 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Thu, 22 Jan 2026 13:26:38 -0300
Subject: [PATCH 120/416] harbor: route v2 ingress to registry

---
 services/harbor/helmrelease.yaml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/services/harbor/helmrelease.yaml b/services/harbor/helmrelease.yaml
index b0cbdbda..db017873 100644
--- a/services/harbor/helmrelease.yaml
+++ b/services/harbor/helmrelease.yaml
@@ -378,6 +378,16 @@ spec:
                             subPath: app.conf
                           - name: ca-download
                             mountPath: /etc/core/ca
+          - target:
+              kind: Ingress
+              name: harbor-ingress
+            patch: |-
+              - op: replace
+                path: /spec/rules/0/http/paths/2/backend/service/name
+                value: harbor-registry
+              - op: replace
+                path: /spec/rules/0/http/paths/2/backend/service/port/number
+                value: 5000
                           - name: psc
                             mountPath: /etc/core/token
                     volumes:

From fc0943b1a64e7237d9e43a1a1a8c16845795a05a Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Thu, 22 Jan 2026 13:31:12 -0300
Subject: [PATCH 121/416] harbor: fix ingress patch placement

---
 services/harbor/helmrelease.yaml | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/services/harbor/helmrelease.yaml b/services/harbor/helmrelease.yaml
index db017873..16b81a8b 100644
--- a/services/harbor/helmrelease.yaml
+++ b/services/harbor/helmrelease.yaml
@@ -378,16 +378,6 @@ spec:
                             subPath: app.conf
                           - name: ca-download
                             mountPath: /etc/core/ca
-          - target:
-              kind: Ingress
-              name: harbor-ingress
-            patch: |-
-              - op: replace
-                path: /spec/rules/0/http/paths/2/backend/service/name
-                value: harbor-registry
-              - op: replace
-                path: /spec/rules/0/http/paths/2/backend/service/port/number
-                value: 5000
                           - name: psc
                             mountPath: /etc/core/token
                     volumes:
@@ -401,6 +391,16 @@ spec:
                         $patch: delete
                       - name: core-writable
                         emptyDir: {}
+          - target:
+              kind: Ingress
+              name: harbor-ingress
+            patch: |-
+              - op: replace
+                path: /spec/rules/0/http/paths/2/backend/service/name
+                value: harbor-registry
+              - op: replace
+                path: /spec/rules/0/http/paths/2/backend/service/port/number
+                value: 5000
           - target:
               kind: Deployment
               name: harbor-jobservice

From fd8330a8ab789c8a469d41291220510e39879eeb Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Thu, 22 Jan 2026 13:38:06 -0300
Subject: [PATCH 122/416] flux: temporarily drop harbor health checks

---
 .../atlas/flux-system/applications/harbor/kustomization.yaml | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/clusters/atlas/flux-system/applications/harbor/kustomization.yaml b/clusters/atlas/flux-system/applications/harbor/kustomization.yaml
index 06baf268..5eec32fc 100644
--- a/clusters/atlas/flux-system/applications/harbor/kustomization.yaml
+++ b/clusters/atlas/flux-system/applications/harbor/kustomization.yaml
@@ -13,11 +13,6 @@ spec:
     kind: GitRepository
     name: flux-system
     namespace: flux-system
-  healthChecks:
-    - apiVersion: helm.toolkit.fluxcd.io/v2
-      kind: HelmRelease
-      name: harbor
-      namespace: harbor
   wait: false
   dependsOn:
     - name: core

From 373e33a178037481fdbe7a2a04972563127cb47e Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Thu, 22 Jan 2026 14:09:39 -0300
Subject: [PATCH 123/416] ops: pause portal/ariadne and add migrate jobs

---
 .../bstein-dev-home/backend-deployment.yaml   | 16 ++++++-
 .../chat-ai-gateway-deployment.yaml           |  2 +-
 .../bstein-dev-home/frontend-deployment.yaml  |  2 +-
 services/bstein-dev-home/kustomization.yaml   |  1 +
 .../bstein-dev-home/portal-migrate-job.yaml   | 41 ++++++++++++++++++
 .../vault-sync-deployment.yaml                |  2 +-
 services/maintenance/ariadne-deployment.yaml  | 16 ++++++-
 services/maintenance/ariadne-migrate-job.yaml | 42 +++++++++++++++++++
 services/maintenance/kustomization.yaml       |  1 +
 9 files changed, 118 insertions(+), 5 deletions(-)
 create mode 100644 services/bstein-dev-home/portal-migrate-job.yaml
 create mode 100644 services/maintenance/ariadne-migrate-job.yaml

diff --git a/services/bstein-dev-home/backend-deployment.yaml b/services/bstein-dev-home/backend-deployment.yaml
index 074a19d0..100c3ebc 100644
--- a/services/bstein-dev-home/backend-deployment.yaml
+++ b/services/bstein-dev-home/backend-deployment.yaml
@@ -5,7 +5,7 @@ metadata:
   name: bstein-dev-home-backend
   namespace: bstein-dev-home
 spec:
-  replicas: 1
+  replicas: 0
   revisionHistoryLimit: 3
   selector:
     matchLabels:
@@ -99,6 +99,20 @@ spec:
               value: ""
             - name: HTTP_CHECK_TIMEOUT_SEC
               value: "2"
+            - name: PORTAL_DB_POOL_MIN
+              value: "0"
+            - name: PORTAL_DB_POOL_MAX
+              value: "5"
+            - name: PORTAL_DB_CONNECT_TIMEOUT_SEC
+              value: "5"
+            - name: PORTAL_DB_LOCK_TIMEOUT_SEC
+              value: "5"
+            - name: PORTAL_DB_STATEMENT_TIMEOUT_SEC
+              value: "30"
+            - name: PORTAL_DB_IDLE_IN_TX_TIMEOUT_SEC
+              value: "10"
+            - name: PORTAL_RUN_MIGRATIONS
+              value: "false"
             - name: ACCESS_REQUEST_SUBMIT_RATE_LIMIT
               value: "30"
             - name: ACCESS_REQUEST_SUBMIT_RATE_WINDOW_SEC
diff --git a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml
index 40d74fe1..3010a9b0 100644
--- a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml
+++ b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml
@@ -5,7 +5,7 @@ metadata:
   name: chat-ai-gateway
   namespace: bstein-dev-home
 spec:
-  replicas: 1
+  replicas: 0
   revisionHistoryLimit: 2
   selector:
     matchLabels:
diff --git a/services/bstein-dev-home/frontend-deployment.yaml b/services/bstein-dev-home/frontend-deployment.yaml
index ef26e73a..bbe5981a 100644
--- a/services/bstein-dev-home/frontend-deployment.yaml
+++ b/services/bstein-dev-home/frontend-deployment.yaml
@@ -5,7 +5,7 @@ metadata:
   name: bstein-dev-home-frontend
   namespace: bstein-dev-home
 spec:
-  replicas: 1
+  replicas: 0
   revisionHistoryLimit: 3
   selector:
     matchLabels:
diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 192ad7e3..28bbc3a8 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -15,6 +15,7 @@ resources:
   - frontend-service.yaml
   - backend-deployment.yaml
   - backend-service.yaml
+  - portal-migrate-job.yaml
   - vaultwarden-cred-sync-cronjob.yaml
   - portal-onboarding-e2e-test-job.yaml
   - ingress.yaml
diff --git a/services/bstein-dev-home/portal-migrate-job.yaml b/services/bstein-dev-home/portal-migrate-job.yaml
new file mode 100644
index 00000000..303a04fc
--- /dev/null
+++ b/services/bstein-dev-home/portal-migrate-job.yaml
@@ -0,0 +1,41 @@
+# services/bstein-dev-home/portal-migrate-job.yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: bstein-dev-home-portal-migrate
+  namespace: bstein-dev-home
+spec:
+  backoffLimit: 1
+  ttlSecondsAfterFinished: 3600
+  template:
+    metadata:
+      labels:
+        app: bstein-dev-home-portal-migrate
+      annotations:
+        vault.hashicorp.com/agent-inject: "true"
+        vault.hashicorp.com/role: "bstein-dev-home"
+        vault.hashicorp.com/agent-inject-secret-portal-env.sh: "kv/data/atlas/portal/atlas-portal-db"
+        vault.hashicorp.com/agent-inject-template-portal-env.sh: |
+          {{ with secret "kv/data/atlas/portal/atlas-portal-db" }}
+          export PORTAL_DATABASE_URL="{{ .Data.data.PORTAL_DATABASE_URL }}"
+          {{ end }}
+    spec:
+      serviceAccountName: bstein-dev-home
+      restartPolicy: Never
+      nodeSelector:
+        kubernetes.io/arch: arm64
+        node-role.kubernetes.io/worker: "true"
+      imagePullSecrets:
+        - name: harbor-regcred
+      containers:
+        - name: migrate
+          image: registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-95
+          imagePullPolicy: Always
+          command: ["/bin/sh", "-c"]
+          args:
+            - >-
+              . /vault/secrets/portal-env.sh
+              && exec python -m atlas_portal.migrate
+          env:
+            - name: PORTAL_RUN_MIGRATIONS
+              value: "true"
diff --git a/services/bstein-dev-home/vault-sync-deployment.yaml b/services/bstein-dev-home/vault-sync-deployment.yaml
index ad50f1e8..2f2ddbbe 100644
--- a/services/bstein-dev-home/vault-sync-deployment.yaml
+++ b/services/bstein-dev-home/vault-sync-deployment.yaml
@@ -5,7 +5,7 @@ metadata:
   name: bstein-dev-home-vault-sync
   namespace: bstein-dev-home
 spec:
-  replicas: 1
+  replicas: 0
   selector:
     matchLabels:
       app: bstein-dev-home-vault-sync
diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml
index 01e940cf..e11f8db2 100644
--- a/services/maintenance/ariadne-deployment.yaml
+++ b/services/maintenance/ariadne-deployment.yaml
@@ -5,7 +5,7 @@ metadata:
   name: ariadne
   namespace: maintenance
 spec:
-  replicas: 1
+  replicas: 0
   revisionHistoryLimit: 3
   selector:
     matchLabels:
@@ -129,6 +129,20 @@ spec:
               value: https://bstein.dev
             - name: ARIADNE_LOG_LEVEL
               value: INFO
+            - name: ARIADNE_DB_POOL_MIN
+              value: "0"
+            - name: ARIADNE_DB_POOL_MAX
+              value: "5"
+            - name: ARIADNE_DB_CONNECT_TIMEOUT_SEC
+              value: "5"
+            - name: ARIADNE_DB_LOCK_TIMEOUT_SEC
+              value: "5"
+            - name: ARIADNE_DB_STATEMENT_TIMEOUT_SEC
+              value: "30"
+            - name: ARIADNE_DB_IDLE_IN_TX_TIMEOUT_SEC
+              value: "10"
+            - name: ARIADNE_RUN_MIGRATIONS
+              value: "false"
             - name: PORTAL_ADMIN_USERS
               value: bstein
             - name: PORTAL_ADMIN_GROUPS
diff --git a/services/maintenance/ariadne-migrate-job.yaml b/services/maintenance/ariadne-migrate-job.yaml
new file mode 100644
index 00000000..472cf5f5
--- /dev/null
+++ b/services/maintenance/ariadne-migrate-job.yaml
@@ -0,0 +1,42 @@
+# services/maintenance/ariadne-migrate-job.yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: ariadne-migrate
+  namespace: maintenance
+spec:
+  backoffLimit: 1
+  ttlSecondsAfterFinished: 3600
+  template:
+    metadata:
+      labels:
+        app: ariadne-migrate
+      annotations:
+        vault.hashicorp.com/agent-inject: "true"
+        vault.hashicorp.com/role: "maintenance"
+        vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db"
+        vault.hashicorp.com/agent-inject-template-ariadne-env.sh: |
+          {{ with secret "kv/data/atlas/maintenance/ariadne-db" }}
+          export ARIADNE_DATABASE_URL="{{ .Data.data.database_url }}"
+          {{ end }}
+          {{ with secret "kv/data/atlas/portal/atlas-portal-db" }}
+          export PORTAL_DATABASE_URL="{{ .Data.data.PORTAL_DATABASE_URL }}"
+          {{ end }}
+    spec:
+      serviceAccountName: ariadne
+      restartPolicy: Never
+      nodeSelector:
+        kubernetes.io/arch: arm64
+        node-role.kubernetes.io/worker: "true"
+      containers:
+        - name: migrate
+          image: registry.bstein.dev/bstein/ariadne:0.1.0-0
+          imagePullPolicy: Always
+          command: ["/bin/sh", "-c"]
+          args:
+            - >-
+              . /vault/secrets/ariadne-env.sh
+              && exec python -m ariadne.migrate
+          env:
+            - name: ARIADNE_RUN_MIGRATIONS
+              value: "true"
diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 1f1c7316..c1350ebf 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -14,6 +14,7 @@ resources:
   - node-nofile-serviceaccount.yaml
   - pod-cleaner-rbac.yaml
   - ariadne-deployment.yaml
+  - ariadne-migrate-job.yaml
   - ariadne-service.yaml
   - disable-k3s-traefik-daemonset.yaml
   - k3s-traefik-cleanup-job.yaml

From 307d1bf7a6e5d5875a48e6570922e449cccdd158 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Thu, 22 Jan 2026 15:23:23 -0300
Subject: [PATCH 124/416] ops: restore portal/ariadne and add postgres panels

---
 scripts/dashboards_render_atlas.py            |  36 ++++-
 .../bstein-dev-home/backend-deployment.yaml   |   2 +-
 .../bstein-dev-home/frontend-deployment.yaml  |   2 +-
 .../vault-sync-deployment.yaml                |   2 +-
 services/maintenance/ariadne-deployment.yaml  |   2 +-
 .../monitoring/dashboards/atlas-overview.json | 138 +++++++++++++++++-
 .../grafana-dashboard-overview.yaml           | 138 +++++++++++++++++-
 7 files changed, 298 insertions(+), 22 deletions(-)

diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index 1f284895..f55896ad 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -371,6 +371,10 @@ ARIADNE_TEST_SUCCESS_RATE = (
 ARIADNE_TEST_FAILURES_24H = (
     'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))'
 )
+POSTGRES_CONN_USED_PCT = (
+    "100 * sum(pg_stat_activity_count) / clamp_min(max(pg_settings_max_connections), 1)"
+)
+POSTGRES_CONN_HOTTEST = 'topk(1, sum by (datname) (pg_stat_activity_count))'
 ONEOFF_JOB_OWNER = (
     'label_replace(kube_job_owner{owner_kind="CronJob"}, "owner_name", "$1", "job_name", "(.*)")'
 )
@@ -1057,7 +1061,7 @@ def build_overview():
             30,
             "Mail Sent (1d)",
             'max(postmark_outbound_sent{window="1d"})',
-            {"h": 3, "w": 6, "x": 0, "y": 8},
+            {"h": 3, "w": 4, "x": 0, "y": 8},
             unit="none",
             links=link_to("atlas-mail"),
         )
@@ -1068,7 +1072,7 @@ def build_overview():
             "type": "stat",
             "title": "Mail Bounces (1d)",
             "datasource": PROM_DS,
-            "gridPos": {"h": 3, "w": 6, "x": 12, "y": 8},
+            "gridPos": {"h": 3, "w": 4, "x": 8, "y": 8},
             "targets": [
                 {
                     "expr": 'max(postmark_outbound_bounce_rate{window="1d"})',
@@ -1114,7 +1118,7 @@ def build_overview():
             32,
             "Mail Success Rate (1d)",
             'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)',
-            {"h": 3, "w": 6, "x": 6, "y": 8},
+            {"h": 3, "w": 4, "x": 4, "y": 8},
             unit="percent",
             thresholds=mail_success_thresholds,
             decimals=1,
@@ -1126,13 +1130,37 @@ def build_overview():
             33,
             "Mail Limit Used (30d)",
             "max(postmark_sending_limit_used_percent)",
-            {"h": 3, "w": 6, "x": 18, "y": 8},
+            {"h": 3, "w": 4, "x": 12, "y": 8},
             unit="percent",
             thresholds=mail_limit_thresholds,
             decimals=1,
             links=link_to("atlas-mail"),
         )
     )
+    panels.append(
+        gauge_panel(
+            34,
+            "Postgres Connections Used",
+            POSTGRES_CONN_USED_PCT,
+            {"h": 3, "w": 4, "x": 16, "y": 8},
+            min_value=0,
+            max_value=100,
+            thresholds=PERCENT_THRESHOLDS,
+        )
+    )
+    panels.append(
+        stat_panel(
+            35,
+            "Postgres Hottest Connections",
+            POSTGRES_CONN_HOTTEST,
+            {"h": 3, "w": 4, "x": 20, "y": 8},
+            unit="none",
+            decimals=0,
+            text_mode="name_and_value",
+            legend="{{datname}}",
+            instant=True,
+        )
+    )
 
     storage_panels = [
         (23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"),
diff --git a/services/bstein-dev-home/backend-deployment.yaml b/services/bstein-dev-home/backend-deployment.yaml
index 100c3ebc..2170396e 100644
--- a/services/bstein-dev-home/backend-deployment.yaml
+++ b/services/bstein-dev-home/backend-deployment.yaml
@@ -5,7 +5,7 @@ metadata:
   name: bstein-dev-home-backend
   namespace: bstein-dev-home
 spec:
-  replicas: 0
+  replicas: 1
   revisionHistoryLimit: 3
   selector:
     matchLabels:
diff --git a/services/bstein-dev-home/frontend-deployment.yaml b/services/bstein-dev-home/frontend-deployment.yaml
index bbe5981a..ef26e73a 100644
--- a/services/bstein-dev-home/frontend-deployment.yaml
+++ b/services/bstein-dev-home/frontend-deployment.yaml
@@ -5,7 +5,7 @@ metadata:
   name: bstein-dev-home-frontend
   namespace: bstein-dev-home
 spec:
-  replicas: 0
+  replicas: 1
   revisionHistoryLimit: 3
   selector:
     matchLabels:
diff --git a/services/bstein-dev-home/vault-sync-deployment.yaml b/services/bstein-dev-home/vault-sync-deployment.yaml
index 2f2ddbbe..ad50f1e8 100644
--- a/services/bstein-dev-home/vault-sync-deployment.yaml
+++ b/services/bstein-dev-home/vault-sync-deployment.yaml
@@ -5,7 +5,7 @@ metadata:
   name: bstein-dev-home-vault-sync
   namespace: bstein-dev-home
 spec:
-  replicas: 0
+  replicas: 1
   selector:
     matchLabels:
       app: bstein-dev-home-vault-sync
diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml
index e11f8db2..581947c6 100644
--- a/services/maintenance/ariadne-deployment.yaml
+++ b/services/maintenance/ariadne-deployment.yaml
@@ -5,7 +5,7 @@ metadata:
   name: ariadne
   namespace: maintenance
 spec:
-  replicas: 0
+  replicas: 1
   revisionHistoryLimit: 3
   selector:
     matchLabels:
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 78744dac..93a2d803 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -796,7 +796,7 @@
       },
       "gridPos": {
         "h": 3,
-        "w": 6,
+        "w": 4,
         "x": 0,
         "y": 8
       },
@@ -863,8 +863,8 @@
       },
       "gridPos": {
         "h": 3,
-        "w": 6,
-        "x": 12,
+        "w": 4,
+        "x": 8,
         "y": 8
       },
       "targets": [
@@ -968,8 +968,8 @@
       },
       "gridPos": {
         "h": 3,
-        "w": 6,
-        "x": 6,
+        "w": 4,
+        "x": 4,
         "y": 8
       },
       "targets": [
@@ -1044,8 +1044,8 @@
       },
       "gridPos": {
         "h": 3,
-        "w": 6,
-        "x": 18,
+        "w": 4,
+        "x": 12,
         "y": 8
       },
       "targets": [
@@ -1110,6 +1110,130 @@
         }
       ]
     },
+    {
+      "id": 34,
+      "type": "gauge",
+      "title": "Postgres Connections Used",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 3,
+        "w": 4,
+        "x": 16,
+        "y": 8
+      },
+      "targets": [
+        {
+          "expr": "100 * sum(pg_stat_activity_count) / clamp_min(max(pg_settings_max_connections), 1)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "min": 0,
+          "max": 100,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 50
+              },
+              {
+                "color": "orange",
+                "value": 75
+              },
+              {
+                "color": "red",
+                "value": 91.5
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "orientation": "auto",
+        "showThresholdMarkers": false,
+        "showThresholdLabels": false
+      }
+    },
+    {
+      "id": 35,
+      "type": "stat",
+      "title": "Postgres Hottest Connections",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 3,
+        "w": 4,
+        "x": 20,
+        "y": 8
+      },
+      "targets": [
+        {
+          "expr": "topk(1, sum by (datname) (pg_stat_activity_count))",
+          "refId": "A",
+          "legendFormat": "{{datname}}",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(115, 115, 115, 1)",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto"
+          },
+          "decimals": 0
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "name_and_value"
+      }
+    },
     {
       "id": 23,
       "type": "stat",
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index fa19911f..0e9526ef 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -805,7 +805,7 @@ data:
           },
           "gridPos": {
             "h": 3,
-            "w": 6,
+            "w": 4,
             "x": 0,
             "y": 8
           },
@@ -872,8 +872,8 @@ data:
           },
           "gridPos": {
             "h": 3,
-            "w": 6,
-            "x": 12,
+            "w": 4,
+            "x": 8,
             "y": 8
           },
           "targets": [
@@ -977,8 +977,8 @@ data:
           },
           "gridPos": {
             "h": 3,
-            "w": 6,
-            "x": 6,
+            "w": 4,
+            "x": 4,
             "y": 8
           },
           "targets": [
@@ -1053,8 +1053,8 @@ data:
           },
           "gridPos": {
             "h": 3,
-            "w": 6,
-            "x": 18,
+            "w": 4,
+            "x": 12,
             "y": 8
           },
           "targets": [
@@ -1119,6 +1119,130 @@ data:
             }
           ]
         },
+        {
+          "id": 34,
+          "type": "gauge",
+          "title": "Postgres Connections Used",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 3,
+            "w": 4,
+            "x": 16,
+            "y": 8
+          },
+          "targets": [
+            {
+              "expr": "100 * sum(pg_stat_activity_count) / clamp_min(max(pg_settings_max_connections), 1)",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "min": 0,
+              "max": 100,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 50
+                  },
+                  {
+                    "color": "orange",
+                    "value": 75
+                  },
+                  {
+                    "color": "red",
+                    "value": 91.5
+                  }
+                ]
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "orientation": "auto",
+            "showThresholdMarkers": false,
+            "showThresholdLabels": false
+          }
+        },
+        {
+          "id": 35,
+          "type": "stat",
+          "title": "Postgres Hottest Connections",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 3,
+            "w": 4,
+            "x": 20,
+            "y": 8
+          },
+          "targets": [
+            {
+              "expr": "topk(1, sum by (datname) (pg_stat_activity_count))",
+              "refId": "A",
+              "legendFormat": "{{datname}}",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "thresholds"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "rgba(115, 115, 115, 1)",
+                    "value": null
+                  },
+                  {
+                    "color": "green",
+                    "value": 1
+                  }
+                ]
+              },
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto"
+              },
+              "decimals": 0
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "name_and_value"
+          }
+        },
         {
           "id": 23,
           "type": "stat",

From 3e165975087dcb60d57f34b32b65f95b4e66cd8a Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Thu, 22 Jan 2026 15:28:26 -0300
Subject: [PATCH 125/416] ops: bump portal and ariadne image tags

---
 services/bstein-dev-home/kustomization.yaml | 4 ++--
 services/maintenance/kustomization.yaml     | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 28bbc3a8..7c431b29 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -21,9 +21,9 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home
diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index c1350ebf..992c8890 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,7 +26,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-21 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-35 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From 74458dd82e698b15050076cae3ccbaf19b6c4dcb Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 18:29:01 +0000
Subject: [PATCH 126/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 7c431b29..28bbc3a8 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -21,9 +21,9 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From dc8238ec16769bce44f3f80980a5279774c34b79 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 18:29:24 +0000
Subject: [PATCH 127/416] chore(maintenance): automated image update

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 992c8890..c1350ebf 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,7 +26,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-35 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-21 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From c9e972539cb175f1108f0e42d02465b5c8e600ab Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Thu, 22 Jan 2026 15:33:08 -0300
Subject: [PATCH 128/416] images: auth image scan and bump tags

---
 services/bstein-dev-home/image.yaml         | 4 ++++
 services/bstein-dev-home/kustomization.yaml | 4 ++--
 services/maintenance/image.yaml             | 2 ++
 services/maintenance/kustomization.yaml     | 2 +-
 4 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/services/bstein-dev-home/image.yaml b/services/bstein-dev-home/image.yaml
index 3b6c7579..eed2736b 100644
--- a/services/bstein-dev-home/image.yaml
+++ b/services/bstein-dev-home/image.yaml
@@ -7,6 +7,8 @@ metadata:
 spec:
   image: registry.bstein.dev/bstein/bstein-dev-home-frontend
   interval: 1m0s
+  secretRef:
+    name: harbor-regcred
 ---
 apiVersion: image.toolkit.fluxcd.io/v1beta2
 kind: ImagePolicy
@@ -28,6 +30,8 @@ metadata:
 spec:
   image: registry.bstein.dev/bstein/bstein-dev-home-backend
   interval: 1m0s
+  secretRef:
+    name: harbor-regcred
 ---
 apiVersion: image.toolkit.fluxcd.io/v1beta2
 kind: ImagePolicy
diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 28bbc3a8..7c431b29 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -21,9 +21,9 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home
diff --git a/services/maintenance/image.yaml b/services/maintenance/image.yaml
index 95acbd0b..fd28d902 100644
--- a/services/maintenance/image.yaml
+++ b/services/maintenance/image.yaml
@@ -7,6 +7,8 @@ metadata:
 spec:
   image: registry.bstein.dev/bstein/ariadne
   interval: 1m0s
+  secretRef:
+    name: harbor-regcred
 ---
 apiVersion: image.toolkit.fluxcd.io/v1beta2
 kind: ImagePolicy
diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index c1350ebf..992c8890 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,7 +26,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-21 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-35 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From aa8e20470cdd8129bf814154746bd544f4563be8 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 18:33:30 +0000
Subject: [PATCH 129/416] chore(maintenance): automated image update

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 992c8890..c1350ebf 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,7 +26,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-35 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-21 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From 518b4dba4f9094e76557ba385a1963d9f1dbe32f Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 18:33:48 +0000
Subject: [PATCH 130/416] chore(maintenance): automated image update

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index c1350ebf..992c8890 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,7 +26,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-21 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-35 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From 53ad965a6e9c3f262dc48ddab98ba210aa4ed194 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 18:34:08 +0000
Subject: [PATCH 131/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 7c431b29..28bbc3a8 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -21,9 +21,9 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 1018e08d549e4b693b5c4a90eccaf29d6a09be4d Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 18:35:15 +0000
Subject: [PATCH 132/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 28bbc3a8..8bfc8a5e 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -21,7 +21,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 834c6d275c8cc74287e7809968f7d4964bbfe605 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 18:35:20 +0000
Subject: [PATCH 133/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 8bfc8a5e..7c431b29 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -23,7 +23,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From e6083868812de98ea806fa43a6cf4d754647477f Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Thu, 22 Jan 2026 15:39:57 -0300
Subject: [PATCH 134/416] jobs: force recreate migrate jobs

---
 services/bstein-dev-home/portal-migrate-job.yaml | 2 ++
 services/maintenance/ariadne-migrate-job.yaml    | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/services/bstein-dev-home/portal-migrate-job.yaml b/services/bstein-dev-home/portal-migrate-job.yaml
index 303a04fc..a578b8c8 100644
--- a/services/bstein-dev-home/portal-migrate-job.yaml
+++ b/services/bstein-dev-home/portal-migrate-job.yaml
@@ -4,6 +4,8 @@ kind: Job
 metadata:
   name: bstein-dev-home-portal-migrate
   namespace: bstein-dev-home
+  annotations:
+    kustomize.toolkit.fluxcd.io/force: "true"
 spec:
   backoffLimit: 1
   ttlSecondsAfterFinished: 3600
diff --git a/services/maintenance/ariadne-migrate-job.yaml b/services/maintenance/ariadne-migrate-job.yaml
index 472cf5f5..3528f9be 100644
--- a/services/maintenance/ariadne-migrate-job.yaml
+++ b/services/maintenance/ariadne-migrate-job.yaml
@@ -4,6 +4,8 @@ kind: Job
 metadata:
   name: ariadne-migrate
   namespace: maintenance
+  annotations:
+    kustomize.toolkit.fluxcd.io/force: "true"
 spec:
   backoffLimit: 1
   ttlSecondsAfterFinished: 3600

From d286950b6d36d09c0c255c0419c6bea7ef2045df Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 18:47:16 +0000
Subject: [PATCH 135/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 7c431b29..3075a664 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -21,7 +21,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-121 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From d255483c81eb4d3398317182b22286695b70550e Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 18:48:16 +0000
Subject: [PATCH 136/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 3075a664..c03f2c76 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -23,7 +23,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-121 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-121 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From c2e34bfaa003ad90734677e07fd90d3319b4a68e Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 21:00:29 +0000
Subject: [PATCH 137/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index c03f2c76..38b7c40c 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -23,7 +23,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-121 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-121 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-122 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 47e96bf45a8ca4c63e80045d9c18c02981252906 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 21:00:34 +0000
Subject: [PATCH 138/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 38b7c40c..4eaed54c 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -21,7 +21,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-121 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-122 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-122 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 47f77b3a3ce2f9e060d8f42c55b74d514131afd1 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 21:02:01 +0000
Subject: [PATCH 139/416] chore(maintenance): automated image update

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 992c8890..2de807e9 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,7 +26,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-35 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-37 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From 6c5a6c030d2231c25c4a31eda8b077bc2bac0abe Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Thu, 22 Jan 2026 17:58:53 -0300
Subject: [PATCH 140/416] jenkins: set timezone to America/Chicago

---
 services/jenkins/deployment.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/services/jenkins/deployment.yaml b/services/jenkins/deployment.yaml
index 0dc76afd..63f722bd 100644
--- a/services/jenkins/deployment.yaml
+++ b/services/jenkins/deployment.yaml
@@ -108,7 +108,9 @@ spec:
               containerPort: 50000
           env:
             - name: JAVA_OPTS
-              value: "-Xms512m -Xmx2048m"
+              value: "-Xms512m -Xmx2048m -Duser.timezone=America/Chicago"
+            - name: TZ
+              value: "America/Chicago"
             - name: JENKINS_OPTS
               value: "--webroot=/var/jenkins_cache/war"
             - name: JENKINS_SLAVE_AGENT_PORT

From 8b8766b0f090545313b3daf4e1eb4acda5eaeaae Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Thu, 22 Jan 2026 18:23:17 -0300
Subject: [PATCH 141/416] monitoring: add postgres metrics and update overview

---
 infrastructure/postgres/service.yaml          |  8 ++++
 infrastructure/postgres/statefulset.yaml      | 17 ++++++++
 scripts/dashboards_render_atlas.py            | 16 +++----
 .../monitoring/dashboards/atlas-overview.json | 42 ++++++++++---------
 .../grafana-dashboard-overview.yaml           | 42 ++++++++++---------
 5 files changed, 78 insertions(+), 47 deletions(-)

diff --git a/infrastructure/postgres/service.yaml b/infrastructure/postgres/service.yaml
index 3dcab3c2..b695045f 100644
--- a/infrastructure/postgres/service.yaml
+++ b/infrastructure/postgres/service.yaml
@@ -4,6 +4,10 @@ kind: Service
 metadata:
   name: postgres-service
   namespace: postgres
+  annotations:
+    prometheus.io/scrape: "true"
+    prometheus.io/port: "9187"
+    prometheus.io/path: "/metrics"
 spec:
   clusterIP: None
   ports:
@@ -11,5 +15,9 @@ spec:
       port: 5432
       protocol: TCP
       targetPort: 5432
+    - name: metrics
+      port: 9187
+      protocol: TCP
+      targetPort: 9187
   selector:
     app: postgres
diff --git a/infrastructure/postgres/statefulset.yaml b/infrastructure/postgres/statefulset.yaml
index e1a19214..2c792486 100644
--- a/infrastructure/postgres/statefulset.yaml
+++ b/infrastructure/postgres/statefulset.yaml
@@ -58,6 +58,23 @@ spec:
             - name: vault-secrets
               mountPath: /mnt/vault
               readOnly: true
+        - name: postgres-exporter
+          image: quay.io/prometheuscommunity/postgres-exporter:v0.15.0
+          ports:
+            - name: metrics
+              containerPort: 9187
+              protocol: TCP
+          env:
+            - name: DATA_SOURCE_URI
+              value: "localhost:5432/postgres?sslmode=disable"
+            - name: DATA_SOURCE_USER
+              value: postgres
+            - name: DATA_SOURCE_PASS_FILE
+              value: /mnt/vault/postgres_password
+          volumeMounts:
+            - name: vault-secrets
+              mountPath: /mnt/vault
+              readOnly: true
       volumes:
         - name: vault-secrets
           csi:
diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index f55896ad..11479d9d 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -371,8 +371,9 @@ ARIADNE_TEST_SUCCESS_RATE = (
 ARIADNE_TEST_FAILURES_24H = (
     'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))'
 )
-POSTGRES_CONN_USED_PCT = (
-    "100 * sum(pg_stat_activity_count) / clamp_min(max(pg_settings_max_connections), 1)"
+POSTGRES_CONN_USED = (
+    'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") '
+    'or label_replace(max(pg_settings_max_connections), "conn", "max", "__name__", ".*")'
 )
 POSTGRES_CONN_HOTTEST = 'topk(1, sum by (datname) (pg_stat_activity_count))'
 ONEOFF_JOB_OWNER = (
@@ -1138,14 +1139,15 @@ def build_overview():
         )
     )
     panels.append(
-        gauge_panel(
+        stat_panel(
             34,
             "Postgres Connections Used",
-            POSTGRES_CONN_USED_PCT,
+            POSTGRES_CONN_USED,
             {"h": 3, "w": 4, "x": 16, "y": 8},
-            min_value=0,
-            max_value=100,
-            thresholds=PERCENT_THRESHOLDS,
+            decimals=0,
+            text_mode="name_and_value",
+            legend="{{conn}}",
+            instant=True,
         )
     )
     panels.append(
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 93a2d803..2d7f3e51 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -1112,7 +1112,7 @@
     },
     {
       "id": 34,
-      "type": "gauge",
+      "type": "stat",
       "title": "Postgres Connections Used",
       "datasource": {
         "type": "prometheus",
@@ -1126,39 +1126,43 @@
       },
       "targets": [
         {
-          "expr": "100 * sum(pg_stat_activity_count) / clamp_min(max(pg_settings_max_connections), 1)",
-          "refId": "A"
+          "expr": "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")",
+          "refId": "A",
+          "legendFormat": "{{conn}}",
+          "instant": true
         }
       ],
       "fieldConfig": {
         "defaults": {
-          "min": 0,
-          "max": 100,
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
           "thresholds": {
             "mode": "absolute",
             "steps": [
               {
-                "color": "green",
+                "color": "rgba(115, 115, 115, 1)",
                 "value": null
               },
               {
-                "color": "yellow",
-                "value": 50
-              },
-              {
-                "color": "orange",
-                "value": 75
-              },
-              {
-                "color": "red",
-                "value": 91.5
+                "color": "green",
+                "value": 1
               }
             ]
-          }
+          },
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto"
+          },
+          "decimals": 0
         },
         "overrides": []
       },
       "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
         "reduceOptions": {
           "calcs": [
             "lastNotNull"
@@ -1166,9 +1170,7 @@
           "fields": "",
           "values": false
         },
-        "orientation": "auto",
-        "showThresholdMarkers": false,
-        "showThresholdLabels": false
+        "textMode": "name_and_value"
       }
     },
     {
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 0e9526ef..53361345 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -1121,7 +1121,7 @@ data:
         },
         {
           "id": 34,
-          "type": "gauge",
+          "type": "stat",
           "title": "Postgres Connections Used",
           "datasource": {
             "type": "prometheus",
@@ -1135,39 +1135,43 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * sum(pg_stat_activity_count) / clamp_min(max(pg_settings_max_connections), 1)",
-              "refId": "A"
+              "expr": "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")",
+              "refId": "A",
+              "legendFormat": "{{conn}}",
+              "instant": true
             }
           ],
           "fieldConfig": {
             "defaults": {
-              "min": 0,
-              "max": 100,
+              "color": {
+                "mode": "thresholds"
+              },
+              "mappings": [],
               "thresholds": {
                 "mode": "absolute",
                 "steps": [
                   {
-                    "color": "green",
+                    "color": "rgba(115, 115, 115, 1)",
                     "value": null
                   },
                   {
-                    "color": "yellow",
-                    "value": 50
-                  },
-                  {
-                    "color": "orange",
-                    "value": 75
-                  },
-                  {
-                    "color": "red",
-                    "value": 91.5
+                    "color": "green",
+                    "value": 1
                   }
                 ]
-              }
+              },
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto"
+              },
+              "decimals": 0
             },
             "overrides": []
           },
           "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
             "reduceOptions": {
               "calcs": [
                 "lastNotNull"
@@ -1175,9 +1179,7 @@ data:
               "fields": "",
               "values": false
             },
-            "orientation": "auto",
-            "showThresholdMarkers": false,
-            "showThresholdLabels": false
+            "textMode": "name_and_value"
           }
         },
         {

From 4f76e7879c7e426230969360187e002f500eee1b Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 21:41:04 +0000
Subject: [PATCH 142/416] chore(maintenance): automated image update

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 2de807e9..6f5b7dcb 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,7 +26,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-37 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-38 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From 9833c839da32e31077bb9f0d90dad089f53ceaa4 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 21:51:32 +0000
Subject: [PATCH 143/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 4eaed54c..cebb191a 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -21,7 +21,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-122 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-123 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-122 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From bf5b6a0cc40c1de0bd3bffaa34bd3637c709ca78 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 21:53:32 +0000
Subject: [PATCH 144/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index cebb191a..3ff70ab0 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -23,7 +23,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-123 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-122 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-123 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 761f4388ffb2584302e4a80461a0e96b9379cedc Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 22:08:33 +0000
Subject: [PATCH 145/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 3ff70ab0..4e811e0a 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -23,7 +23,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-123 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-123 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-124 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 0249122ed1c7303b2134ba330c80bb876ad201c8 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 22:08:37 +0000
Subject: [PATCH 146/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 4e811e0a..7dbfa1c5 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -21,7 +21,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-123 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-124 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-124 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 8024efd9addceef7c8adaf60444a35461c75619d Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 22:16:34 +0000
Subject: [PATCH 147/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 7dbfa1c5..200ee58d 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -23,7 +23,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-124 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-124 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-125 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From d0b9e6dbd4183401e1a79b3f492036f24e31d7e8 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Thu, 22 Jan 2026 22:16:37 +0000
Subject: [PATCH 148/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 200ee58d..d4f2e028 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -21,7 +21,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-124 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-125 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-125 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From a4b35bc3bc79bb5b6adcb0ddbe5d506fd8671f13 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 01:07:49 +0000
Subject: [PATCH 149/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index d4f2e028..459c63dd 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -21,7 +21,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-125 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-126 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-125 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 9bb9bd9a09659e02d821d49f272b7af56bc35f42 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 01:08:49 +0000
Subject: [PATCH 150/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 459c63dd..d9fa7c01 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -23,7 +23,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-126 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-125 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-126 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 780c7d450300ff47bdc63d6a92ea4ec33789222d Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 01:12:49 +0000
Subject: [PATCH 151/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index d9fa7c01..f651a921 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -21,7 +21,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-126 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-127 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-126 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 130323cae1514086545b00a369b36a7f113685df Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 01:14:49 +0000
Subject: [PATCH 152/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index f651a921..78f1cae8 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -23,7 +23,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-127 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-126 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-127 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 24e837ef729799eb9b0bccc1bc97a2e572ba589b Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 01:32:51 +0000
Subject: [PATCH 153/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 78f1cae8..ae77c9af 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -21,7 +21,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-127 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-128 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-127 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 0780e5a15531d4df065d8f11a3aa07b705cb43ee Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 01:42:52 +0000
Subject: [PATCH 154/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index ae77c9af..26b85365 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -21,7 +21,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-128 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-129 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-127 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From b598e5baa29228ebfa59c0790fca7c8cc788c158 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 01:51:53 +0000
Subject: [PATCH 155/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 26b85365..48f5bf7d 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -21,7 +21,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-129 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-130 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-127 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 100add5544c981696a122911dbfc442ad9bf515a Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 01:52:53 +0000
Subject: [PATCH 156/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 48f5bf7d..b5f5319a 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -23,7 +23,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-130 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-127 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-130 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 60a705b94c0ad5a4fe6ed8c775f05f8250cb2ffe Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 02:46:57 +0000
Subject: [PATCH 157/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index b5f5319a..d2512be1 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -21,7 +21,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-130 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-131 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-130 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From f54b15cdb975cba4c19f37d1b5699380650bb5fb Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 02:47:58 +0000
Subject: [PATCH 158/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index d2512be1..f36c3178 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -23,7 +23,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-131 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-130 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-131 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 0942905a4d24841dfe2f5ef54a2b960af13c5f5f Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 03:01:59 +0000
Subject: [PATCH 159/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index f36c3178..912cd1f2 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -21,7 +21,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-131 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-132 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-131 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 29ed8defa87177271fac2490ab615a3cc69b2b91 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 03:02:59 +0000
Subject: [PATCH 160/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 912cd1f2..8b47e2ed 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -23,7 +23,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-132 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-131 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-132 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 4099563792738054a5626a396553bff344a542c9 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 03:10:59 +0000
Subject: [PATCH 161/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 8b47e2ed..c83d9f3e 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -23,7 +23,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-132 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-132 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-133 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From eb9fe085e5207d4a99a61b7d497eeb243f0df14b Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 03:11:03 +0000
Subject: [PATCH 162/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index c83d9f3e..81931f2a 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -21,7 +21,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-132 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-133 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-133 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From e20d3b8fa808500683b1db8c25b0890d264af617 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 03:38:02 +0000
Subject: [PATCH 163/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 81931f2a..aab9154d 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -21,7 +21,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-133 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-134 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-133 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 0b4f094db47667da202ca796c23a348368da6c9c Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 03:39:02 +0000
Subject: [PATCH 164/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index aab9154d..45a2d815 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -23,7 +23,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-134 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-133 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-134 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 6b83dc4729d66182390928895bb7b19541aab48a Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Fri, 23 Jan 2026 01:35:15 -0300
Subject: [PATCH 165/416] comms: enable MSC4108 rendezvous in synapse

---
 services/comms/helmrelease.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/services/comms/helmrelease.yaml b/services/comms/helmrelease.yaml
index 4456348c..e6536fa4 100644
--- a/services/comms/helmrelease.yaml
+++ b/services/comms/helmrelease.yaml
@@ -138,6 +138,8 @@ spec:
       auto_join_rooms:
         - "#othrys:live.bstein.dev"
       autocreate_auto_join_rooms: true
+      experimental:
+        msc4108_enabled: true
       default_room_version: "11"
       experimental_features:
         msc3266_enabled: true

From 9ded5a75f06e5120b31486453a4023c4350474fc Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Fri, 23 Jan 2026 01:46:03 -0300
Subject: [PATCH 166/416] comms: enable MSC4108 under experimental_features

---
 services/comms/helmrelease.yaml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/services/comms/helmrelease.yaml b/services/comms/helmrelease.yaml
index e6536fa4..eeac49e8 100644
--- a/services/comms/helmrelease.yaml
+++ b/services/comms/helmrelease.yaml
@@ -138,10 +138,9 @@ spec:
       auto_join_rooms:
         - "#othrys:live.bstein.dev"
       autocreate_auto_join_rooms: true
-      experimental:
-        msc4108_enabled: true
       default_room_version: "11"
       experimental_features:
+        msc4108_enabled: true
         msc3266_enabled: true
         msc4143_enabled: true
         msc4222_enabled: true

From 8bd58f703443c0808dded7afbdeb3fbd41d63e11 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Fri, 23 Jan 2026 02:04:51 -0300
Subject: [PATCH 167/416] comms/keycloak: add mailu email claim

---
 services/comms/mas-configmap.yaml         |  2 +-
 services/keycloak/realm-settings-job.yaml | 47 +++++++++++++++++++++++
 2 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/services/comms/mas-configmap.yaml b/services/comms/mas-configmap.yaml
index 5e6cfdd1..9d2c11ea 100644
--- a/services/comms/mas-configmap.yaml
+++ b/services/comms/mas-configmap.yaml
@@ -72,7 +72,7 @@ data:
               template: "{{ user.name }}"
             email:
               action: force
-              template: "{{ user.email }}"
+              template: "{{ user.mailu_email }}"
 
     policy:
       data:
diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/realm-settings-job.yaml
index 6e6589de..e94076c6 100644
--- a/services/keycloak/realm-settings-job.yaml
+++ b/services/keycloak/realm-settings-job.yaml
@@ -542,6 +542,53 @@ spec:
                           if status not in (201, 204):
                               raise SystemExit(f"Unexpected mailu email mapper create response: {status}")
 
+                      mailu_claim_mapper = {
+                          "name": "mailu-email-claim",
+                          "protocol": "openid-connect",
+                          "protocolMapper": "oidc-usermodel-attribute-mapper",
+                          "consentRequired": False,
+                          "config": {
+                              "user.attribute": "mailu_email",
+                              "claim.name": "mailu_email",
+                              "jsonType.label": "String",
+                              "id.token.claim": "true",
+                              "access.token.claim": "true",
+                              "userinfo.token.claim": "true",
+                              "multivalued": "false",
+                              "aggregate.attrs": "false",
+                          },
+                      }
+                      status, mappers = http_json(
+                          "GET",
+                          f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models",
+                          access_token,
+                      )
+                      existing_claim = None
+                      if status == 200 and isinstance(mappers, list):
+                          for item in mappers:
+                              if isinstance(item, dict) and item.get("name") == mailu_claim_mapper["name"]:
+                                  existing_claim = item
+                                  break
+                      if existing_claim and existing_claim.get("id"):
+                          mailu_claim_mapper["id"] = existing_claim["id"]
+                          status, _ = http_json(
+                              "PUT",
+                              f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models/{existing_claim['id']}",
+                              access_token,
+                              mailu_claim_mapper,
+                          )
+                          if status not in (200, 204):
+                              raise SystemExit(f"Unexpected mailu email claim mapper update response: {status}")
+                      else:
+                          status, _ = http_json(
+                              "POST",
+                              f"{base_url}/admin/realms/{realm}/clients/{client_id}/protocol-mappers/models",
+                              access_token,
+                              mailu_claim_mapper,
+                          )
+                          if status not in (201, 204):
+                              raise SystemExit(f"Unexpected mailu email claim mapper create response: {status}")
+
               # Ensure MFA is on by default for newly-created users.
               status, required_actions = http_json(
                   "GET",

From 4594255cb2a4d0e6edfcfa44b8af689b7d7b0c62 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Fri, 23 Jan 2026 02:09:53 -0300
Subject: [PATCH 168/416] keycloak: bump realm settings job

---
 services/keycloak/realm-settings-job.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/realm-settings-job.yaml
index e94076c6..0de48d1e 100644
--- a/services/keycloak/realm-settings-job.yaml
+++ b/services/keycloak/realm-settings-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: keycloak-realm-settings-34
+  name: keycloak-realm-settings-35
   namespace: sso
 spec:
   backoffLimit: 0

From 7ebbcdb914b9359576c7721bb31c52fddc5653eb Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Fri, 23 Jan 2026 03:11:42 -0300
Subject: [PATCH 169/416] portal: bump migrate job name

---
 services/bstein-dev-home/portal-migrate-job.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/portal-migrate-job.yaml b/services/bstein-dev-home/portal-migrate-job.yaml
index a578b8c8..2cb2a12e 100644
--- a/services/bstein-dev-home/portal-migrate-job.yaml
+++ b/services/bstein-dev-home/portal-migrate-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: bstein-dev-home-portal-migrate
+  name: bstein-dev-home-portal-migrate-36
   namespace: bstein-dev-home
   annotations:
     kustomize.toolkit.fluxcd.io/force: "true"

From 21ee6cee79876d251318c77698cba1e43a985c49 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 06:13:15 +0000
Subject: [PATCH 170/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 45a2d815..41ad3e5b 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -21,7 +21,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-134 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-135 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-134 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From c929e9499d0c9d050b233f2847240f3a78d9a3a7 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 06:14:16 +0000
Subject: [PATCH 171/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 41ad3e5b..ea326a22 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -23,7 +23,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-135 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-134 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-135 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 8da007759bf866b4e67ea336bb61751dc11e4f19 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Fri, 23 Jan 2026 03:28:26 -0300
Subject: [PATCH 172/416] bstein-dev-home: separate portal migrations

---
 .../kustomization.yaml                           | 16 ++++++++++++++++
 .../flux-system/applications/kustomization.yaml  |  1 +
 services/bstein-dev-home/kustomization.yaml      |  1 -
 .../migrations/kustomization.yaml                |  6 ++++++
 .../{ => migrations}/portal-migrate-job.yaml     |  2 +-
 5 files changed, 24 insertions(+), 2 deletions(-)
 create mode 100644 clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml
 create mode 100644 services/bstein-dev-home/migrations/kustomization.yaml
 rename services/bstein-dev-home/{ => migrations}/portal-migrate-job.yaml (95%)

diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml
new file mode 100644
index 00000000..f962de0a
--- /dev/null
+++ b/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml
@@ -0,0 +1,16 @@
+# clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml
+apiVersion: kustomize.toolkit.fluxcd.io/v1
+kind: Kustomization
+metadata:
+  name: bstein-dev-home-migrations
+  namespace: flux-system
+spec:
+  interval: 10m
+  path: ./services/bstein-dev-home/migrations
+  prune: true
+  sourceRef:
+    kind: GitRepository
+    name: flux-system
+  targetNamespace: bstein-dev-home
+  wait: false
+  suspend: true
diff --git a/clusters/atlas/flux-system/applications/kustomization.yaml b/clusters/atlas/flux-system/applications/kustomization.yaml
index 417a3ec3..10c203d8 100644
--- a/clusters/atlas/flux-system/applications/kustomization.yaml
+++ b/clusters/atlas/flux-system/applications/kustomization.yaml
@@ -12,6 +12,7 @@ resources:
   - pegasus/image-automation.yaml
   - bstein-dev-home/kustomization.yaml
   - bstein-dev-home/image-automation.yaml
+  - bstein-dev-home-migrations/kustomization.yaml
   - harbor/kustomization.yaml
   - harbor/image-automation.yaml
   - jellyfin/kustomization.yaml
diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index ea326a22..e6a744f7 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -15,7 +15,6 @@ resources:
   - frontend-service.yaml
   - backend-deployment.yaml
   - backend-service.yaml
-  - portal-migrate-job.yaml
   - vaultwarden-cred-sync-cronjob.yaml
   - portal-onboarding-e2e-test-job.yaml
   - ingress.yaml
diff --git a/services/bstein-dev-home/migrations/kustomization.yaml b/services/bstein-dev-home/migrations/kustomization.yaml
new file mode 100644
index 00000000..067665bc
--- /dev/null
+++ b/services/bstein-dev-home/migrations/kustomization.yaml
@@ -0,0 +1,6 @@
+# services/bstein-dev-home/migrations/kustomization.yaml
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+namespace: bstein-dev-home
+resources:
+  - portal-migrate-job.yaml
diff --git a/services/bstein-dev-home/portal-migrate-job.yaml b/services/bstein-dev-home/migrations/portal-migrate-job.yaml
similarity index 95%
rename from services/bstein-dev-home/portal-migrate-job.yaml
rename to services/bstein-dev-home/migrations/portal-migrate-job.yaml
index 2cb2a12e..9d052546 100644
--- a/services/bstein-dev-home/portal-migrate-job.yaml
+++ b/services/bstein-dev-home/migrations/portal-migrate-job.yaml
@@ -1,4 +1,4 @@
-# services/bstein-dev-home/portal-migrate-job.yaml
+# services/bstein-dev-home/migrations/portal-migrate-job.yaml
 apiVersion: batch/v1
 kind: Job
 metadata:

From 4984147fac48773cbaf34f16a348f349ebaad757 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 06:44:18 +0000
Subject: [PATCH 173/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index e6a744f7..f705c4e7 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-135 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-136 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-135 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From c373b953c27aa7e480dc3302dacd19765ed766d1 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 06:45:19 +0000
Subject: [PATCH 174/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index f705c4e7..94239e33 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-136 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-135 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-136 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 993702afeedb71ac45793adadb4fc03bd04bc500 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Fri, 23 Jan 2026 11:50:55 -0300
Subject: [PATCH 175/416] monitoring: alert on VM outage

---
 .../vault-csi/secrets-store-csi-driver.yaml   |  3 +-
 .../monitoring/grafana-alerting-config.yaml   | 53 +++++++++++++++++++
 services/monitoring/helmrelease.yaml          |  2 +-
 3 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/infrastructure/vault-csi/secrets-store-csi-driver.yaml b/infrastructure/vault-csi/secrets-store-csi-driver.yaml
index 0b249fc9..0004c0d5 100644
--- a/infrastructure/vault-csi/secrets-store-csi-driver.yaml
+++ b/infrastructure/vault-csi/secrets-store-csi-driver.yaml
@@ -17,4 +17,5 @@ spec:
   values:
     syncSecret:
       enabled: true
-    enableSecretRotation: false
+    enableSecretRotation: true
+    rotationPollInterval: 2m
diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml
index daa1e29a..8713d3db 100644
--- a/services/monitoring/grafana-alerting-config.yaml
+++ b/services/monitoring/grafana-alerting-config.yaml
@@ -180,6 +180,59 @@ data:
               summary: "{{ $labels.instance }} CPU >90% for 10m"
             labels:
               severity: warning
+      - orgId: 1
+        name: atlas-metrics
+        folder: Alerts
+        interval: 1m
+        rules:
+          - uid: victoria-metrics-down
+            title: "VictoriaMetrics unavailable (>30m)"
+            condition: C
+            for: "30m"
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 600
+                  to: 0
+                datasourceUid: atlas-vm
+                model:
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  expr: sum(up{job="victoriametrics"})
+                  legendFormat: victoriametrics
+                  datasource:
+                    type: prometheus
+                    uid: atlas-vm
+              - refId: B
+                datasourceUid: __expr__
+                model:
+                  expression: A
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  reducer: last
+                  type: reduce
+              - refId: C
+                datasourceUid: __expr__
+                model:
+                  expression: B
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  type: threshold
+                  conditions:
+                    - evaluator:
+                        params: [1]
+                        type: lt
+                      operator:
+                        type: and
+                      reducer:
+                        type: last
+                      type: query
+            noDataState: Alerting
+            execErrState: Alerting
+            annotations:
+              summary: "VictoriaMetrics is unavailable for >30m"
+            labels:
+              severity: critical
       - orgId: 1
         name: maintenance
         folder: Alerts
diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index ac24f8a0..8e225d49 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -342,7 +342,7 @@ spec:
       GF_SMTP_HOST: "mail.bstein.dev:587"
       GF_SMTP_FROM: "no-reply-grafana@bstein.dev"
       GF_SMTP_FROM_NAME: "Atlas Grafana"
-      GRAFANA_ALERT_EMAILS: "alerts@bstein.dev"
+      GRAFANA_ALERT_EMAILS: "brad@bstein.dev"
       GF_SECURITY_ALLOW_EMBEDDING: "true"
       GF_AUTH_GENERIC_OAUTH_ENABLED: "true"
       GF_AUTH_GENERIC_OAUTH_NAME: "Keycloak"

From af112d9dfa1ae27f2a7402cdf509146ce0b83484 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Fri, 23 Jan 2026 14:07:52 -0300
Subject: [PATCH 176/416] finance: allow actual user creation

---
 services/finance/actual-budget-deployment.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/services/finance/actual-budget-deployment.yaml b/services/finance/actual-budget-deployment.yaml
index 55186b23..637e9ae1 100644
--- a/services/finance/actual-budget-deployment.yaml
+++ b/services/finance/actual-budget-deployment.yaml
@@ -90,6 +90,8 @@ spec:
               value: openid
             - name: ACTUAL_MULTIUSER
               value: "true"
+            - name: ACTUAL_USER_CREATION_MODE
+              value: login
             - name: ACTUAL_OPENID_DISCOVERY_URL
               value: https://sso.bstein.dev/realms/atlas
             - name: ACTUAL_OPENID_AUTHORIZATION_ENDPOINT
@@ -128,6 +130,8 @@ spec:
               value: openid
             - name: ACTUAL_MULTIUSER
               value: "true"
+            - name: ACTUAL_USER_CREATION_MODE
+              value: login
             - name: ACTUAL_OPENID_DISCOVERY_URL
               value: https://sso.bstein.dev/realms/atlas
             - name: ACTUAL_OPENID_AUTHORIZATION_ENDPOINT

From c47fc2dcb8b5f83bcf97e4de0aa60bd6fce2ef75 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 19:11:58 +0000
Subject: [PATCH 177/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 94239e33..5d2a1fd3 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-136 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-137 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-136 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From c1e869785f8c5b5fb287cb7ef6f7dea3e729b6ac Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 19:13:56 +0000
Subject: [PATCH 178/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 5d2a1fd3..23381a0d 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-137 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-136 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-137 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From e55a4ee5950ea539953dc6ef71e3cc8d183af2fe Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 19:56:31 +0000
Subject: [PATCH 179/416] chore(maintenance): automated image update

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 6f5b7dcb..617b715b 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,7 +26,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-38 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-39 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From 199a6fbac068d7b6ed1afa9048a6ff1be2c3835a Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 19:58:00 +0000
Subject: [PATCH 180/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 23381a0d..4007b7d0 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-137 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-139 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-137 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 512950094dd09457884821cc64690a1f7ad98677 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 20:00:01 +0000
Subject: [PATCH 181/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 4007b7d0..e43647c6 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-139 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-137 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-139 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 46ee074929cb8ba8a369a3b794b2d182699857ba Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Fri, 23 Jan 2026 17:21:18 -0300
Subject: [PATCH 182/416] maintenance: rotate ariadne migrate job name

---
 services/maintenance/ariadne-migrate-job.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/ariadne-migrate-job.yaml b/services/maintenance/ariadne-migrate-job.yaml
index 3528f9be..b9b1496f 100644
--- a/services/maintenance/ariadne-migrate-job.yaml
+++ b/services/maintenance/ariadne-migrate-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: ariadne-migrate
+  name: ariadne-migrate-2
   namespace: maintenance
   annotations:
     kustomize.toolkit.fluxcd.io/force: "true"

From 9c75e3973a164dac102909d870405b0fb8f3d9da Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 20:47:05 +0000
Subject: [PATCH 183/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index e43647c6..1642cbe6 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-139 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-142 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-139 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 00d76289f82e73497da0d3cfb6c4e467b41f5568 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 20:48:05 +0000
Subject: [PATCH 184/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 1642cbe6..9f989fd8 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-142 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-139 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-142 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 15a47f71be1e39385f9c0a2925a7b06d9fec667f Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 20:50:05 +0000
Subject: [PATCH 185/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 9f989fd8..b11cb44b 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-142 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-143 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-142 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 30e7833b6c0540539a04015cd598cc47f39fe067 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 20:51:05 +0000
Subject: [PATCH 186/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index b11cb44b..0039328a 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-143 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-142 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-143 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 2ef49f76b8b93a416a197c8b196e941d17c6027b Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 21:27:08 +0000
Subject: [PATCH 187/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 0039328a..a5482c04 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-143 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-144 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-143 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 957b1ef0a59bbbfe1cdc0206028853261dbbdf1b Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 21:28:08 +0000
Subject: [PATCH 188/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index a5482c04..17186035 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-144 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-143 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-144 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 2ed441ac7458da6261729afd2562639fa2ae61eb Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 21:44:40 +0000
Subject: [PATCH 189/416] chore(maintenance): automated image update

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 617b715b..09636067 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,7 +26,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-39 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-43 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From 0db28faf32f3d89af629d365c7e024e8ba898ed7 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Fri, 23 Jan 2026 18:58:14 -0300
Subject: [PATCH 190/416] flux: force apply migrations

---
 .../applications/bstein-dev-home-migrations/kustomization.yaml   | 1 +
 .../atlas/flux-system/platform/maintenance/kustomization.yaml    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml
index f962de0a..da61b2d1 100644
--- a/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml
+++ b/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml
@@ -8,6 +8,7 @@ spec:
   interval: 10m
   path: ./services/bstein-dev-home/migrations
   prune: true
+  force: true
   sourceRef:
     kind: GitRepository
     name: flux-system
diff --git a/clusters/atlas/flux-system/platform/maintenance/kustomization.yaml b/clusters/atlas/flux-system/platform/maintenance/kustomization.yaml
index fc655a4f..8477ec98 100644
--- a/clusters/atlas/flux-system/platform/maintenance/kustomization.yaml
+++ b/clusters/atlas/flux-system/platform/maintenance/kustomization.yaml
@@ -8,6 +8,7 @@ spec:
   interval: 10m
   path: ./services/maintenance
   prune: true
+  force: true
   sourceRef:
     kind: GitRepository
     name: flux-system

From 25b123703decc15b93d6cb457ee0b313f4542585 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 22:21:43 +0000
Subject: [PATCH 191/416] chore(maintenance): automated image update

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 09636067..18d0008a 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,7 +26,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-43 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-44 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From 65ba47d6c2c5b7c2c5a7df5b02c5175f1494b696 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 22:24:13 +0000
Subject: [PATCH 192/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 17186035..487fa644 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-144 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-145 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-144 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From f8d257bff89271fb96534ab4b4441131816c146b Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 22:25:15 +0000
Subject: [PATCH 193/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 487fa644..a3914b53 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-145 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-144 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-145 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 60faa8c74aa55e0e2987541faf876afd7f41d567 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 22:39:15 +0000
Subject: [PATCH 194/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index a3914b53..a58bea72 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-145 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-146 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-145 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From f243fff94f168132fd5474ee4c436bbdb506de44 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 22:40:15 +0000
Subject: [PATCH 195/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index a58bea72..ab69f05e 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-146 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-145 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-146 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From dc1fccd687ea3af67300850fe8dc05907ad761ab Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 23:19:18 +0000
Subject: [PATCH 196/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index ab69f05e..2fe7ad22 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-146 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-147 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-146 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 979882c8180eb2b37e456736340ff936be003fea Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 23:19:21 +0000
Subject: [PATCH 197/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 2fe7ad22..06829f6e 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-147 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-146 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-147 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From e232e1868573f4afc0686777ce4d4d2b9b7299d7 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 23:28:20 +0000
Subject: [PATCH 198/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 06829f6e..655cfaeb 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-147 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-148 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-147 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 547c15748aaaf7e371ae072a33bdef2e4a69b683 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 23:28:28 +0000
Subject: [PATCH 199/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 655cfaeb..3370bb17 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-148 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-147 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-148 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 362b4a4b5b97ba68cc4b5478b37fb22ed1d3e3ff Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 23:52:21 +0000
Subject: [PATCH 200/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 3370bb17..9c95b907 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-148 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-149 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-148 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 994bc02f2cf344f16a3bbbc5981c06ab9fefaa6d Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Fri, 23 Jan 2026 23:53:21 +0000
Subject: [PATCH 201/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 9c95b907..0fa46113 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-149 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-148 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-149 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From cf779aa1968c56b3d4be7f94287af1c41c409393 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Fri, 23 Jan 2026 22:30:50 -0300
Subject: [PATCH 202/416] keycloak: add vaultwarden_grandfathered flag

---
 services/keycloak/realm-settings-job.yaml    | 1 +
 services/maintenance/ariadne-deployment.yaml | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/realm-settings-job.yaml
index 0de48d1e..74f569b7 100644
--- a/services/keycloak/realm-settings-job.yaml
+++ b/services/keycloak/realm-settings-job.yaml
@@ -333,6 +333,7 @@ spec:
               ensure_group("admin")
               ensure_group("demo")
               ensure_group("test")
+              ensure_group("vaultwarden_grandfathered")
               planka_group = ensure_group("planka-users")
 
               if planka_group and planka_group.get("id"):
diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml
index 581947c6..52d10f96 100644
--- a/services/maintenance/ariadne-deployment.yaml
+++ b/services/maintenance/ariadne-deployment.yaml
@@ -150,7 +150,7 @@ spec:
             - name: ACCOUNT_ALLOWED_GROUPS
               value: dev,admin
             - name: ALLOWED_FLAG_GROUPS
-              value: demo,test
+              value: demo,test,vaultwarden_grandfathered
             - name: DEFAULT_USER_GROUPS
               value: dev
             - name: MAILU_DOMAIN

From 7b1d198f1d074d39a28ef2cf0889f77c06d0defc Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Sat, 24 Jan 2026 01:33:29 +0000
Subject: [PATCH 203/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 0fa46113..550a7a8f 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-149 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-149 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-150 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 1fd295f7817b43c9e3accd580c6d3b877bf1640c Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Sat, 24 Jan 2026 01:33:33 +0000
Subject: [PATCH 204/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 550a7a8f..efed9a33 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-149 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-150 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-150 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From d3d534d4f870dccf97bb6d0a0a61fc914cdd26fd Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Fri, 23 Jan 2026 22:41:20 -0300
Subject: [PATCH 205/416] keycloak: rerun realm settings job

---
 services/keycloak/realm-settings-job.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/realm-settings-job.yaml
index 74f569b7..9265ca3e 100644
--- a/services/keycloak/realm-settings-job.yaml
+++ b/services/keycloak/realm-settings-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: keycloak-realm-settings-35
+  name: keycloak-realm-settings-36
   namespace: sso
 spec:
   backoffLimit: 0

From 6492e64a03ec4c48406ee2e4dd274b276a5f9c95 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Sat, 24 Jan 2026 02:05:32 +0000
Subject: [PATCH 206/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index efed9a33..f38bd96c 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-150 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-151 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-150 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From ef1e3955a76a0588fea3b2da5d80a07fd1da85bb Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Sat, 24 Jan 2026 02:07:32 +0000
Subject: [PATCH 207/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index f38bd96c..276c82f2 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-151 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-150 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-151 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 5ed46c0ec8cde7dc9b81c4d5aa063df6f84e7898 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Sat, 24 Jan 2026 09:29:39 +0000
Subject: [PATCH 208/416] chore(maintenance): automated image update

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 18d0008a..b3516152 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,7 +26,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-44 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-47 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From fbf171d026a2a5b9aa1bb873ac8e5d7ccca1837e Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Sat, 24 Jan 2026 10:13:43 +0000
Subject: [PATCH 209/416] chore(maintenance): automated image update

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index b3516152..4e261cbf 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,7 +26,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-47 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-48 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From 1ca9c54aed4f61a85922f42062c8cd9e680dcb25 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Sat, 24 Jan 2026 10:15:15 +0000
Subject: [PATCH 210/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 276c82f2..d7cbaf7d 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-151 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-152 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-151 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From e50ee85fda2b8bdeda094d6d8fde9d39f18c9c74 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Sat, 24 Jan 2026 10:16:15 +0000
Subject: [PATCH 211/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index d7cbaf7d..cab14d70 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-152 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-151 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-152 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From ae2a031fabeaba35f7eed14bcb8d6cdf5e89bc88 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Sat, 24 Jan 2026 14:31:37 +0000
Subject: [PATCH 212/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index cab14d70..fad85342 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-152 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-153 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-152 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From f91d3d97dd4708c9e3c6343f60f95514085e8ac6 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Sat, 24 Jan 2026 14:32:37 +0000
Subject: [PATCH 213/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index fad85342..4b21d1e2 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-153 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-152 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-153 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From d5b1e77afeb7a91de767559d6744057040d429d3 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Sat, 24 Jan 2026 14:44:38 +0000
Subject: [PATCH 214/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 4b21d1e2..60db96aa 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-153 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-154 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-153 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 04c83fe98e858d9f6e4a5ca3e42b86c99a19d86d Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Sat, 24 Jan 2026 14:46:38 +0000
Subject: [PATCH 215/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 60db96aa..9d34348b 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-154 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-153 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-154 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From cb6c77bc740defda863c427badebd36f38df46ad Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Sat, 24 Jan 2026 14:16:36 -0300
Subject: [PATCH 216/416] vaultwarden: bump to 1.35.2

---
 knowledge/catalog/atlas.json                | 2 +-
 knowledge/catalog/atlas.yaml                | 2 +-
 services/comms/knowledge/catalog/atlas.json | 2 +-
 services/comms/knowledge/catalog/atlas.yaml | 2 +-
 services/vaultwarden/deployment.yaml        | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/knowledge/catalog/atlas.json b/knowledge/catalog/atlas.json
index 0d97bcd6..18cb6b64 100644
--- a/knowledge/catalog/atlas.json
+++ b/knowledge/catalog/atlas.json
@@ -998,7 +998,7 @@
       "serviceAccountName": null,
       "nodeSelector": {},
       "images": [
-        "vaultwarden/server:1.33.2"
+        "vaultwarden/server:1.35.2"
       ]
     }
   ],
diff --git a/knowledge/catalog/atlas.yaml b/knowledge/catalog/atlas.yaml
index f3e04a84..580a331b 100644
--- a/knowledge/catalog/atlas.yaml
+++ b/knowledge/catalog/atlas.yaml
@@ -672,7 +672,7 @@ workloads:
   serviceAccountName: null
   nodeSelector: {}
   images:
-  - vaultwarden/server:1.33.2
+  - vaultwarden/server:1.35.2
 services:
 - namespace: ai
   name: ollama
diff --git a/services/comms/knowledge/catalog/atlas.json b/services/comms/knowledge/catalog/atlas.json
index 0d97bcd6..18cb6b64 100644
--- a/services/comms/knowledge/catalog/atlas.json
+++ b/services/comms/knowledge/catalog/atlas.json
@@ -998,7 +998,7 @@
       "serviceAccountName": null,
       "nodeSelector": {},
       "images": [
-        "vaultwarden/server:1.33.2"
+        "vaultwarden/server:1.35.2"
       ]
     }
   ],
diff --git a/services/comms/knowledge/catalog/atlas.yaml b/services/comms/knowledge/catalog/atlas.yaml
index 6529e1a4..67f2fcb2 100644
--- a/services/comms/knowledge/catalog/atlas.yaml
+++ b/services/comms/knowledge/catalog/atlas.yaml
@@ -672,7 +672,7 @@ workloads:
   serviceAccountName: null
   nodeSelector: {}
   images:
-  - vaultwarden/server:1.33.2
+  - vaultwarden/server:1.35.2
 services:
 - namespace: ai
   name: ollama
diff --git a/services/vaultwarden/deployment.yaml b/services/vaultwarden/deployment.yaml
index 2893a924..e1d888a8 100644
--- a/services/vaultwarden/deployment.yaml
+++ b/services/vaultwarden/deployment.yaml
@@ -39,7 +39,7 @@ spec:
         node-role.kubernetes.io/worker: "true"
       containers:
         - name: vaultwarden
-          image: vaultwarden/server:1.33.2
+          image: vaultwarden/server:1.35.2
           command: ["/bin/sh", "-c"]
           args:
             - >-

From b18d0d40bc9c5115150d1d0f219b323de0431405 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Sun, 25 Jan 2026 00:06:26 +0000
Subject: [PATCH 217/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 9d34348b..63eaebf2 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-154 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-155 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-154 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 531e573c004ff06c522ae961359cb5da3ff1e2a2 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Sun, 25 Jan 2026 00:07:26 +0000
Subject: [PATCH 218/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 63eaebf2..1511f5c6 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-155 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-154 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-155 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 7fec7d4fd12cd02e12855f63a6020d4b185b490b Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Sun, 25 Jan 2026 17:39:57 +0000
Subject: [PATCH 219/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 1511f5c6..7ed1b524 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-155 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-156 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-155 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 936d599c6ec5e0ed439ed4709d4f2242376b0b89 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Sun, 25 Jan 2026 17:40:57 +0000
Subject: [PATCH 220/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 7ed1b524..0890f593 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-156 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-155 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-156 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 7e004efe65a8d82a4705bb26c63c7c73b26a2323 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Sun, 25 Jan 2026 18:04:59 +0000
Subject: [PATCH 221/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 0890f593..c0aff7fa 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-156 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-157 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-156 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 7c4af51287ad206de6e3beaea16053af3cedcdfe Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Sun, 25 Jan 2026 18:06:59 +0000
Subject: [PATCH 222/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index c0aff7fa..90c3b8de 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-157 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-156 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-157 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 58267ab522403842a53222b77dbf68aac0b3f5ec Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Sun, 25 Jan 2026 15:59:12 -0300
Subject: [PATCH 223/416] comms: route atlasbot to chat gateway

---
 services/bstein-dev-home/chat-ai-gateway-deployment.yaml | 2 +-
 services/comms/atlasbot-deployment.yaml                  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml
index 3010a9b0..40d74fe1 100644
--- a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml
+++ b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml
@@ -5,7 +5,7 @@ metadata:
   name: chat-ai-gateway
   namespace: bstein-dev-home
 spec:
-  replicas: 0
+  replicas: 1
   revisionHistoryLimit: 2
   selector:
     matchLabels:
diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 46180539..278a008f 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -76,7 +76,7 @@ spec:
             - name: BOT_USER
               value: atlasbot
             - name: OLLAMA_URL
-              value: https://chat.ai.bstein.dev/
+              value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/
             - name: OLLAMA_MODEL
               value: qwen2.5-coder:7b-instruct-q4_0
           resources:

From 712bba23a1ec5c38b092466c83ecd58dc8239f55 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Sun, 25 Jan 2026 16:19:15 -0300
Subject: [PATCH 224/416] ai: restart ollama deployment

---
 services/ai-llm/deployment.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/services/ai-llm/deployment.yaml b/services/ai-llm/deployment.yaml
index fa354408..dfa1bdd1 100644
--- a/services/ai-llm/deployment.yaml
+++ b/services/ai-llm/deployment.yaml
@@ -22,6 +22,7 @@ spec:
       annotations:
         ai.bstein.dev/model: qwen2.5-coder:7b-instruct-q4_0
         ai.bstein.dev/gpu: GPU pool (titan-20/21/22/24)
+        ai.bstein.dev/restartedAt: "2026-01-25T19:10:00Z"
     spec:
       affinity:
         nodeAffinity:

From 08be13fe91e073062a5435af12c3478363e51805 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 00:52:35 -0300
Subject: [PATCH 225/416] comms: normalize atlasbot replies

---
 services/comms/atlasbot-deployment.yaml |  4 ++-
 services/comms/scripts/atlasbot/bot.py  | 34 ++++++++++++++++++++++++-
 2 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 278a008f..c2bc108d 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-4
+        checksum/atlasbot-configmap: manual-atlasbot-5
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
@@ -75,6 +75,8 @@ spec:
               value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428
             - name: BOT_USER
               value: atlasbot
+            - name: BOT_MENTIONS
+              value: atlasbot
             - name: OLLAMA_URL
               value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/
             - name: OLLAMA_MODEL
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index e8bd1a83..3da93ba5 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -71,6 +71,8 @@ METRIC_HINT_WORDS = {
     "latency",
 }
 
+CODE_FENCE_RE = re.compile(r"^```(?:json)?\\s*(.*?)\\s*```$", re.DOTALL)
+
 def _tokens(text: str) -> list[str]:
     toks = [t.lower() for t in TOKEN_RE.findall(text or "")]
     return [t for t in toks if t not in STOPWORDS and len(t) >= 2]
@@ -442,6 +444,35 @@ def vm_cluster_snapshot() -> str:
         parts.append(pr)
     return "\n".join(parts).strip()
 
+def _strip_code_fence(text: str) -> str:
+    cleaned = (text or "").strip()
+    match = CODE_FENCE_RE.match(cleaned)
+    if match:
+        return match.group(1).strip()
+    return cleaned
+
+def _normalize_reply(value: Any) -> str:
+    if isinstance(value, dict):
+        for key in ("content", "response", "reply", "message"):
+            if key in value:
+                return _normalize_reply(value[key])
+        for v in value.values():
+            if isinstance(v, (str, dict, list)):
+                return _normalize_reply(v)
+        return json.dumps(value, ensure_ascii=False)
+    if isinstance(value, list):
+        parts = [_normalize_reply(item) for item in value]
+        return " ".join(p for p in parts if p)
+    if value is None:
+        return ""
+    text = _strip_code_fence(str(value))
+    if text.startswith("{") and text.endswith("}"):
+        try:
+            return _normalize_reply(json.loads(text))
+        except Exception:
+            return text
+    return text
+
 
 # Conversation state.
 history = collections.defaultdict(list)  # (room_id, sender|None) -> list[str] (short transcript)
@@ -511,7 +542,8 @@ def ollama_reply(hist_key, prompt: str, *, context: str) -> str:
         r = request.Request(OLLAMA_URL, data=json.dumps(payload).encode(), headers=headers)
         with request.urlopen(r, timeout=20) as resp:
             data = json.loads(resp.read().decode())
-            reply = data.get("message") or data.get("response") or data.get("reply") or "I'm here to help."
+            raw_reply = data.get("message") or data.get("response") or data.get("reply") or data
+            reply = _normalize_reply(raw_reply) or "I'm here to help."
             history[hist_key].append(f"Atlas: {reply}")
             return reply
     except Exception:

From e7f8290807cb1dfc0a60bdb037cf6d968e7e1052 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 01:07:49 -0300
Subject: [PATCH 226/416] comms: answer node count queries

---
 services/comms/atlasbot-deployment.yaml |  2 +-
 services/comms/scripts/atlasbot/bot.py  | 33 ++++++++++++++++++++++++-
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index c2bc108d..7a258acd 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-5
+        checksum/atlasbot-configmap: manual-atlasbot-6
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 3da93ba5..69c1b84b 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -444,6 +444,28 @@ def vm_cluster_snapshot() -> str:
         parts.append(pr)
     return "\n".join(parts).strip()
 
+def nodes_summary(cluster_name: str) -> str:
+    try:
+        data = k8s_get("/api/v1/nodes?limit=500")
+    except Exception:
+        return ""
+    items = data.get("items") or []
+    if not isinstance(items, list) or not items:
+        return ""
+    total = len(items)
+    ready = 0
+    for node in items:
+        conditions = node.get("status", {}).get("conditions") or []
+        for cond in conditions if isinstance(conditions, list) else []:
+            if cond.get("type") == "Ready":
+                if cond.get("status") == "True":
+                    ready += 1
+                break
+    not_ready = max(total - ready, 0)
+    if not_ready:
+        return f"{cluster_name} cluster has {total} nodes: {ready} Ready, {not_ready} NotReady."
+    return f"{cluster_name} cluster has {total} nodes, all Ready."
+
 def _strip_code_fence(text: str) -> str:
     cleaned = (text or "").strip()
     match = CODE_FENCE_RE.match(cleaned)
@@ -526,7 +548,8 @@ def ollama_reply(hist_key, prompt: str, *, context: str) -> str:
             "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. "
             "Be helpful, direct, and concise. "
             "Prefer answering with exact repo paths and Kubernetes resource names. "
-            "Never include or request secret values."
+            "Never include or request secret values. "
+            "Respond in plain sentences; do not return JSON or code fences unless explicitly asked."
         )
         transcript_parts = [system]
         if context:
@@ -601,6 +624,14 @@ def sync_loop(token: str, room_id: str):
                 if not (is_dm or mentioned):
                     continue
 
+                lower_body = body.lower()
+                if re.search(r"\\bhow many nodes\\b|\\bnode count\\b|\\bnumber of nodes\\b", lower_body):
+                    if any(word in lower_body for word in ("cluster", "atlas", "titan")):
+                        summary = nodes_summary("Atlas")
+                        if summary:
+                            send_msg(token, rid, summary)
+                            continue
+
                 # Only do live cluster/metrics introspection in DMs.
                 allow_tools = is_dm
 

From 75c8a21b466918ba469b5d27ad374d53de695255 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 01:32:01 -0300
Subject: [PATCH 227/416] comms: fix atlasbot node count matcher

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 services/comms/scripts/atlasbot/bot.py  | 8 +++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 7a258acd..fe1e9066 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-6
+        checksum/atlasbot-configmap: manual-atlasbot-7
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 69c1b84b..b2ac1c9b 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -625,12 +625,14 @@ def sync_loop(token: str, room_id: str):
                     continue
 
                 lower_body = body.lower()
-                if re.search(r"\\bhow many nodes\\b|\\bnode count\\b|\\bnumber of nodes\\b", lower_body):
+                if re.search(r"\bhow many nodes\b|\bnode count\b|\bnumber of nodes\b", lower_body):
                     if any(word in lower_body for word in ("cluster", "atlas", "titan")):
                         summary = nodes_summary("Atlas")
-                        if summary:
-                            send_msg(token, rid, summary)
+                        if not summary:
+                            send_msg(token, rid, "I couldn’t reach the cluster API to count nodes. Try again in a moment.")
                             continue
+                        send_msg(token, rid, summary)
+                        continue
 
                 # Only do live cluster/metrics introspection in DMs.
                 allow_tools = is_dm

From 9c3328a030d21c41cbcba5b4957693d224932c9d Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 01:35:47 -0300
Subject: [PATCH 228/416] comms: answer node name queries

---
 services/comms/atlasbot-deployment.yaml |  2 +-
 services/comms/scripts/atlasbot/bot.py  | 29 +++++++++++++++++++++++++
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index fe1e9066..7aedf4a0 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-7
+        checksum/atlasbot-configmap: manual-atlasbot-8
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index b2ac1c9b..6fb6bff0 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -466,6 +466,27 @@ def nodes_summary(cluster_name: str) -> str:
         return f"{cluster_name} cluster has {total} nodes: {ready} Ready, {not_ready} NotReady."
     return f"{cluster_name} cluster has {total} nodes, all Ready."
 
+def nodes_names_summary(cluster_name: str) -> str:
+    try:
+        data = k8s_get("/api/v1/nodes?limit=500")
+    except Exception:
+        return ""
+    items = data.get("items") or []
+    if not isinstance(items, list) or not items:
+        return ""
+    names = []
+    for node in items:
+        name = (node.get("metadata") or {}).get("name") or ""
+        if name:
+            names.append(name)
+    names = sorted(set(names))
+    if not names:
+        return ""
+    if len(names) <= 30:
+        return f"{cluster_name} node names: {', '.join(names)}."
+    shown = ", ".join(names[:30])
+    return f"{cluster_name} node names: {shown}, … (+{len(names) - 30} more)."
+
 def _strip_code_fence(text: str) -> str:
     cleaned = (text or "").strip()
     match = CODE_FENCE_RE.match(cleaned)
@@ -633,6 +654,14 @@ def sync_loop(token: str, room_id: str):
                             continue
                         send_msg(token, rid, summary)
                         continue
+                if re.search(r"\bnode names?\b|\bnodes? named\b|\bnaming\b", lower_body):
+                    if any(word in lower_body for word in ("cluster", "atlas", "titan")):
+                        names_summary = nodes_names_summary("Atlas")
+                        if not names_summary:
+                            send_msg(token, rid, "I couldn’t reach the cluster API to list node names. Try again in a moment.")
+                            continue
+                        send_msg(token, rid, names_summary)
+                        continue
 
                 # Only do live cluster/metrics introspection in DMs.
                 allow_tools = is_dm

From 87db5b2bd2ed7d6aafe0810e6adfcf803b06e905 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 03:32:17 -0300
Subject: [PATCH 229/416] comms: sync atlas knowledge and use ariadne state

---
 knowledge/catalog/atlas-summary.json          |   8 +-
 knowledge/catalog/atlas.json                  | 706 ++++++++++++++++--
 knowledge/catalog/atlas.yaml                  | 494 ++++++++++--
 knowledge/diagrams/atlas-http.mmd             |  43 +-
 scripts/knowledge_render_atlas.py             |  17 +
 services/comms/atlasbot-deployment.yaml       |   4 +-
 .../knowledge/catalog/atlas-summary.json      |   8 +-
 services/comms/knowledge/catalog/atlas.json   | 706 ++++++++++++++++--
 services/comms/knowledge/catalog/atlas.yaml   | 496 ++++++++++--
 .../comms/knowledge/catalog/runbooks.json     |  16 +
 .../comms/knowledge/diagrams/atlas-http.mmd   |  43 +-
 services/comms/knowledge/metis.md             |  26 +
 .../comms/knowledge/runbooks/comms-verify.md  |  30 +
 services/comms/knowledge/software/metis.md    |  73 ++
 services/comms/scripts/atlasbot/bot.py        |  38 +
 services/maintenance/ariadne-deployment.yaml  |   8 +
 services/maintenance/ariadne-rbac.yaml        |  15 +
 17 files changed, 2453 insertions(+), 278 deletions(-)
 create mode 100644 services/comms/knowledge/metis.md
 create mode 100644 services/comms/knowledge/runbooks/comms-verify.md
 create mode 100644 services/comms/knowledge/software/metis.md

diff --git a/knowledge/catalog/atlas-summary.json b/knowledge/catalog/atlas-summary.json
index fa350516..ea825ce7 100644
--- a/knowledge/catalog/atlas-summary.json
+++ b/knowledge/catalog/atlas-summary.json
@@ -1,8 +1,8 @@
 {
   "counts": {
-    "helmrelease_host_hints": 17,
-    "http_endpoints": 37,
-    "services": 43,
-    "workloads": 54
+    "helmrelease_host_hints": 19,
+    "http_endpoints": 45,
+    "services": 47,
+    "workloads": 74
   }
 }
diff --git a/knowledge/catalog/atlas.json b/knowledge/catalog/atlas.json
index 18cb6b64..21ac4073 100644
--- a/knowledge/catalog/atlas.json
+++ b/knowledge/catalog/atlas.json
@@ -11,6 +11,21 @@
       "path": "services/bstein-dev-home",
       "targetNamespace": "bstein-dev-home"
     },
+    {
+      "name": "bstein-dev-home-migrations",
+      "path": "services/bstein-dev-home/migrations",
+      "targetNamespace": "bstein-dev-home"
+    },
+    {
+      "name": "cert-manager",
+      "path": "infrastructure/cert-manager",
+      "targetNamespace": "cert-manager"
+    },
+    {
+      "name": "cert-manager-cleanup",
+      "path": "infrastructure/cert-manager/cleanup",
+      "targetNamespace": "cert-manager"
+    },
     {
       "name": "comms",
       "path": "services/comms",
@@ -26,6 +41,11 @@
       "path": "services/crypto",
       "targetNamespace": "crypto"
     },
+    {
+      "name": "finance",
+      "path": "services/finance",
+      "targetNamespace": "finance"
+    },
     {
       "name": "flux-system",
       "path": "clusters/atlas/flux-system",
@@ -46,6 +66,11 @@
       "path": "services/harbor",
       "targetNamespace": "harbor"
     },
+    {
+      "name": "health",
+      "path": "services/health",
+      "targetNamespace": "health"
+    },
     {
       "name": "helm",
       "path": "infrastructure/sources/helm",
@@ -71,6 +96,16 @@
       "path": "services/logging",
       "targetNamespace": null
     },
+    {
+      "name": "longhorn",
+      "path": "infrastructure/longhorn/core",
+      "targetNamespace": "longhorn-system"
+    },
+    {
+      "name": "longhorn-adopt",
+      "path": "infrastructure/longhorn/adopt",
+      "targetNamespace": "longhorn-system"
+    },
     {
       "name": "longhorn-ui",
       "path": "infrastructure/longhorn/ui-ingress",
@@ -161,11 +196,21 @@
       "path": "infrastructure/vault-csi",
       "targetNamespace": "kube-system"
     },
+    {
+      "name": "vault-injector",
+      "path": "infrastructure/vault-injector",
+      "targetNamespace": "vault"
+    },
     {
       "name": "vaultwarden",
       "path": "services/vaultwarden",
       "targetNamespace": "vaultwarden"
     },
+    {
+      "name": "wallet-monero-temp",
+      "path": "services/crypto/wallet-monero-temp",
+      "targetNamespace": "crypto"
+    },
     {
       "name": "xmr-miner",
       "path": "services/crypto/xmr-miner",
@@ -199,7 +244,7 @@
         "node-role.kubernetes.io/worker": "true"
       },
       "images": [
-        "registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92"
+        "registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-157"
       ]
     },
     {
@@ -215,7 +260,20 @@
         "node-role.kubernetes.io/worker": "true"
       },
       "images": [
-        "registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92"
+        "registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-157"
+      ]
+    },
+    {
+      "kind": "Deployment",
+      "namespace": "bstein-dev-home",
+      "name": "bstein-dev-home-vault-sync",
+      "labels": {
+        "app": "bstein-dev-home-vault-sync"
+      },
+      "serviceAccountName": "bstein-dev-home-vault-sync",
+      "nodeSelector": {},
+      "images": [
+        "alpine:3.20"
       ]
     },
     {
@@ -225,7 +283,7 @@
       "labels": {
         "app": "chat-ai-gateway"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "bstein-dev-home",
       "nodeSelector": {
         "kubernetes.io/arch": "arm64",
         "node-role.kubernetes.io/worker": "true"
@@ -249,6 +307,19 @@
         "python:3.11-slim"
       ]
     },
+    {
+      "kind": "Deployment",
+      "namespace": "comms",
+      "name": "comms-vault-sync",
+      "labels": {
+        "app": "comms-vault-sync"
+      },
+      "serviceAccountName": "comms-vault",
+      "nodeSelector": {},
+      "images": [
+        "alpine:3.20"
+      ]
+    },
     {
       "kind": "Deployment",
       "namespace": "comms",
@@ -256,7 +327,7 @@
       "labels": {
         "app": "coturn"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "comms-vault",
       "nodeSelector": {
         "hardware": "rpi5"
       },
@@ -286,7 +357,7 @@
       "labels": {
         "app": "livekit"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "comms-vault",
       "nodeSelector": {
         "hardware": "rpi5"
       },
@@ -301,12 +372,12 @@
       "labels": {
         "app": "livekit-token-service"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "comms-vault",
       "nodeSelector": {
         "hardware": "rpi5"
       },
       "images": [
-        "ghcr.io/element-hq/lk-jwt-service:0.3.0"
+        "registry.bstein.dev/tools/lk-jwt-service-vault:0.3.0"
       ]
     },
     {
@@ -316,7 +387,7 @@
       "labels": {
         "app": "matrix-authentication-service"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "comms-vault",
       "nodeSelector": {
         "hardware": "rpi5"
       },
@@ -331,7 +402,7 @@
       "labels": {
         "app.kubernetes.io/name": "matrix-guest-register"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "comms-vault",
       "nodeSelector": {},
       "images": [
         "python:3.11-slim"
@@ -365,6 +436,19 @@
         "ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9"
       ]
     },
+    {
+      "kind": "Deployment",
+      "namespace": "crypto",
+      "name": "crypto-vault-sync",
+      "labels": {
+        "app": "crypto-vault-sync"
+      },
+      "serviceAccountName": "crypto-vault-sync",
+      "nodeSelector": {},
+      "images": [
+        "alpine:3.20"
+      ]
+    },
     {
       "kind": "Deployment",
       "namespace": "crypto",
@@ -372,7 +456,7 @@
       "labels": {
         "app": "monero-p2pool"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "crypto-vault-sync",
       "nodeSelector": {
         "node-role.kubernetes.io/worker": "true"
       },
@@ -395,6 +479,53 @@
         "registry.bstein.dev/crypto/monerod:0.18.4.1"
       ]
     },
+    {
+      "kind": "Deployment",
+      "namespace": "crypto",
+      "name": "wallet-monero-temp",
+      "labels": {
+        "app": "wallet-monero-temp"
+      },
+      "serviceAccountName": "crypto-vault-sync",
+      "nodeSelector": {
+        "node-role.kubernetes.io/worker": "true"
+      },
+      "images": [
+        "registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1"
+      ]
+    },
+    {
+      "kind": "Deployment",
+      "namespace": "finance",
+      "name": "actual-budget",
+      "labels": {
+        "app": "actual-budget"
+      },
+      "serviceAccountName": "finance-vault",
+      "nodeSelector": {
+        "kubernetes.io/arch": "arm64",
+        "node-role.kubernetes.io/worker": "true"
+      },
+      "images": [
+        "actualbudget/actual-server:26.1.0-alpine@sha256:34aae5813fdfee12af2a50c4d0667df68029f1d61b90f45f282473273eb70d0d"
+      ]
+    },
+    {
+      "kind": "Deployment",
+      "namespace": "finance",
+      "name": "firefly",
+      "labels": {
+        "app": "firefly"
+      },
+      "serviceAccountName": "finance-vault",
+      "nodeSelector": {
+        "kubernetes.io/arch": "arm64",
+        "node-role.kubernetes.io/worker": "true"
+      },
+      "images": [
+        "fireflyiii/core:version-6.4.15"
+      ]
+    },
     {
       "kind": "Deployment",
       "namespace": "flux-system",
@@ -516,7 +647,7 @@
       "labels": {
         "app": "gitea"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "gitea-vault",
       "nodeSelector": {
         "node-role.kubernetes.io/worker": "true"
       },
@@ -524,6 +655,36 @@
         "gitea/gitea:1.23"
       ]
     },
+    {
+      "kind": "Deployment",
+      "namespace": "harbor",
+      "name": "harbor-vault-sync",
+      "labels": {
+        "app": "harbor-vault-sync"
+      },
+      "serviceAccountName": "harbor-vault-sync",
+      "nodeSelector": {},
+      "images": [
+        "alpine:3.20"
+      ]
+    },
+    {
+      "kind": "Deployment",
+      "namespace": "health",
+      "name": "wger",
+      "labels": {
+        "app": "wger"
+      },
+      "serviceAccountName": "health-vault-sync",
+      "nodeSelector": {
+        "kubernetes.io/arch": "arm64",
+        "node-role.kubernetes.io/worker": "true"
+      },
+      "images": [
+        "nginx:1.27.5-alpine@sha256:65645c7bb6a0661892a8b03b89d0743208a18dd2f3f17a54ef4b76fb8e2f2a10",
+        "wger/server@sha256:710588b78af4e0aa0b4d8a8061e4563e16eae80eeaccfe7f9e0d9cbdd7f0cbc5"
+      ]
+    },
     {
       "kind": "Deployment",
       "namespace": "jellyfin",
@@ -531,7 +692,7 @@
       "labels": {
         "app": "jellyfin"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "pegasus-vault-sync",
       "nodeSelector": {},
       "images": [
         "docker.io/jellyfin/jellyfin:10.11.5"
@@ -544,14 +705,27 @@
       "labels": {
         "app": "pegasus"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "pegasus-vault-sync",
       "nodeSelector": {
         "kubernetes.io/arch": "arm64",
         "node-role.kubernetes.io/worker": "true"
       },
       "images": [
         "alpine:3.20",
-        "registry.bstein.dev/streaming/pegasus:1.2.32"
+        "registry.bstein.dev/streaming/pegasus-vault:1.2.32"
+      ]
+    },
+    {
+      "kind": "Deployment",
+      "namespace": "jellyfin",
+      "name": "pegasus-vault-sync",
+      "labels": {
+        "app": "pegasus-vault-sync"
+      },
+      "serviceAccountName": "pegasus-vault-sync",
+      "nodeSelector": {},
+      "images": [
+        "alpine:3.20"
       ]
     },
     {
@@ -570,6 +744,35 @@
         "jenkins/jenkins:2.528.3-jdk21"
       ]
     },
+    {
+      "kind": "Deployment",
+      "namespace": "jenkins",
+      "name": "jenkins-vault-sync",
+      "labels": {
+        "app": "jenkins-vault-sync"
+      },
+      "serviceAccountName": "jenkins-vault-sync",
+      "nodeSelector": {
+        "kubernetes.io/arch": "arm64",
+        "node-role.kubernetes.io/worker": "true"
+      },
+      "images": [
+        "alpine:3.20"
+      ]
+    },
+    {
+      "kind": "DaemonSet",
+      "namespace": "kube-system",
+      "name": "ntp-sync",
+      "labels": {
+        "app": "ntp-sync"
+      },
+      "serviceAccountName": null,
+      "nodeSelector": {},
+      "images": [
+        "public.ecr.aws/docker/library/busybox:1.36.1"
+      ]
+    },
     {
       "kind": "DaemonSet",
       "namespace": "kube-system",
@@ -636,6 +839,21 @@
         "hashicorp/vault-csi-provider:1.7.0"
       ]
     },
+    {
+      "kind": "Deployment",
+      "namespace": "kube-system",
+      "name": "coredns",
+      "labels": {
+        "k8s-app": "kube-dns"
+      },
+      "serviceAccountName": "coredns",
+      "nodeSelector": {
+        "kubernetes.io/os": "linux"
+      },
+      "images": [
+        "registry.bstein.dev/infra/coredns:1.12.1"
+      ]
+    },
     {
       "kind": "DaemonSet",
       "namespace": "logging",
@@ -681,6 +899,19 @@
         "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131"
       ]
     },
+    {
+      "kind": "Deployment",
+      "namespace": "logging",
+      "name": "logging-vault-sync",
+      "labels": {
+        "app": "logging-vault-sync"
+      },
+      "serviceAccountName": "logging-vault-sync",
+      "nodeSelector": {},
+      "images": [
+        "alpine:3.20"
+      ]
+    },
     {
       "kind": "Deployment",
       "namespace": "logging",
@@ -688,12 +919,27 @@
       "labels": {
         "app": "oauth2-proxy-logs"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "logging-vault-sync",
       "nodeSelector": {
         "node-role.kubernetes.io/worker": "true"
       },
       "images": [
-        "quay.io/oauth2-proxy/oauth2-proxy:v7.6.0"
+        "registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0"
+      ]
+    },
+    {
+      "kind": "Deployment",
+      "namespace": "longhorn-system",
+      "name": "longhorn-vault-sync",
+      "labels": {
+        "app": "longhorn-vault-sync"
+      },
+      "serviceAccountName": "longhorn-vault-sync",
+      "nodeSelector": {
+        "node-role.kubernetes.io/worker": "true"
+      },
+      "images": [
+        "alpine:3.20"
       ]
     },
     {
@@ -703,7 +949,7 @@
       "labels": {
         "app": "oauth2-proxy-longhorn"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "longhorn-vault",
       "nodeSelector": {
         "node-role.kubernetes.io/worker": "true"
       },
@@ -729,14 +975,45 @@
     {
       "kind": "Deployment",
       "namespace": "mailu-mailserver",
-      "name": "mailu-sync-listener",
+      "name": "mailu-vault-sync",
       "labels": {
-        "app": "mailu-sync-listener"
+        "app": "mailu-vault-sync"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "mailu-vault-sync",
       "nodeSelector": {},
       "images": [
-        "python:3.11-alpine"
+        "alpine:3.20"
+      ]
+    },
+    {
+      "kind": "DaemonSet",
+      "namespace": "maintenance",
+      "name": "disable-k3s-traefik",
+      "labels": {
+        "app": "disable-k3s-traefik"
+      },
+      "serviceAccountName": "disable-k3s-traefik",
+      "nodeSelector": {
+        "node-role.kubernetes.io/control-plane": "true"
+      },
+      "images": [
+        "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131"
+      ]
+    },
+    {
+      "kind": "DaemonSet",
+      "namespace": "maintenance",
+      "name": "k3s-agent-restart",
+      "labels": {
+        "app": "k3s-agent-restart"
+      },
+      "serviceAccountName": "node-nofile",
+      "nodeSelector": {
+        "kubernetes.io/arch": "arm64",
+        "node-role.kubernetes.io/worker": "true"
+      },
+      "images": [
+        "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131"
       ]
     },
     {
@@ -767,6 +1044,35 @@
         "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131"
       ]
     },
+    {
+      "kind": "Deployment",
+      "namespace": "maintenance",
+      "name": "ariadne",
+      "labels": {
+        "app": "ariadne"
+      },
+      "serviceAccountName": "ariadne",
+      "nodeSelector": {
+        "kubernetes.io/arch": "arm64",
+        "node-role.kubernetes.io/worker": "true"
+      },
+      "images": [
+        "registry.bstein.dev/bstein/ariadne:0.1.0-48"
+      ]
+    },
+    {
+      "kind": "Deployment",
+      "namespace": "maintenance",
+      "name": "maintenance-vault-sync",
+      "labels": {
+        "app": "maintenance-vault-sync"
+      },
+      "serviceAccountName": "maintenance-vault-sync",
+      "nodeSelector": {},
+      "images": [
+        "alpine:3.20"
+      ]
+    },
     {
       "kind": "DaemonSet",
       "namespace": "monitoring",
@@ -795,6 +1101,19 @@
         "python:3.10-slim"
       ]
     },
+    {
+      "kind": "Deployment",
+      "namespace": "monitoring",
+      "name": "monitoring-vault-sync",
+      "labels": {
+        "app": "monitoring-vault-sync"
+      },
+      "serviceAccountName": "monitoring-vault-sync",
+      "nodeSelector": {},
+      "images": [
+        "alpine:3.20"
+      ]
+    },
     {
       "kind": "Deployment",
       "namespace": "monitoring",
@@ -802,7 +1121,7 @@
       "labels": {
         "app": "postmark-exporter"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "monitoring-vault-sync",
       "nodeSelector": {},
       "images": [
         "python:3.12-alpine"
@@ -830,7 +1149,7 @@
       "labels": {
         "app": "nextcloud"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "nextcloud-vault",
       "nodeSelector": {
         "hardware": "rpi5"
       },
@@ -845,7 +1164,7 @@
       "labels": {
         "app": "outline"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "outline-vault",
       "nodeSelector": {
         "node-role.kubernetes.io/worker": "true"
       },
@@ -875,7 +1194,7 @@
       "labels": {
         "app": "planka"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "planka-vault",
       "nodeSelector": {
         "node-role.kubernetes.io/worker": "true"
       },
@@ -895,7 +1214,8 @@
         "node-role.kubernetes.io/worker": "true"
       },
       "images": [
-        "postgres:15"
+        "postgres:15",
+        "quay.io/prometheuscommunity/postgres-exporter:v0.15.0"
       ]
     },
     {
@@ -905,8 +1225,11 @@
       "labels": {
         "app": "keycloak"
       },
-      "serviceAccountName": null,
-      "nodeSelector": {},
+      "serviceAccountName": "sso-vault",
+      "nodeSelector": {
+        "kubernetes.io/arch": "arm64",
+        "node-role.kubernetes.io/worker": "true"
+      },
       "images": [
         "quay.io/keycloak/keycloak:26.0.7"
       ]
@@ -918,12 +1241,25 @@
       "labels": {
         "app": "oauth2-proxy"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "sso-vault",
       "nodeSelector": {
         "node-role.kubernetes.io/worker": "true"
       },
       "images": [
-        "quay.io/oauth2-proxy/oauth2-proxy:v7.6.0"
+        "registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0"
+      ]
+    },
+    {
+      "kind": "Deployment",
+      "namespace": "sso",
+      "name": "sso-vault-sync",
+      "labels": {
+        "app": "sso-vault-sync"
+      },
+      "serviceAccountName": "sso-vault-sync",
+      "nodeSelector": {},
+      "images": [
+        "alpine:3.20"
       ]
     },
     {
@@ -933,7 +1269,7 @@
       "labels": {
         "app": "openldap"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "sso-vault",
       "nodeSelector": {
         "kubernetes.io/arch": "arm64",
         "node-role.kubernetes.io/worker": "true"
@@ -951,7 +1287,7 @@
       },
       "serviceAccountName": "sui-metrics",
       "nodeSelector": {
-        "kubernetes.io/hostname": "titan-24"
+        "hardware": "rpi5"
       },
       "images": [
         "victoriametrics/vmagent:v1.103.0"
@@ -962,7 +1298,9 @@
       "namespace": "traefik",
       "name": "traefik",
       "labels": {
-        "app": "traefik"
+        "app": "traefik",
+        "app.kubernetes.io/instance": "traefik-kube-system",
+        "app.kubernetes.io/name": "traefik"
       },
       "serviceAccountName": "traefik-ingress-controller",
       "nodeSelector": {
@@ -995,8 +1333,11 @@
       "labels": {
         "app": "vaultwarden"
       },
-      "serviceAccountName": null,
-      "nodeSelector": {},
+      "serviceAccountName": "vaultwarden-vault",
+      "nodeSelector": {
+        "kubernetes.io/arch": "arm64",
+        "node-role.kubernetes.io/worker": "true"
+      },
       "images": [
         "vaultwarden/server:1.35.2"
       ]
@@ -1565,6 +1906,54 @@
         }
       ]
     },
+    {
+      "namespace": "crypto",
+      "name": "wallet-monero-temp",
+      "type": "ClusterIP",
+      "selector": {
+        "app": "wallet-monero-temp"
+      },
+      "ports": [
+        {
+          "name": "rpc",
+          "port": 18083,
+          "targetPort": 18083,
+          "protocol": "TCP"
+        }
+      ]
+    },
+    {
+      "namespace": "finance",
+      "name": "actual-budget",
+      "type": "ClusterIP",
+      "selector": {
+        "app": "actual-budget"
+      },
+      "ports": [
+        {
+          "name": "http",
+          "port": 80,
+          "targetPort": 5006,
+          "protocol": "TCP"
+        }
+      ]
+    },
+    {
+      "namespace": "finance",
+      "name": "firefly",
+      "type": "ClusterIP",
+      "selector": {
+        "app": "firefly"
+      },
+      "ports": [
+        {
+          "name": "http",
+          "port": 80,
+          "targetPort": 8080,
+          "protocol": "TCP"
+        }
+      ]
+    },
     {
       "namespace": "flux-system",
       "name": "notification-controller",
@@ -1632,7 +2021,7 @@
     {
       "namespace": "gitea",
       "name": "gitea-ssh",
-      "type": "NodePort",
+      "type": "LoadBalancer",
       "selector": {
         "app": "gitea"
       },
@@ -1645,6 +2034,22 @@
         }
       ]
     },
+    {
+      "namespace": "health",
+      "name": "wger",
+      "type": "ClusterIP",
+      "selector": {
+        "app": "wger"
+      },
+      "ports": [
+        {
+          "name": "http",
+          "port": 80,
+          "targetPort": "http",
+          "protocol": "TCP"
+        }
+      ]
+    },
     {
       "namespace": "jellyfin",
       "name": "jellyfin",
@@ -1699,29 +2104,6 @@
         }
       ]
     },
-    {
-      "namespace": "kube-system",
-      "name": "traefik",
-      "type": "LoadBalancer",
-      "selector": {
-        "app.kubernetes.io/instance": "traefik-kube-system",
-        "app.kubernetes.io/name": "traefik"
-      },
-      "ports": [
-        {
-          "name": "web",
-          "port": 80,
-          "targetPort": "web",
-          "protocol": "TCP"
-        },
-        {
-          "name": "websecure",
-          "port": 443,
-          "targetPort": "websecure",
-          "protocol": "TCP"
-        }
-      ]
-    },
     {
       "namespace": "logging",
       "name": "oauth2-proxy-logs",
@@ -1803,17 +2185,17 @@
       ]
     },
     {
-      "namespace": "mailu-mailserver",
-      "name": "mailu-sync-listener",
+      "namespace": "maintenance",
+      "name": "ariadne",
       "type": "ClusterIP",
       "selector": {
-        "app": "mailu-sync-listener"
+        "app": "ariadne"
       },
       "ports": [
         {
           "name": "http",
-          "port": 8080,
-          "targetPort": 8080,
+          "port": 80,
+          "targetPort": "http",
           "protocol": "TCP"
         }
       ]
@@ -1959,6 +2341,12 @@
           "port": 5432,
           "targetPort": 5432,
           "protocol": "TCP"
+        },
+        {
+          "name": "metrics",
+          "port": 9187,
+          "targetPort": 9187,
+          "protocol": "TCP"
         }
       ]
     },
@@ -2032,6 +2420,28 @@
         }
       ]
     },
+    {
+      "namespace": "traefik",
+      "name": "traefik",
+      "type": "LoadBalancer",
+      "selector": {
+        "app": "traefik"
+      },
+      "ports": [
+        {
+          "name": "web",
+          "port": 80,
+          "targetPort": "web",
+          "protocol": "TCP"
+        },
+        {
+          "name": "websecure",
+          "port": 443,
+          "targetPort": "websecure",
+          "protocol": "TCP"
+        }
+      ]
+    },
     {
       "namespace": "traefik",
       "name": "traefik-metrics",
@@ -2210,6 +2620,26 @@
         "source": "bstein-dev-home"
       }
     },
+    {
+      "host": "budget.bstein.dev",
+      "path": "/",
+      "backend": {
+        "namespace": "finance",
+        "service": "actual-budget",
+        "port": 80,
+        "workloads": [
+          {
+            "kind": "Deployment",
+            "name": "actual-budget"
+          }
+        ]
+      },
+      "via": {
+        "kind": "Ingress",
+        "name": "actual-budget",
+        "source": "finance"
+      }
+    },
     {
       "host": "call.live.bstein.dev",
       "path": "/",
@@ -2290,6 +2720,26 @@
         "source": "nextcloud"
       }
     },
+    {
+      "host": "health.bstein.dev",
+      "path": "/",
+      "backend": {
+        "namespace": "health",
+        "service": "wger",
+        "port": 80,
+        "workloads": [
+          {
+            "kind": "Deployment",
+            "name": "wger"
+          }
+        ]
+      },
+      "via": {
+        "kind": "Ingress",
+        "name": "wger",
+        "source": "health"
+      }
+    },
     {
       "host": "kit.live.bstein.dev",
       "path": "/livekit/jwt",
@@ -2385,6 +2835,106 @@
         "source": "comms"
       }
     },
+    {
+      "host": "live.bstein.dev",
+      "path": "/_matrix/client/r0/register",
+      "backend": {
+        "namespace": "comms",
+        "service": "matrix-guest-register",
+        "port": 8080,
+        "workloads": [
+          {
+            "kind": "Deployment",
+            "name": "matrix-guest-register"
+          }
+        ]
+      },
+      "via": {
+        "kind": "Ingress",
+        "name": "matrix-routing",
+        "source": "comms"
+      }
+    },
+    {
+      "host": "live.bstein.dev",
+      "path": "/_matrix/client/v3/login",
+      "backend": {
+        "namespace": "comms",
+        "service": "matrix-authentication-service",
+        "port": 8080,
+        "workloads": [
+          {
+            "kind": "Deployment",
+            "name": "matrix-authentication-service"
+          }
+        ]
+      },
+      "via": {
+        "kind": "Ingress",
+        "name": "matrix-routing",
+        "source": "comms"
+      }
+    },
+    {
+      "host": "live.bstein.dev",
+      "path": "/_matrix/client/v3/logout",
+      "backend": {
+        "namespace": "comms",
+        "service": "matrix-authentication-service",
+        "port": 8080,
+        "workloads": [
+          {
+            "kind": "Deployment",
+            "name": "matrix-authentication-service"
+          }
+        ]
+      },
+      "via": {
+        "kind": "Ingress",
+        "name": "matrix-routing",
+        "source": "comms"
+      }
+    },
+    {
+      "host": "live.bstein.dev",
+      "path": "/_matrix/client/v3/refresh",
+      "backend": {
+        "namespace": "comms",
+        "service": "matrix-authentication-service",
+        "port": 8080,
+        "workloads": [
+          {
+            "kind": "Deployment",
+            "name": "matrix-authentication-service"
+          }
+        ]
+      },
+      "via": {
+        "kind": "Ingress",
+        "name": "matrix-routing",
+        "source": "comms"
+      }
+    },
+    {
+      "host": "live.bstein.dev",
+      "path": "/_matrix/client/v3/register",
+      "backend": {
+        "namespace": "comms",
+        "service": "matrix-guest-register",
+        "port": 8080,
+        "workloads": [
+          {
+            "kind": "Deployment",
+            "name": "matrix-guest-register"
+          }
+        ]
+      },
+      "via": {
+        "kind": "Ingress",
+        "name": "matrix-routing",
+        "source": "comms"
+      }
+    },
     {
       "host": "logs.bstein.dev",
       "path": "/",
@@ -2650,6 +3200,26 @@
         "source": "monerod"
       }
     },
+    {
+      "host": "money.bstein.dev",
+      "path": "/",
+      "backend": {
+        "namespace": "finance",
+        "service": "firefly",
+        "port": 80,
+        "workloads": [
+          {
+            "kind": "Deployment",
+            "name": "firefly"
+          }
+        ]
+      },
+      "via": {
+        "kind": "Ingress",
+        "name": "firefly",
+        "source": "finance"
+      }
+    },
     {
       "host": "notes.bstein.dev",
       "path": "/",
@@ -2838,7 +3408,6 @@
       "matrix.live.bstein.dev"
     ],
     "comms:comms/othrys-synapse": [
-      "bstein.dev",
       "kit.live.bstein.dev",
       "live.bstein.dev",
       "matrix.live.bstein.dev",
@@ -2853,6 +3422,9 @@
     "logging:logging/data-prepper": [
       "registry.bstein.dev"
     ],
+    "longhorn:longhorn-system/longhorn": [
+      "registry.bstein.dev"
+    ],
     "mailu:mailu-mailserver/mailu": [
       "bstein.dev",
       "mail.bstein.dev"
@@ -2862,8 +3434,12 @@
     ],
     "monitoring:monitoring/grafana": [
       "bstein.dev",
+      "mail.bstein.dev",
       "metrics.bstein.dev",
       "sso.bstein.dev"
+    ],
+    "monitoring:monitoring/kube-state-metrics": [
+      "atlas.bstein.dev"
     ]
   }
 }
diff --git a/knowledge/catalog/atlas.yaml b/knowledge/catalog/atlas.yaml
index 580a331b..b3b0119f 100644
--- a/knowledge/catalog/atlas.yaml
+++ b/knowledge/catalog/atlas.yaml
@@ -8,6 +8,15 @@ sources:
 - name: bstein-dev-home
   path: services/bstein-dev-home
   targetNamespace: bstein-dev-home
+- name: bstein-dev-home-migrations
+  path: services/bstein-dev-home/migrations
+  targetNamespace: bstein-dev-home
+- name: cert-manager
+  path: infrastructure/cert-manager
+  targetNamespace: cert-manager
+- name: cert-manager-cleanup
+  path: infrastructure/cert-manager/cleanup
+  targetNamespace: cert-manager
 - name: comms
   path: services/comms
   targetNamespace: comms
@@ -17,6 +26,9 @@ sources:
 - name: crypto
   path: services/crypto
   targetNamespace: crypto
+- name: finance
+  path: services/finance
+  targetNamespace: finance
 - name: flux-system
   path: clusters/atlas/flux-system
   targetNamespace: null
@@ -29,6 +41,9 @@ sources:
 - name: harbor
   path: services/harbor
   targetNamespace: harbor
+- name: health
+  path: services/health
+  targetNamespace: health
 - name: helm
   path: infrastructure/sources/helm
   targetNamespace: flux-system
@@ -44,6 +59,12 @@ sources:
 - name: logging
   path: services/logging
   targetNamespace: null
+- name: longhorn
+  path: infrastructure/longhorn/core
+  targetNamespace: longhorn-system
+- name: longhorn-adopt
+  path: infrastructure/longhorn/adopt
+  targetNamespace: longhorn-system
 - name: longhorn-ui
   path: infrastructure/longhorn/ui-ingress
   targetNamespace: longhorn-system
@@ -98,9 +119,15 @@ sources:
 - name: vault-csi
   path: infrastructure/vault-csi
   targetNamespace: kube-system
+- name: vault-injector
+  path: infrastructure/vault-injector
+  targetNamespace: vault
 - name: vaultwarden
   path: services/vaultwarden
   targetNamespace: vaultwarden
+- name: wallet-monero-temp
+  path: services/crypto/wallet-monero-temp
+  targetNamespace: crypto
 - name: xmr-miner
   path: services/crypto/xmr-miner
   targetNamespace: crypto
@@ -124,7 +151,7 @@ workloads:
     kubernetes.io/arch: arm64
     node-role.kubernetes.io/worker: 'true'
   images:
-  - registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92
+  - registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-157
 - kind: Deployment
   namespace: bstein-dev-home
   name: bstein-dev-home-frontend
@@ -135,13 +162,22 @@ workloads:
     kubernetes.io/arch: arm64
     node-role.kubernetes.io/worker: 'true'
   images:
-  - registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92
+  - registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-157
+- kind: Deployment
+  namespace: bstein-dev-home
+  name: bstein-dev-home-vault-sync
+  labels:
+    app: bstein-dev-home-vault-sync
+  serviceAccountName: bstein-dev-home-vault-sync
+  nodeSelector: {}
+  images:
+  - alpine:3.20
 - kind: Deployment
   namespace: bstein-dev-home
   name: chat-ai-gateway
   labels:
     app: chat-ai-gateway
-  serviceAccountName: null
+  serviceAccountName: bstein-dev-home
   nodeSelector:
     kubernetes.io/arch: arm64
     node-role.kubernetes.io/worker: 'true'
@@ -157,12 +193,21 @@ workloads:
     hardware: rpi5
   images:
   - python:3.11-slim
+- kind: Deployment
+  namespace: comms
+  name: comms-vault-sync
+  labels:
+    app: comms-vault-sync
+  serviceAccountName: comms-vault
+  nodeSelector: {}
+  images:
+  - alpine:3.20
 - kind: Deployment
   namespace: comms
   name: coturn
   labels:
     app: coturn
-  serviceAccountName: null
+  serviceAccountName: comms-vault
   nodeSelector:
     hardware: rpi5
   images:
@@ -182,7 +227,7 @@ workloads:
   name: livekit
   labels:
     app: livekit
-  serviceAccountName: null
+  serviceAccountName: comms-vault
   nodeSelector:
     hardware: rpi5
   images:
@@ -192,17 +237,17 @@ workloads:
   name: livekit-token-service
   labels:
     app: livekit-token-service
-  serviceAccountName: null
+  serviceAccountName: comms-vault
   nodeSelector:
     hardware: rpi5
   images:
-  - ghcr.io/element-hq/lk-jwt-service:0.3.0
+  - registry.bstein.dev/tools/lk-jwt-service-vault:0.3.0
 - kind: Deployment
   namespace: comms
   name: matrix-authentication-service
   labels:
     app: matrix-authentication-service
-  serviceAccountName: null
+  serviceAccountName: comms-vault
   nodeSelector:
     hardware: rpi5
   images:
@@ -212,7 +257,7 @@ workloads:
   name: matrix-guest-register
   labels:
     app.kubernetes.io/name: matrix-guest-register
-  serviceAccountName: null
+  serviceAccountName: comms-vault
   nodeSelector: {}
   images:
   - python:3.11-slim
@@ -235,12 +280,21 @@ workloads:
     node-role.kubernetes.io/worker: 'true'
   images:
   - ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9
+- kind: Deployment
+  namespace: crypto
+  name: crypto-vault-sync
+  labels:
+    app: crypto-vault-sync
+  serviceAccountName: crypto-vault-sync
+  nodeSelector: {}
+  images:
+  - alpine:3.20
 - kind: Deployment
   namespace: crypto
   name: monero-p2pool
   labels:
     app: monero-p2pool
-  serviceAccountName: null
+  serviceAccountName: crypto-vault-sync
   nodeSelector:
     node-role.kubernetes.io/worker: 'true'
   images:
@@ -255,6 +309,38 @@ workloads:
     node-role.kubernetes.io/worker: 'true'
   images:
   - registry.bstein.dev/crypto/monerod:0.18.4.1
+- kind: Deployment
+  namespace: crypto
+  name: wallet-monero-temp
+  labels:
+    app: wallet-monero-temp
+  serviceAccountName: crypto-vault-sync
+  nodeSelector:
+    node-role.kubernetes.io/worker: 'true'
+  images:
+  - registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1
+- kind: Deployment
+  namespace: finance
+  name: actual-budget
+  labels:
+    app: actual-budget
+  serviceAccountName: finance-vault
+  nodeSelector:
+    kubernetes.io/arch: arm64
+    node-role.kubernetes.io/worker: 'true'
+  images:
+  - actualbudget/actual-server:26.1.0-alpine@sha256:34aae5813fdfee12af2a50c4d0667df68029f1d61b90f45f282473273eb70d0d
+- kind: Deployment
+  namespace: finance
+  name: firefly
+  labels:
+    app: firefly
+  serviceAccountName: finance-vault
+  nodeSelector:
+    kubernetes.io/arch: arm64
+    node-role.kubernetes.io/worker: 'true'
+  images:
+  - fireflyiii/core:version-6.4.15
 - kind: Deployment
   namespace: flux-system
   name: helm-controller
@@ -344,17 +430,38 @@ workloads:
   name: gitea
   labels:
     app: gitea
-  serviceAccountName: null
+  serviceAccountName: gitea-vault
   nodeSelector:
     node-role.kubernetes.io/worker: 'true'
   images:
   - gitea/gitea:1.23
+- kind: Deployment
+  namespace: harbor
+  name: harbor-vault-sync
+  labels:
+    app: harbor-vault-sync
+  serviceAccountName: harbor-vault-sync
+  nodeSelector: {}
+  images:
+  - alpine:3.20
+- kind: Deployment
+  namespace: health
+  name: wger
+  labels:
+    app: wger
+  serviceAccountName: health-vault-sync
+  nodeSelector:
+    kubernetes.io/arch: arm64
+    node-role.kubernetes.io/worker: 'true'
+  images:
+  - nginx:1.27.5-alpine@sha256:65645c7bb6a0661892a8b03b89d0743208a18dd2f3f17a54ef4b76fb8e2f2a10
+  - wger/server@sha256:710588b78af4e0aa0b4d8a8061e4563e16eae80eeaccfe7f9e0d9cbdd7f0cbc5
 - kind: Deployment
   namespace: jellyfin
   name: jellyfin
   labels:
     app: jellyfin
-  serviceAccountName: null
+  serviceAccountName: pegasus-vault-sync
   nodeSelector: {}
   images:
   - docker.io/jellyfin/jellyfin:10.11.5
@@ -363,13 +470,22 @@ workloads:
   name: pegasus
   labels:
     app: pegasus
-  serviceAccountName: null
+  serviceAccountName: pegasus-vault-sync
   nodeSelector:
     kubernetes.io/arch: arm64
     node-role.kubernetes.io/worker: 'true'
   images:
   - alpine:3.20
-  - registry.bstein.dev/streaming/pegasus:1.2.32
+  - registry.bstein.dev/streaming/pegasus-vault:1.2.32
+- kind: Deployment
+  namespace: jellyfin
+  name: pegasus-vault-sync
+  labels:
+    app: pegasus-vault-sync
+  serviceAccountName: pegasus-vault-sync
+  nodeSelector: {}
+  images:
+  - alpine:3.20
 - kind: Deployment
   namespace: jenkins
   name: jenkins
@@ -381,6 +497,26 @@ workloads:
     node-role.kubernetes.io/worker: 'true'
   images:
   - jenkins/jenkins:2.528.3-jdk21
+- kind: Deployment
+  namespace: jenkins
+  name: jenkins-vault-sync
+  labels:
+    app: jenkins-vault-sync
+  serviceAccountName: jenkins-vault-sync
+  nodeSelector:
+    kubernetes.io/arch: arm64
+    node-role.kubernetes.io/worker: 'true'
+  images:
+  - alpine:3.20
+- kind: DaemonSet
+  namespace: kube-system
+  name: ntp-sync
+  labels:
+    app: ntp-sync
+  serviceAccountName: null
+  nodeSelector: {}
+  images:
+  - public.ecr.aws/docker/library/busybox:1.36.1
 - kind: DaemonSet
   namespace: kube-system
   name: nvidia-device-plugin-jetson
@@ -427,6 +563,16 @@ workloads:
     kubernetes.io/os: linux
   images:
   - hashicorp/vault-csi-provider:1.7.0
+- kind: Deployment
+  namespace: kube-system
+  name: coredns
+  labels:
+    k8s-app: kube-dns
+  serviceAccountName: coredns
+  nodeSelector:
+    kubernetes.io/os: linux
+  images:
+  - registry.bstein.dev/infra/coredns:1.12.1
 - kind: DaemonSet
   namespace: logging
   name: node-image-gc-rpi4
@@ -457,22 +603,41 @@ workloads:
     hardware: rpi5
   images:
   - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
+- kind: Deployment
+  namespace: logging
+  name: logging-vault-sync
+  labels:
+    app: logging-vault-sync
+  serviceAccountName: logging-vault-sync
+  nodeSelector: {}
+  images:
+  - alpine:3.20
 - kind: Deployment
   namespace: logging
   name: oauth2-proxy-logs
   labels:
     app: oauth2-proxy-logs
-  serviceAccountName: null
+  serviceAccountName: logging-vault-sync
   nodeSelector:
     node-role.kubernetes.io/worker: 'true'
   images:
-  - quay.io/oauth2-proxy/oauth2-proxy:v7.6.0
+  - registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0
+- kind: Deployment
+  namespace: longhorn-system
+  name: longhorn-vault-sync
+  labels:
+    app: longhorn-vault-sync
+  serviceAccountName: longhorn-vault-sync
+  nodeSelector:
+    node-role.kubernetes.io/worker: 'true'
+  images:
+  - alpine:3.20
 - kind: Deployment
   namespace: longhorn-system
   name: oauth2-proxy-longhorn
   labels:
     app: oauth2-proxy-longhorn
-  serviceAccountName: null
+  serviceAccountName: longhorn-vault
   nodeSelector:
     node-role.kubernetes.io/worker: 'true'
   images:
@@ -489,13 +654,34 @@ workloads:
   - registry.bstein.dev/bstein/kubectl:1.35.0
 - kind: Deployment
   namespace: mailu-mailserver
-  name: mailu-sync-listener
+  name: mailu-vault-sync
   labels:
-    app: mailu-sync-listener
-  serviceAccountName: null
+    app: mailu-vault-sync
+  serviceAccountName: mailu-vault-sync
   nodeSelector: {}
   images:
-  - python:3.11-alpine
+  - alpine:3.20
+- kind: DaemonSet
+  namespace: maintenance
+  name: disable-k3s-traefik
+  labels:
+    app: disable-k3s-traefik
+  serviceAccountName: disable-k3s-traefik
+  nodeSelector:
+    node-role.kubernetes.io/control-plane: 'true'
+  images:
+  - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
+- kind: DaemonSet
+  namespace: maintenance
+  name: k3s-agent-restart
+  labels:
+    app: k3s-agent-restart
+  serviceAccountName: node-nofile
+  nodeSelector:
+    kubernetes.io/arch: arm64
+    node-role.kubernetes.io/worker: 'true'
+  images:
+  - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
 - kind: DaemonSet
   namespace: maintenance
   name: node-image-sweeper
@@ -515,6 +701,26 @@ workloads:
   nodeSelector: {}
   images:
   - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
+- kind: Deployment
+  namespace: maintenance
+  name: ariadne
+  labels:
+    app: ariadne
+  serviceAccountName: ariadne
+  nodeSelector:
+    kubernetes.io/arch: arm64
+    node-role.kubernetes.io/worker: 'true'
+  images:
+  - registry.bstein.dev/bstein/ariadne:0.1.0-48
+- kind: Deployment
+  namespace: maintenance
+  name: maintenance-vault-sync
+  labels:
+    app: maintenance-vault-sync
+  serviceAccountName: maintenance-vault-sync
+  nodeSelector: {}
+  images:
+  - alpine:3.20
 - kind: DaemonSet
   namespace: monitoring
   name: dcgm-exporter
@@ -534,12 +740,21 @@ workloads:
     jetson: 'true'
   images:
   - python:3.10-slim
+- kind: Deployment
+  namespace: monitoring
+  name: monitoring-vault-sync
+  labels:
+    app: monitoring-vault-sync
+  serviceAccountName: monitoring-vault-sync
+  nodeSelector: {}
+  images:
+  - alpine:3.20
 - kind: Deployment
   namespace: monitoring
   name: postmark-exporter
   labels:
     app: postmark-exporter
-  serviceAccountName: null
+  serviceAccountName: monitoring-vault-sync
   nodeSelector: {}
   images:
   - python:3.12-alpine
@@ -558,7 +773,7 @@ workloads:
   name: nextcloud
   labels:
     app: nextcloud
-  serviceAccountName: null
+  serviceAccountName: nextcloud-vault
   nodeSelector:
     hardware: rpi5
   images:
@@ -568,7 +783,7 @@ workloads:
   name: outline
   labels:
     app: outline
-  serviceAccountName: null
+  serviceAccountName: outline-vault
   nodeSelector:
     node-role.kubernetes.io/worker: 'true'
   images:
@@ -588,7 +803,7 @@ workloads:
   name: planka
   labels:
     app: planka
-  serviceAccountName: null
+  serviceAccountName: planka-vault
   nodeSelector:
     node-role.kubernetes.io/worker: 'true'
   images:
@@ -603,13 +818,16 @@ workloads:
     node-role.kubernetes.io/worker: 'true'
   images:
   - postgres:15
+  - quay.io/prometheuscommunity/postgres-exporter:v0.15.0
 - kind: Deployment
   namespace: sso
   name: keycloak
   labels:
     app: keycloak
-  serviceAccountName: null
-  nodeSelector: {}
+  serviceAccountName: sso-vault
+  nodeSelector:
+    kubernetes.io/arch: arm64
+    node-role.kubernetes.io/worker: 'true'
   images:
   - quay.io/keycloak/keycloak:26.0.7
 - kind: Deployment
@@ -617,17 +835,26 @@ workloads:
   name: oauth2-proxy
   labels:
     app: oauth2-proxy
-  serviceAccountName: null
+  serviceAccountName: sso-vault
   nodeSelector:
     node-role.kubernetes.io/worker: 'true'
   images:
-  - quay.io/oauth2-proxy/oauth2-proxy:v7.6.0
+  - registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0
+- kind: Deployment
+  namespace: sso
+  name: sso-vault-sync
+  labels:
+    app: sso-vault-sync
+  serviceAccountName: sso-vault-sync
+  nodeSelector: {}
+  images:
+  - alpine:3.20
 - kind: StatefulSet
   namespace: sso
   name: openldap
   labels:
     app: openldap
-  serviceAccountName: null
+  serviceAccountName: sso-vault
   nodeSelector:
     kubernetes.io/arch: arm64
     node-role.kubernetes.io/worker: 'true'
@@ -640,7 +867,7 @@ workloads:
     app: sui-metrics
   serviceAccountName: sui-metrics
   nodeSelector:
-    kubernetes.io/hostname: titan-24
+    hardware: rpi5
   images:
   - victoriametrics/vmagent:v1.103.0
 - kind: Deployment
@@ -648,6 +875,8 @@ workloads:
   name: traefik
   labels:
     app: traefik
+    app.kubernetes.io/instance: traefik-kube-system
+    app.kubernetes.io/name: traefik
   serviceAccountName: traefik-ingress-controller
   nodeSelector:
     node-role.kubernetes.io/worker: 'true'
@@ -669,8 +898,10 @@ workloads:
   name: vaultwarden
   labels:
     app: vaultwarden
-  serviceAccountName: null
-  nodeSelector: {}
+  serviceAccountName: vaultwarden-vault
+  nodeSelector:
+    kubernetes.io/arch: arm64
+    node-role.kubernetes.io/worker: 'true'
   images:
   - vaultwarden/server:1.35.2
 services:
@@ -1040,6 +1271,36 @@ services:
     port: 3333
     targetPort: 3333
     protocol: TCP
+- namespace: crypto
+  name: wallet-monero-temp
+  type: ClusterIP
+  selector:
+    app: wallet-monero-temp
+  ports:
+  - name: rpc
+    port: 18083
+    targetPort: 18083
+    protocol: TCP
+- namespace: finance
+  name: actual-budget
+  type: ClusterIP
+  selector:
+    app: actual-budget
+  ports:
+  - name: http
+    port: 80
+    targetPort: 5006
+    protocol: TCP
+- namespace: finance
+  name: firefly
+  type: ClusterIP
+  selector:
+    app: firefly
+  ports:
+  - name: http
+    port: 80
+    targetPort: 8080
+    protocol: TCP
 - namespace: flux-system
   name: notification-controller
   type: ClusterIP
@@ -1082,7 +1343,7 @@ services:
     protocol: TCP
 - namespace: gitea
   name: gitea-ssh
-  type: NodePort
+  type: LoadBalancer
   selector:
     app: gitea
   ports:
@@ -1090,6 +1351,16 @@ services:
     port: 2242
     targetPort: 2242
     protocol: TCP
+- namespace: health
+  name: wger
+  type: ClusterIP
+  selector:
+    app: wger
+  ports:
+  - name: http
+    port: 80
+    targetPort: http
+    protocol: TCP
 - namespace: jellyfin
   name: jellyfin
   type: ClusterIP
@@ -1124,21 +1395,6 @@ services:
     port: 50000
     targetPort: 50000
     protocol: TCP
-- namespace: kube-system
-  name: traefik
-  type: LoadBalancer
-  selector:
-    app.kubernetes.io/instance: traefik-kube-system
-    app.kubernetes.io/name: traefik
-  ports:
-  - name: web
-    port: 80
-    targetPort: web
-    protocol: TCP
-  - name: websecure
-    port: 443
-    targetPort: websecure
-    protocol: TCP
 - namespace: logging
   name: oauth2-proxy-logs
   type: ClusterIP
@@ -1191,15 +1447,15 @@ services:
     port: 4190
     targetPort: 4190
     protocol: TCP
-- namespace: mailu-mailserver
-  name: mailu-sync-listener
+- namespace: maintenance
+  name: ariadne
   type: ClusterIP
   selector:
-    app: mailu-sync-listener
+    app: ariadne
   ports:
   - name: http
-    port: 8080
-    targetPort: 8080
+    port: 80
+    targetPort: http
     protocol: TCP
 - namespace: monitoring
   name: dcgm-exporter
@@ -1291,6 +1547,10 @@ services:
     port: 5432
     targetPort: 5432
     protocol: TCP
+  - name: metrics
+    port: 9187
+    targetPort: 9187
+    protocol: TCP
 - namespace: sso
   name: keycloak
   type: ClusterIP
@@ -1335,6 +1595,20 @@ services:
     port: 8429
     targetPort: 8429
     protocol: TCP
+- namespace: traefik
+  name: traefik
+  type: LoadBalancer
+  selector:
+    app: traefik
+  ports:
+  - name: web
+    port: 80
+    targetPort: web
+    protocol: TCP
+  - name: websecure
+    port: 443
+    targetPort: websecure
+    protocol: TCP
 - namespace: traefik
   name: traefik-metrics
   type: ClusterIP
@@ -1447,6 +1721,19 @@ http_endpoints:
     kind: Ingress
     name: bstein-dev-home
     source: bstein-dev-home
+- host: budget.bstein.dev
+  path: /
+  backend:
+    namespace: finance
+    service: actual-budget
+    port: 80
+    workloads:
+    - kind: Deployment
+      name: actual-budget
+  via:
+    kind: Ingress
+    name: actual-budget
+    source: finance
 - host: call.live.bstein.dev
   path: /
   backend:
@@ -1499,6 +1786,19 @@ http_endpoints:
     kind: Ingress
     name: nextcloud
     source: nextcloud
+- host: health.bstein.dev
+  path: /
+  backend:
+    namespace: health
+    service: wger
+    port: 80
+    workloads:
+    - kind: Deployment
+      name: wger
+  via:
+    kind: Ingress
+    name: wger
+    source: health
 - host: kit.live.bstein.dev
   path: /livekit/jwt
   backend:
@@ -1558,6 +1858,65 @@ http_endpoints:
     kind: Ingress
     name: matrix-routing
     source: comms
+- host: live.bstein.dev
+  path: /_matrix/client/r0/register
+  backend:
+    namespace: comms
+    service: matrix-guest-register
+    port: 8080
+    workloads: &id003
+    - kind: Deployment
+      name: matrix-guest-register
+  via:
+    kind: Ingress
+    name: matrix-routing
+    source: comms
+- host: live.bstein.dev
+  path: /_matrix/client/v3/login
+  backend:
+    namespace: comms
+    service: matrix-authentication-service
+    port: 8080
+    workloads: &id002
+    - kind: Deployment
+      name: matrix-authentication-service
+  via:
+    kind: Ingress
+    name: matrix-routing
+    source: comms
+- host: live.bstein.dev
+  path: /_matrix/client/v3/logout
+  backend:
+    namespace: comms
+    service: matrix-authentication-service
+    port: 8080
+    workloads: *id002
+  via:
+    kind: Ingress
+    name: matrix-routing
+    source: comms
+- host: live.bstein.dev
+  path: /_matrix/client/v3/refresh
+  backend:
+    namespace: comms
+    service: matrix-authentication-service
+    port: 8080
+    workloads: *id002
+  via:
+    kind: Ingress
+    name: matrix-routing
+    source: comms
+- host: live.bstein.dev
+  path: /_matrix/client/v3/register
+  backend:
+    namespace: comms
+    service: matrix-guest-register
+    port: 8080
+    workloads: *id003
+  via:
+    kind: Ingress
+    name: matrix-routing
+    source: comms
 - host: logs.bstein.dev
   path: /
   backend:
@@ -1601,9 +1960,7 @@ http_endpoints:
     namespace: comms
     service: matrix-authentication-service
     port: 8080
-    workloads: &id002
-    - kind: Deployment
-      name: matrix-authentication-service
+    workloads: *id002
   via:
     kind: Ingress
     name: matrix-routing
@@ -1647,9 +2004,7 @@ http_endpoints:
     namespace: comms
     service: matrix-guest-register
     port: 8080
-    workloads: &id003
-    - kind: Deployment
-      name: matrix-guest-register
+    workloads: *id003
   via:
     kind: Ingress
     name: matrix-routing
@@ -1722,6 +2077,19 @@ http_endpoints:
     kind: Ingress
     name: monerod
     source: monerod
+- host: money.bstein.dev
+  path: /
+  backend:
+    namespace: finance
+    service: firefly
+    port: 80
+    workloads:
+    - kind: Deployment
+      name: firefly
+  via:
+    kind: Ingress
+    name: firefly
+    source: finance
 - host: notes.bstein.dev
   path: /
   backend:
@@ -1845,7 +2213,6 @@ helmrelease_host_hints:
   - live.bstein.dev
   - matrix.live.bstein.dev
   comms:comms/othrys-synapse:
-  - bstein.dev
   - kit.live.bstein.dev
   - live.bstein.dev
   - matrix.live.bstein.dev
@@ -1856,6 +2223,8 @@ helmrelease_host_hints:
   - registry.bstein.dev
   logging:logging/data-prepper:
   - registry.bstein.dev
+  longhorn:longhorn-system/longhorn:
+  - registry.bstein.dev
   mailu:mailu-mailserver/mailu:
   - bstein.dev
   - mail.bstein.dev
@@ -1863,5 +2232,8 @@ helmrelease_host_hints:
   - alerts.bstein.dev
   monitoring:monitoring/grafana:
   - bstein.dev
+  - mail.bstein.dev
   - metrics.bstein.dev
   - sso.bstein.dev
+  monitoring:monitoring/kube-state-metrics:
+  - atlas.bstein.dev
diff --git a/knowledge/diagrams/atlas-http.mmd b/knowledge/diagrams/atlas-http.mmd
index ab7c3621..1aa7ac80 100644
--- a/knowledge/diagrams/atlas-http.mmd
+++ b/knowledge/diagrams/atlas-http.mmd
@@ -17,6 +17,11 @@ flowchart LR
   host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend
   wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"]
   svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend
+  host_budget_bstein_dev["budget.bstein.dev"]
+  svc_finance_actual_budget["finance/actual-budget (Service)"]
+  host_budget_bstein_dev --> svc_finance_actual_budget
+  wl_finance_actual_budget["finance/actual-budget (Deployment)"]
+  svc_finance_actual_budget --> wl_finance_actual_budget
   host_call_live_bstein_dev["call.live.bstein.dev"]
   svc_comms_element_call["comms/element-call (Service)"]
   host_call_live_bstein_dev --> svc_comms_element_call
@@ -37,6 +42,11 @@ flowchart LR
   host_cloud_bstein_dev --> svc_nextcloud_nextcloud
   wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"]
   svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud
+  host_health_bstein_dev["health.bstein.dev"]
+  svc_health_wger["health/wger (Service)"]
+  host_health_bstein_dev --> svc_health_wger
+  wl_health_wger["health/wger (Deployment)"]
+  svc_health_wger --> wl_health_wger
   host_kit_live_bstein_dev["kit.live.bstein.dev"]
   svc_comms_livekit_token_service["comms/livekit-token-service (Service)"]
   host_kit_live_bstein_dev --> svc_comms_livekit_token_service
@@ -50,6 +60,14 @@ flowchart LR
   host_live_bstein_dev --> svc_comms_matrix_wellknown
   svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"]
   host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
+  svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
+  host_live_bstein_dev --> svc_comms_matrix_guest_register
+  wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
+  svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
+  svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
+  host_live_bstein_dev --> svc_comms_matrix_authentication_service
+  wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
+  svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
   host_logs_bstein_dev["logs.bstein.dev"]
   svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"]
   host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs
@@ -64,21 +82,20 @@ flowchart LR
   svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"]
   host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front
   host_matrix_live_bstein_dev["matrix.live.bstein.dev"]
-  svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
   host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service
-  wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
-  svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
   host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown
   host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
-  svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
   host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register
-  wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
-  svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
   host_monero_bstein_dev["monero.bstein.dev"]
   svc_crypto_monerod["crypto/monerod (Service)"]
   host_monero_bstein_dev --> svc_crypto_monerod
   wl_crypto_monerod["crypto/monerod (Deployment)"]
   svc_crypto_monerod --> wl_crypto_monerod
+  host_money_bstein_dev["money.bstein.dev"]
+  svc_finance_firefly["finance/firefly (Service)"]
+  host_money_bstein_dev --> svc_finance_firefly
+  wl_finance_firefly["finance/firefly (Deployment)"]
+  svc_finance_firefly --> wl_finance_firefly
   host_notes_bstein_dev["notes.bstein.dev"]
   svc_outline_outline["outline/outline (Service)"]
   host_notes_bstein_dev --> svc_outline_outline
@@ -143,19 +160,29 @@ flowchart LR
     svc_comms_livekit
     wl_comms_livekit
     svc_comms_othrys_synapse_matrix_synapse
-    svc_comms_matrix_authentication_service
-    wl_comms_matrix_authentication_service
     svc_comms_matrix_guest_register
     wl_comms_matrix_guest_register
+    svc_comms_matrix_authentication_service
+    wl_comms_matrix_authentication_service
   end
   subgraph crypto[crypto]
     svc_crypto_monerod
     wl_crypto_monerod
   end
+  subgraph finance[finance]
+    svc_finance_actual_budget
+    wl_finance_actual_budget
+    svc_finance_firefly
+    wl_finance_firefly
+  end
   subgraph gitea[gitea]
     svc_gitea_gitea
     wl_gitea_gitea
   end
+  subgraph health[health]
+    svc_health_wger
+    wl_health_wger
+  end
   subgraph jellyfin[jellyfin]
     svc_jellyfin_pegasus
     wl_jellyfin_pegasus
diff --git a/scripts/knowledge_render_atlas.py b/scripts/knowledge_render_atlas.py
index c7f9f26f..34938e74 100644
--- a/scripts/knowledge_render_atlas.py
+++ b/scripts/knowledge_render_atlas.py
@@ -20,6 +20,7 @@ import subprocess
 import sys
 from dataclasses import dataclass
 from pathlib import Path
+import shutil
 from typing import Any, Iterable
 
 import yaml
@@ -60,6 +61,12 @@ def _run(cmd: list[str], *, cwd: Path) -> str:
     return res.stdout
 
 
+def _sync_tree(source: Path, dest: Path) -> None:
+    if dest.exists():
+        shutil.rmtree(dest)
+    shutil.copytree(source, dest)
+
+
 def kustomize_build(path: Path) -> str:
     rel = path.relative_to(REPO_ROOT)
     try:
@@ -472,6 +479,11 @@ def main() -> int:
         action="store_true",
         help="Write generated files (otherwise just print a summary).",
     )
+    ap.add_argument(
+        "--sync-comms",
+        action="store_true",
+        help="Mirror rendered knowledge into services/comms/knowledge for atlasbot.",
+    )
     args = ap.parse_args()
 
     out_dir = REPO_ROOT / args.out
@@ -549,6 +561,11 @@ def main() -> int:
     print(f"Wrote {summary_path.relative_to(REPO_ROOT)}")
     print(f"Wrote {diagram_path.relative_to(REPO_ROOT)}")
     print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}")
+
+    if args.sync_comms:
+        comms_dir = REPO_ROOT / "services" / "comms" / "knowledge"
+        _sync_tree(out_dir, comms_dir)
+        print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {comms_dir.relative_to(REPO_ROOT)}")
     return 0
 
 
diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 7aedf4a0..70844ebf 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-8
+        checksum/atlasbot-configmap: manual-atlasbot-9
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
@@ -73,6 +73,8 @@ spec:
               value: /kb
             - name: VM_URL
               value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428
+            - name: ARIADNE_STATE_URL
+              value: http://ariadne.maintenance.svc.cluster.local/api/internal/cluster/state
             - name: BOT_USER
               value: atlasbot
             - name: BOT_MENTIONS
diff --git a/services/comms/knowledge/catalog/atlas-summary.json b/services/comms/knowledge/catalog/atlas-summary.json
index fa350516..ea825ce7 100644
--- a/services/comms/knowledge/catalog/atlas-summary.json
+++ b/services/comms/knowledge/catalog/atlas-summary.json
@@ -1,8 +1,8 @@
 {
   "counts": {
-    "helmrelease_host_hints": 17,
-    "http_endpoints": 37,
-    "services": 43,
-    "workloads": 54
+    "helmrelease_host_hints": 19,
+    "http_endpoints": 45,
+    "services": 47,
+    "workloads": 74
   }
 }
diff --git a/services/comms/knowledge/catalog/atlas.json b/services/comms/knowledge/catalog/atlas.json
index 18cb6b64..21ac4073 100644
--- a/services/comms/knowledge/catalog/atlas.json
+++ b/services/comms/knowledge/catalog/atlas.json
@@ -11,6 +11,21 @@
       "path": "services/bstein-dev-home",
       "targetNamespace": "bstein-dev-home"
     },
+    {
+      "name": "bstein-dev-home-migrations",
+      "path": "services/bstein-dev-home/migrations",
+      "targetNamespace": "bstein-dev-home"
+    },
+    {
+      "name": "cert-manager",
+      "path": "infrastructure/cert-manager",
+      "targetNamespace": "cert-manager"
+    },
+    {
+      "name": "cert-manager-cleanup",
+      "path": "infrastructure/cert-manager/cleanup",
+      "targetNamespace": "cert-manager"
+    },
     {
       "name": "comms",
       "path": "services/comms",
@@ -26,6 +41,11 @@
       "path": "services/crypto",
       "targetNamespace": "crypto"
     },
+    {
+      "name": "finance",
+      "path": "services/finance",
+      "targetNamespace": "finance"
+    },
     {
       "name": "flux-system",
       "path": "clusters/atlas/flux-system",
@@ -46,6 +66,11 @@
       "path": "services/harbor",
       "targetNamespace": "harbor"
     },
+    {
+      "name": "health",
+      "path": "services/health",
+      "targetNamespace": "health"
+    },
     {
       "name": "helm",
       "path": "infrastructure/sources/helm",
@@ -71,6 +96,16 @@
       "path": "services/logging",
       "targetNamespace": null
     },
+    {
+      "name": "longhorn",
+      "path": "infrastructure/longhorn/core",
+      "targetNamespace": "longhorn-system"
+    },
+    {
+      "name": "longhorn-adopt",
+      "path": "infrastructure/longhorn/adopt",
+      "targetNamespace": "longhorn-system"
+    },
     {
       "name": "longhorn-ui",
       "path": "infrastructure/longhorn/ui-ingress",
@@ -161,11 +196,21 @@
       "path": "infrastructure/vault-csi",
       "targetNamespace": "kube-system"
     },
+    {
+      "name": "vault-injector",
+      "path": "infrastructure/vault-injector",
+      "targetNamespace": "vault"
+    },
     {
       "name": "vaultwarden",
       "path": "services/vaultwarden",
       "targetNamespace": "vaultwarden"
     },
+    {
+      "name": "wallet-monero-temp",
+      "path": "services/crypto/wallet-monero-temp",
+      "targetNamespace": "crypto"
+    },
     {
       "name": "xmr-miner",
       "path": "services/crypto/xmr-miner",
@@ -199,7 +244,7 @@
         "node-role.kubernetes.io/worker": "true"
       },
       "images": [
-        "registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92"
+        "registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-157"
       ]
     },
     {
@@ -215,7 +260,20 @@
         "node-role.kubernetes.io/worker": "true"
       },
       "images": [
-        "registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92"
+        "registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-157"
+      ]
+    },
+    {
+      "kind": "Deployment",
+      "namespace": "bstein-dev-home",
+      "name": "bstein-dev-home-vault-sync",
+      "labels": {
+        "app": "bstein-dev-home-vault-sync"
+      },
+      "serviceAccountName": "bstein-dev-home-vault-sync",
+      "nodeSelector": {},
+      "images": [
+        "alpine:3.20"
       ]
     },
     {
@@ -225,7 +283,7 @@
       "labels": {
         "app": "chat-ai-gateway"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "bstein-dev-home",
       "nodeSelector": {
         "kubernetes.io/arch": "arm64",
         "node-role.kubernetes.io/worker": "true"
@@ -249,6 +307,19 @@
         "python:3.11-slim"
       ]
     },
+    {
+      "kind": "Deployment",
+      "namespace": "comms",
+      "name": "comms-vault-sync",
+      "labels": {
+        "app": "comms-vault-sync"
+      },
+      "serviceAccountName": "comms-vault",
+      "nodeSelector": {},
+      "images": [
+        "alpine:3.20"
+      ]
+    },
     {
       "kind": "Deployment",
       "namespace": "comms",
@@ -256,7 +327,7 @@
       "labels": {
         "app": "coturn"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "comms-vault",
       "nodeSelector": {
         "hardware": "rpi5"
       },
@@ -286,7 +357,7 @@
       "labels": {
         "app": "livekit"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "comms-vault",
       "nodeSelector": {
         "hardware": "rpi5"
       },
@@ -301,12 +372,12 @@
       "labels": {
         "app": "livekit-token-service"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "comms-vault",
       "nodeSelector": {
         "hardware": "rpi5"
       },
       "images": [
-        "ghcr.io/element-hq/lk-jwt-service:0.3.0"
+        "registry.bstein.dev/tools/lk-jwt-service-vault:0.3.0"
       ]
     },
     {
@@ -316,7 +387,7 @@
       "labels": {
         "app": "matrix-authentication-service"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "comms-vault",
       "nodeSelector": {
         "hardware": "rpi5"
       },
@@ -331,7 +402,7 @@
       "labels": {
         "app.kubernetes.io/name": "matrix-guest-register"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "comms-vault",
       "nodeSelector": {},
       "images": [
         "python:3.11-slim"
@@ -365,6 +436,19 @@
         "ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9"
       ]
     },
+    {
+      "kind": "Deployment",
+      "namespace": "crypto",
+      "name": "crypto-vault-sync",
+      "labels": {
+        "app": "crypto-vault-sync"
+      },
+      "serviceAccountName": "crypto-vault-sync",
+      "nodeSelector": {},
+      "images": [
+        "alpine:3.20"
+      ]
+    },
     {
       "kind": "Deployment",
       "namespace": "crypto",
@@ -372,7 +456,7 @@
       "labels": {
         "app": "monero-p2pool"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "crypto-vault-sync",
       "nodeSelector": {
         "node-role.kubernetes.io/worker": "true"
       },
@@ -395,6 +479,53 @@
         "registry.bstein.dev/crypto/monerod:0.18.4.1"
       ]
     },
+    {
+      "kind": "Deployment",
+      "namespace": "crypto",
+      "name": "wallet-monero-temp",
+      "labels": {
+        "app": "wallet-monero-temp"
+      },
+      "serviceAccountName": "crypto-vault-sync",
+      "nodeSelector": {
+        "node-role.kubernetes.io/worker": "true"
+      },
+      "images": [
+        "registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1"
+      ]
+    },
+    {
+      "kind": "Deployment",
+      "namespace": "finance",
+      "name": "actual-budget",
+      "labels": {
+        "app": "actual-budget"
+      },
+      "serviceAccountName": "finance-vault",
+      "nodeSelector": {
+        "kubernetes.io/arch": "arm64",
+        "node-role.kubernetes.io/worker": "true"
+      },
+      "images": [
+        "actualbudget/actual-server:26.1.0-alpine@sha256:34aae5813fdfee12af2a50c4d0667df68029f1d61b90f45f282473273eb70d0d"
+      ]
+    },
+    {
+      "kind": "Deployment",
+      "namespace": "finance",
+      "name": "firefly",
+      "labels": {
+        "app": "firefly"
+      },
+      "serviceAccountName": "finance-vault",
+      "nodeSelector": {
+        "kubernetes.io/arch": "arm64",
+        "node-role.kubernetes.io/worker": "true"
+      },
+      "images": [
+        "fireflyiii/core:version-6.4.15"
+      ]
+    },
     {
       "kind": "Deployment",
       "namespace": "flux-system",
@@ -516,7 +647,7 @@
       "labels": {
         "app": "gitea"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "gitea-vault",
       "nodeSelector": {
         "node-role.kubernetes.io/worker": "true"
       },
@@ -524,6 +655,36 @@
         "gitea/gitea:1.23"
       ]
     },
+    {
+      "kind": "Deployment",
+      "namespace": "harbor",
+      "name": "harbor-vault-sync",
+      "labels": {
+        "app": "harbor-vault-sync"
+      },
+      "serviceAccountName": "harbor-vault-sync",
+      "nodeSelector": {},
+      "images": [
+        "alpine:3.20"
+      ]
+    },
+    {
+      "kind": "Deployment",
+      "namespace": "health",
+      "name": "wger",
+      "labels": {
+        "app": "wger"
+      },
+      "serviceAccountName": "health-vault-sync",
+      "nodeSelector": {
+        "kubernetes.io/arch": "arm64",
+        "node-role.kubernetes.io/worker": "true"
+      },
+      "images": [
+        "nginx:1.27.5-alpine@sha256:65645c7bb6a0661892a8b03b89d0743208a18dd2f3f17a54ef4b76fb8e2f2a10",
+        "wger/server@sha256:710588b78af4e0aa0b4d8a8061e4563e16eae80eeaccfe7f9e0d9cbdd7f0cbc5"
+      ]
+    },
     {
       "kind": "Deployment",
       "namespace": "jellyfin",
@@ -531,7 +692,7 @@
       "labels": {
         "app": "jellyfin"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "pegasus-vault-sync",
       "nodeSelector": {},
       "images": [
         "docker.io/jellyfin/jellyfin:10.11.5"
@@ -544,14 +705,27 @@
       "labels": {
         "app": "pegasus"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "pegasus-vault-sync",
       "nodeSelector": {
         "kubernetes.io/arch": "arm64",
         "node-role.kubernetes.io/worker": "true"
       },
       "images": [
         "alpine:3.20",
-        "registry.bstein.dev/streaming/pegasus:1.2.32"
+        "registry.bstein.dev/streaming/pegasus-vault:1.2.32"
+      ]
+    },
+    {
+      "kind": "Deployment",
+      "namespace": "jellyfin",
+      "name": "pegasus-vault-sync",
+      "labels": {
+        "app": "pegasus-vault-sync"
+      },
+      "serviceAccountName": "pegasus-vault-sync",
+      "nodeSelector": {},
+      "images": [
+        "alpine:3.20"
       ]
     },
     {
@@ -570,6 +744,35 @@
         "jenkins/jenkins:2.528.3-jdk21"
       ]
     },
+    {
+      "kind": "Deployment",
+      "namespace": "jenkins",
+      "name": "jenkins-vault-sync",
+      "labels": {
+        "app": "jenkins-vault-sync"
+      },
+      "serviceAccountName": "jenkins-vault-sync",
+      "nodeSelector": {
+        "kubernetes.io/arch": "arm64",
+        "node-role.kubernetes.io/worker": "true"
+      },
+      "images": [
+        "alpine:3.20"
+      ]
+    },
+    {
+      "kind": "DaemonSet",
+      "namespace": "kube-system",
+      "name": "ntp-sync",
+      "labels": {
+        "app": "ntp-sync"
+      },
+      "serviceAccountName": null,
+      "nodeSelector": {},
+      "images": [
+        "public.ecr.aws/docker/library/busybox:1.36.1"
+      ]
+    },
     {
       "kind": "DaemonSet",
       "namespace": "kube-system",
@@ -636,6 +839,21 @@
         "hashicorp/vault-csi-provider:1.7.0"
       ]
     },
+    {
+      "kind": "Deployment",
+      "namespace": "kube-system",
+      "name": "coredns",
+      "labels": {
+        "k8s-app": "kube-dns"
+      },
+      "serviceAccountName": "coredns",
+      "nodeSelector": {
+        "kubernetes.io/os": "linux"
+      },
+      "images": [
+        "registry.bstein.dev/infra/coredns:1.12.1"
+      ]
+    },
     {
       "kind": "DaemonSet",
       "namespace": "logging",
@@ -681,6 +899,19 @@
         "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131"
       ]
     },
+    {
+      "kind": "Deployment",
+      "namespace": "logging",
+      "name": "logging-vault-sync",
+      "labels": {
+        "app": "logging-vault-sync"
+      },
+      "serviceAccountName": "logging-vault-sync",
+      "nodeSelector": {},
+      "images": [
+        "alpine:3.20"
+      ]
+    },
     {
       "kind": "Deployment",
       "namespace": "logging",
@@ -688,12 +919,27 @@
       "labels": {
         "app": "oauth2-proxy-logs"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "logging-vault-sync",
       "nodeSelector": {
         "node-role.kubernetes.io/worker": "true"
       },
       "images": [
-        "quay.io/oauth2-proxy/oauth2-proxy:v7.6.0"
+        "registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0"
+      ]
+    },
+    {
+      "kind": "Deployment",
+      "namespace": "longhorn-system",
+      "name": "longhorn-vault-sync",
+      "labels": {
+        "app": "longhorn-vault-sync"
+      },
+      "serviceAccountName": "longhorn-vault-sync",
+      "nodeSelector": {
+        "node-role.kubernetes.io/worker": "true"
+      },
+      "images": [
+        "alpine:3.20"
       ]
     },
     {
@@ -703,7 +949,7 @@
       "labels": {
         "app": "oauth2-proxy-longhorn"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "longhorn-vault",
       "nodeSelector": {
         "node-role.kubernetes.io/worker": "true"
       },
@@ -729,14 +975,45 @@
     {
       "kind": "Deployment",
       "namespace": "mailu-mailserver",
-      "name": "mailu-sync-listener",
+      "name": "mailu-vault-sync",
       "labels": {
-        "app": "mailu-sync-listener"
+        "app": "mailu-vault-sync"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "mailu-vault-sync",
       "nodeSelector": {},
       "images": [
-        "python:3.11-alpine"
+        "alpine:3.20"
+      ]
+    },
+    {
+      "kind": "DaemonSet",
+      "namespace": "maintenance",
+      "name": "disable-k3s-traefik",
+      "labels": {
+        "app": "disable-k3s-traefik"
+      },
+      "serviceAccountName": "disable-k3s-traefik",
+      "nodeSelector": {
+        "node-role.kubernetes.io/control-plane": "true"
+      },
+      "images": [
+        "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131"
+      ]
+    },
+    {
+      "kind": "DaemonSet",
+      "namespace": "maintenance",
+      "name": "k3s-agent-restart",
+      "labels": {
+        "app": "k3s-agent-restart"
+      },
+      "serviceAccountName": "node-nofile",
+      "nodeSelector": {
+        "kubernetes.io/arch": "arm64",
+        "node-role.kubernetes.io/worker": "true"
+      },
+      "images": [
+        "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131"
       ]
     },
     {
@@ -767,6 +1044,35 @@
         "bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131"
       ]
     },
+    {
+      "kind": "Deployment",
+      "namespace": "maintenance",
+      "name": "ariadne",
+      "labels": {
+        "app": "ariadne"
+      },
+      "serviceAccountName": "ariadne",
+      "nodeSelector": {
+        "kubernetes.io/arch": "arm64",
+        "node-role.kubernetes.io/worker": "true"
+      },
+      "images": [
+        "registry.bstein.dev/bstein/ariadne:0.1.0-48"
+      ]
+    },
+    {
+      "kind": "Deployment",
+      "namespace": "maintenance",
+      "name": "maintenance-vault-sync",
+      "labels": {
+        "app": "maintenance-vault-sync"
+      },
+      "serviceAccountName": "maintenance-vault-sync",
+      "nodeSelector": {},
+      "images": [
+        "alpine:3.20"
+      ]
+    },
     {
       "kind": "DaemonSet",
       "namespace": "monitoring",
@@ -795,6 +1101,19 @@
         "python:3.10-slim"
       ]
     },
+    {
+      "kind": "Deployment",
+      "namespace": "monitoring",
+      "name": "monitoring-vault-sync",
+      "labels": {
+        "app": "monitoring-vault-sync"
+      },
+      "serviceAccountName": "monitoring-vault-sync",
+      "nodeSelector": {},
+      "images": [
+        "alpine:3.20"
+      ]
+    },
     {
       "kind": "Deployment",
       "namespace": "monitoring",
@@ -802,7 +1121,7 @@
       "labels": {
         "app": "postmark-exporter"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "monitoring-vault-sync",
       "nodeSelector": {},
       "images": [
         "python:3.12-alpine"
@@ -830,7 +1149,7 @@
       "labels": {
         "app": "nextcloud"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "nextcloud-vault",
       "nodeSelector": {
         "hardware": "rpi5"
       },
@@ -845,7 +1164,7 @@
       "labels": {
         "app": "outline"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "outline-vault",
       "nodeSelector": {
         "node-role.kubernetes.io/worker": "true"
       },
@@ -875,7 +1194,7 @@
       "labels": {
         "app": "planka"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "planka-vault",
       "nodeSelector": {
         "node-role.kubernetes.io/worker": "true"
       },
@@ -895,7 +1214,8 @@
         "node-role.kubernetes.io/worker": "true"
       },
       "images": [
-        "postgres:15"
+        "postgres:15",
+        "quay.io/prometheuscommunity/postgres-exporter:v0.15.0"
       ]
     },
     {
@@ -905,8 +1225,11 @@
       "labels": {
         "app": "keycloak"
       },
-      "serviceAccountName": null,
-      "nodeSelector": {},
+      "serviceAccountName": "sso-vault",
+      "nodeSelector": {
+        "kubernetes.io/arch": "arm64",
+        "node-role.kubernetes.io/worker": "true"
+      },
       "images": [
         "quay.io/keycloak/keycloak:26.0.7"
       ]
@@ -918,12 +1241,25 @@
       "labels": {
         "app": "oauth2-proxy"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "sso-vault",
       "nodeSelector": {
         "node-role.kubernetes.io/worker": "true"
       },
       "images": [
-        "quay.io/oauth2-proxy/oauth2-proxy:v7.6.0"
+        "registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0"
+      ]
+    },
+    {
+      "kind": "Deployment",
+      "namespace": "sso",
+      "name": "sso-vault-sync",
+      "labels": {
+        "app": "sso-vault-sync"
+      },
+      "serviceAccountName": "sso-vault-sync",
+      "nodeSelector": {},
+      "images": [
+        "alpine:3.20"
       ]
     },
     {
@@ -933,7 +1269,7 @@
       "labels": {
         "app": "openldap"
       },
-      "serviceAccountName": null,
+      "serviceAccountName": "sso-vault",
       "nodeSelector": {
         "kubernetes.io/arch": "arm64",
         "node-role.kubernetes.io/worker": "true"
@@ -951,7 +1287,7 @@
       },
       "serviceAccountName": "sui-metrics",
       "nodeSelector": {
-        "kubernetes.io/hostname": "titan-24"
+        "hardware": "rpi5"
       },
       "images": [
         "victoriametrics/vmagent:v1.103.0"
@@ -962,7 +1298,9 @@
       "namespace": "traefik",
       "name": "traefik",
       "labels": {
-        "app": "traefik"
+        "app": "traefik",
+        "app.kubernetes.io/instance": "traefik-kube-system",
+        "app.kubernetes.io/name": "traefik"
       },
       "serviceAccountName": "traefik-ingress-controller",
       "nodeSelector": {
@@ -995,8 +1333,11 @@
       "labels": {
         "app": "vaultwarden"
       },
-      "serviceAccountName": null,
-      "nodeSelector": {},
+      "serviceAccountName": "vaultwarden-vault",
+      "nodeSelector": {
+        "kubernetes.io/arch": "arm64",
+        "node-role.kubernetes.io/worker": "true"
+      },
       "images": [
         "vaultwarden/server:1.35.2"
       ]
@@ -1565,6 +1906,54 @@
         }
       ]
     },
+    {
+      "namespace": "crypto",
+      "name": "wallet-monero-temp",
+      "type": "ClusterIP",
+      "selector": {
+        "app": "wallet-monero-temp"
+      },
+      "ports": [
+        {
+          "name": "rpc",
+          "port": 18083,
+          "targetPort": 18083,
+          "protocol": "TCP"
+        }
+      ]
+    },
+    {
+      "namespace": "finance",
+      "name": "actual-budget",
+      "type": "ClusterIP",
+      "selector": {
+        "app": "actual-budget"
+      },
+      "ports": [
+        {
+          "name": "http",
+          "port": 80,
+          "targetPort": 5006,
+          "protocol": "TCP"
+        }
+      ]
+    },
+    {
+      "namespace": "finance",
+      "name": "firefly",
+      "type": "ClusterIP",
+      "selector": {
+        "app": "firefly"
+      },
+      "ports": [
+        {
+          "name": "http",
+          "port": 80,
+          "targetPort": 8080,
+          "protocol": "TCP"
+        }
+      ]
+    },
     {
       "namespace": "flux-system",
       "name": "notification-controller",
@@ -1632,7 +2021,7 @@
     {
       "namespace": "gitea",
       "name": "gitea-ssh",
-      "type": "NodePort",
+      "type": "LoadBalancer",
       "selector": {
         "app": "gitea"
       },
@@ -1645,6 +2034,22 @@
         }
       ]
     },
+    {
+      "namespace": "health",
+      "name": "wger",
+      "type": "ClusterIP",
+      "selector": {
+        "app": "wger"
+      },
+      "ports": [
+        {
+          "name": "http",
+          "port": 80,
+          "targetPort": "http",
+          "protocol": "TCP"
+        }
+      ]
+    },
     {
       "namespace": "jellyfin",
       "name": "jellyfin",
@@ -1699,29 +2104,6 @@
         }
       ]
     },
-    {
-      "namespace": "kube-system",
-      "name": "traefik",
-      "type": "LoadBalancer",
-      "selector": {
-        "app.kubernetes.io/instance": "traefik-kube-system",
-        "app.kubernetes.io/name": "traefik"
-      },
-      "ports": [
-        {
-          "name": "web",
-          "port": 80,
-          "targetPort": "web",
-          "protocol": "TCP"
-        },
-        {
-          "name": "websecure",
-          "port": 443,
-          "targetPort": "websecure",
-          "protocol": "TCP"
-        }
-      ]
-    },
     {
       "namespace": "logging",
       "name": "oauth2-proxy-logs",
@@ -1803,17 +2185,17 @@
       ]
     },
     {
-      "namespace": "mailu-mailserver",
-      "name": "mailu-sync-listener",
+      "namespace": "maintenance",
+      "name": "ariadne",
       "type": "ClusterIP",
       "selector": {
-        "app": "mailu-sync-listener"
+        "app": "ariadne"
       },
       "ports": [
         {
           "name": "http",
-          "port": 8080,
-          "targetPort": 8080,
+          "port": 80,
+          "targetPort": "http",
           "protocol": "TCP"
         }
       ]
@@ -1959,6 +2341,12 @@
           "port": 5432,
           "targetPort": 5432,
           "protocol": "TCP"
+        },
+        {
+          "name": "metrics",
+          "port": 9187,
+          "targetPort": 9187,
+          "protocol": "TCP"
         }
       ]
     },
@@ -2032,6 +2420,28 @@
         }
       ]
     },
+    {
+      "namespace": "traefik",
+      "name": "traefik",
+      "type": "LoadBalancer",
+      "selector": {
+        "app": "traefik"
+      },
+      "ports": [
+        {
+          "name": "web",
+          "port": 80,
+          "targetPort": "web",
+          "protocol": "TCP"
+        },
+        {
+          "name": "websecure",
+          "port": 443,
+          "targetPort": "websecure",
+          "protocol": "TCP"
+        }
+      ]
+    },
     {
       "namespace": "traefik",
       "name": "traefik-metrics",
@@ -2210,6 +2620,26 @@
         "source": "bstein-dev-home"
       }
     },
+    {
+      "host": "budget.bstein.dev",
+      "path": "/",
+      "backend": {
+        "namespace": "finance",
+        "service": "actual-budget",
+        "port": 80,
+        "workloads": [
+          {
+            "kind": "Deployment",
+            "name": "actual-budget"
+          }
+        ]
+      },
+      "via": {
+        "kind": "Ingress",
+        "name": "actual-budget",
+        "source": "finance"
+      }
+    },
     {
       "host": "call.live.bstein.dev",
       "path": "/",
@@ -2290,6 +2720,26 @@
         "source": "nextcloud"
       }
     },
+    {
+      "host": "health.bstein.dev",
+      "path": "/",
+      "backend": {
+        "namespace": "health",
+        "service": "wger",
+        "port": 80,
+        "workloads": [
+          {
+            "kind": "Deployment",
+            "name": "wger"
+          }
+        ]
+      },
+      "via": {
+        "kind": "Ingress",
+        "name": "wger",
+        "source": "health"
+      }
+    },
     {
       "host": "kit.live.bstein.dev",
       "path": "/livekit/jwt",
@@ -2385,6 +2835,106 @@
         "source": "comms"
       }
     },
+    {
+      "host": "live.bstein.dev",
+      "path": "/_matrix/client/r0/register",
+      "backend": {
+        "namespace": "comms",
+        "service": "matrix-guest-register",
+        "port": 8080,
+        "workloads": [
+          {
+            "kind": "Deployment",
+            "name": "matrix-guest-register"
+          }
+        ]
+      },
+      "via": {
+        "kind": "Ingress",
+        "name": "matrix-routing",
+        "source": "comms"
+      }
+    },
+    {
+      "host": "live.bstein.dev",
+      "path": "/_matrix/client/v3/login",
+      "backend": {
+        "namespace": "comms",
+        "service": "matrix-authentication-service",
+        "port": 8080,
+        "workloads": [
+          {
+            "kind": "Deployment",
+            "name": "matrix-authentication-service"
+          }
+        ]
+      },
+      "via": {
+        "kind": "Ingress",
+        "name": "matrix-routing",
+        "source": "comms"
+      }
+    },
+    {
+      "host": "live.bstein.dev",
+      "path": "/_matrix/client/v3/logout",
+      "backend": {
+        "namespace": "comms",
+        "service": "matrix-authentication-service",
+        "port": 8080,
+        "workloads": [
+          {
+            "kind": "Deployment",
+            "name": "matrix-authentication-service"
+          }
+        ]
+      },
+      "via": {
+        "kind": "Ingress",
+        "name": "matrix-routing",
+        "source": "comms"
+      }
+    },
+    {
+      "host": "live.bstein.dev",
+      "path": "/_matrix/client/v3/refresh",
+      "backend": {
+        "namespace": "comms",
+        "service": "matrix-authentication-service",
+        "port": 8080,
+        "workloads": [
+          {
+            "kind": "Deployment",
+            "name": "matrix-authentication-service"
+          }
+        ]
+      },
+      "via": {
+        "kind": "Ingress",
+        "name": "matrix-routing",
+        "source": "comms"
+      }
+    },
+    {
+      "host": "live.bstein.dev",
+      "path": "/_matrix/client/v3/register",
+      "backend": {
+        "namespace": "comms",
+        "service": "matrix-guest-register",
+        "port": 8080,
+        "workloads": [
+          {
+            "kind": "Deployment",
+            "name": "matrix-guest-register"
+          }
+        ]
+      },
+      "via": {
+        "kind": "Ingress",
+        "name": "matrix-routing",
+        "source": "comms"
+      }
+    },
     {
       "host": "logs.bstein.dev",
       "path": "/",
@@ -2650,6 +3200,26 @@
         "source": "monerod"
       }
     },
+    {
+      "host": "money.bstein.dev",
+      "path": "/",
+      "backend": {
+        "namespace": "finance",
+        "service": "firefly",
+        "port": 80,
+        "workloads": [
+          {
+            "kind": "Deployment",
+            "name": "firefly"
+          }
+        ]
+      },
+      "via": {
+        "kind": "Ingress",
+        "name": "firefly",
+        "source": "finance"
+      }
+    },
     {
       "host": "notes.bstein.dev",
       "path": "/",
@@ -2838,7 +3408,6 @@
       "matrix.live.bstein.dev"
     ],
     "comms:comms/othrys-synapse": [
-      "bstein.dev",
       "kit.live.bstein.dev",
       "live.bstein.dev",
       "matrix.live.bstein.dev",
@@ -2853,6 +3422,9 @@
     "logging:logging/data-prepper": [
       "registry.bstein.dev"
     ],
+    "longhorn:longhorn-system/longhorn": [
+      "registry.bstein.dev"
+    ],
     "mailu:mailu-mailserver/mailu": [
       "bstein.dev",
       "mail.bstein.dev"
@@ -2862,8 +3434,12 @@
     ],
     "monitoring:monitoring/grafana": [
       "bstein.dev",
+      "mail.bstein.dev",
       "metrics.bstein.dev",
       "sso.bstein.dev"
+    ],
+    "monitoring:monitoring/kube-state-metrics": [
+      "atlas.bstein.dev"
     ]
   }
 }
diff --git a/services/comms/knowledge/catalog/atlas.yaml b/services/comms/knowledge/catalog/atlas.yaml
index 67f2fcb2..b3b0119f 100644
--- a/services/comms/knowledge/catalog/atlas.yaml
+++ b/services/comms/knowledge/catalog/atlas.yaml
@@ -1,4 +1,4 @@
-# services/comms/knowledge/catalog/atlas.yaml
+# knowledge/catalog/atlas.yaml
 # Generated by scripts/knowledge_render_atlas.py (do not edit by hand)
 cluster: atlas
 sources:
@@ -8,6 +8,15 @@ sources:
 - name: bstein-dev-home
   path: services/bstein-dev-home
   targetNamespace: bstein-dev-home
+- name: bstein-dev-home-migrations
+  path: services/bstein-dev-home/migrations
+  targetNamespace: bstein-dev-home
+- name: cert-manager
+  path: infrastructure/cert-manager
+  targetNamespace: cert-manager
+- name: cert-manager-cleanup
+  path: infrastructure/cert-manager/cleanup
+  targetNamespace: cert-manager
 - name: comms
   path: services/comms
   targetNamespace: comms
@@ -17,6 +26,9 @@ sources:
 - name: crypto
   path: services/crypto
   targetNamespace: crypto
+- name: finance
+  path: services/finance
+  targetNamespace: finance
 - name: flux-system
   path: clusters/atlas/flux-system
   targetNamespace: null
@@ -29,6 +41,9 @@ sources:
 - name: harbor
   path: services/harbor
   targetNamespace: harbor
+- name: health
+  path: services/health
+  targetNamespace: health
 - name: helm
   path: infrastructure/sources/helm
   targetNamespace: flux-system
@@ -44,6 +59,12 @@ sources:
 - name: logging
   path: services/logging
   targetNamespace: null
+- name: longhorn
+  path: infrastructure/longhorn/core
+  targetNamespace: longhorn-system
+- name: longhorn-adopt
+  path: infrastructure/longhorn/adopt
+  targetNamespace: longhorn-system
 - name: longhorn-ui
   path: infrastructure/longhorn/ui-ingress
   targetNamespace: longhorn-system
@@ -98,9 +119,15 @@ sources:
 - name: vault-csi
   path: infrastructure/vault-csi
   targetNamespace: kube-system
+- name: vault-injector
+  path: infrastructure/vault-injector
+  targetNamespace: vault
 - name: vaultwarden
   path: services/vaultwarden
   targetNamespace: vaultwarden
+- name: wallet-monero-temp
+  path: services/crypto/wallet-monero-temp
+  targetNamespace: crypto
 - name: xmr-miner
   path: services/crypto/xmr-miner
   targetNamespace: crypto
@@ -124,7 +151,7 @@ workloads:
     kubernetes.io/arch: arm64
     node-role.kubernetes.io/worker: 'true'
   images:
-  - registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92
+  - registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-157
 - kind: Deployment
   namespace: bstein-dev-home
   name: bstein-dev-home-frontend
@@ -135,13 +162,22 @@ workloads:
     kubernetes.io/arch: arm64
     node-role.kubernetes.io/worker: 'true'
   images:
-  - registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92
+  - registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-157
+- kind: Deployment
+  namespace: bstein-dev-home
+  name: bstein-dev-home-vault-sync
+  labels:
+    app: bstein-dev-home-vault-sync
+  serviceAccountName: bstein-dev-home-vault-sync
+  nodeSelector: {}
+  images:
+  - alpine:3.20
 - kind: Deployment
   namespace: bstein-dev-home
   name: chat-ai-gateway
   labels:
     app: chat-ai-gateway
-  serviceAccountName: null
+  serviceAccountName: bstein-dev-home
   nodeSelector:
     kubernetes.io/arch: arm64
     node-role.kubernetes.io/worker: 'true'
@@ -157,12 +193,21 @@ workloads:
     hardware: rpi5
   images:
   - python:3.11-slim
+- kind: Deployment
+  namespace: comms
+  name: comms-vault-sync
+  labels:
+    app: comms-vault-sync
+  serviceAccountName: comms-vault
+  nodeSelector: {}
+  images:
+  - alpine:3.20
 - kind: Deployment
   namespace: comms
   name: coturn
   labels:
     app: coturn
-  serviceAccountName: null
+  serviceAccountName: comms-vault
   nodeSelector:
     hardware: rpi5
   images:
@@ -182,7 +227,7 @@ workloads:
   name: livekit
   labels:
     app: livekit
-  serviceAccountName: null
+  serviceAccountName: comms-vault
   nodeSelector:
     hardware: rpi5
   images:
@@ -192,17 +237,17 @@ workloads:
   name: livekit-token-service
   labels:
     app: livekit-token-service
-  serviceAccountName: null
+  serviceAccountName: comms-vault
   nodeSelector:
     hardware: rpi5
   images:
-  - ghcr.io/element-hq/lk-jwt-service:0.3.0
+  - registry.bstein.dev/tools/lk-jwt-service-vault:0.3.0
 - kind: Deployment
   namespace: comms
   name: matrix-authentication-service
   labels:
     app: matrix-authentication-service
-  serviceAccountName: null
+  serviceAccountName: comms-vault
   nodeSelector:
     hardware: rpi5
   images:
@@ -212,7 +257,7 @@ workloads:
   name: matrix-guest-register
   labels:
     app.kubernetes.io/name: matrix-guest-register
-  serviceAccountName: null
+  serviceAccountName: comms-vault
   nodeSelector: {}
   images:
   - python:3.11-slim
@@ -235,12 +280,21 @@ workloads:
     node-role.kubernetes.io/worker: 'true'
   images:
   - ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9
+- kind: Deployment
+  namespace: crypto
+  name: crypto-vault-sync
+  labels:
+    app: crypto-vault-sync
+  serviceAccountName: crypto-vault-sync
+  nodeSelector: {}
+  images:
+  - alpine:3.20
 - kind: Deployment
   namespace: crypto
   name: monero-p2pool
   labels:
     app: monero-p2pool
-  serviceAccountName: null
+  serviceAccountName: crypto-vault-sync
   nodeSelector:
     node-role.kubernetes.io/worker: 'true'
   images:
@@ -255,6 +309,38 @@ workloads:
     node-role.kubernetes.io/worker: 'true'
   images:
   - registry.bstein.dev/crypto/monerod:0.18.4.1
+- kind: Deployment
+  namespace: crypto
+  name: wallet-monero-temp
+  labels:
+    app: wallet-monero-temp
+  serviceAccountName: crypto-vault-sync
+  nodeSelector:
+    node-role.kubernetes.io/worker: 'true'
+  images:
+  - registry.bstein.dev/crypto/monero-wallet-rpc:0.18.4.1
+- kind: Deployment
+  namespace: finance
+  name: actual-budget
+  labels:
+    app: actual-budget
+  serviceAccountName: finance-vault
+  nodeSelector:
+    kubernetes.io/arch: arm64
+    node-role.kubernetes.io/worker: 'true'
+  images:
+  - actualbudget/actual-server:26.1.0-alpine@sha256:34aae5813fdfee12af2a50c4d0667df68029f1d61b90f45f282473273eb70d0d
+- kind: Deployment
+  namespace: finance
+  name: firefly
+  labels:
+    app: firefly
+  serviceAccountName: finance-vault
+  nodeSelector:
+    kubernetes.io/arch: arm64
+    node-role.kubernetes.io/worker: 'true'
+  images:
+  - fireflyiii/core:version-6.4.15
 - kind: Deployment
   namespace: flux-system
   name: helm-controller
@@ -344,17 +430,38 @@ workloads:
   name: gitea
   labels:
     app: gitea
-  serviceAccountName: null
+  serviceAccountName: gitea-vault
   nodeSelector:
     node-role.kubernetes.io/worker: 'true'
   images:
   - gitea/gitea:1.23
+- kind: Deployment
+  namespace: harbor
+  name: harbor-vault-sync
+  labels:
+    app: harbor-vault-sync
+  serviceAccountName: harbor-vault-sync
+  nodeSelector: {}
+  images:
+  - alpine:3.20
+- kind: Deployment
+  namespace: health
+  name: wger
+  labels:
+    app: wger
+  serviceAccountName: health-vault-sync
+  nodeSelector:
+    kubernetes.io/arch: arm64
+    node-role.kubernetes.io/worker: 'true'
+  images:
+  - nginx:1.27.5-alpine@sha256:65645c7bb6a0661892a8b03b89d0743208a18dd2f3f17a54ef4b76fb8e2f2a10
+  - wger/server@sha256:710588b78af4e0aa0b4d8a8061e4563e16eae80eeaccfe7f9e0d9cbdd7f0cbc5
 - kind: Deployment
   namespace: jellyfin
   name: jellyfin
   labels:
     app: jellyfin
-  serviceAccountName: null
+  serviceAccountName: pegasus-vault-sync
   nodeSelector: {}
   images:
   - docker.io/jellyfin/jellyfin:10.11.5
@@ -363,13 +470,22 @@ workloads:
   name: pegasus
   labels:
     app: pegasus
-  serviceAccountName: null
+  serviceAccountName: pegasus-vault-sync
   nodeSelector:
     kubernetes.io/arch: arm64
     node-role.kubernetes.io/worker: 'true'
   images:
   - alpine:3.20
-  - registry.bstein.dev/streaming/pegasus:1.2.32
+  - registry.bstein.dev/streaming/pegasus-vault:1.2.32
+- kind: Deployment
+  namespace: jellyfin
+  name: pegasus-vault-sync
+  labels:
+    app: pegasus-vault-sync
+  serviceAccountName: pegasus-vault-sync
+  nodeSelector: {}
+  images:
+  - alpine:3.20
 - kind: Deployment
   namespace: jenkins
   name: jenkins
@@ -381,6 +497,26 @@ workloads:
     node-role.kubernetes.io/worker: 'true'
   images:
   - jenkins/jenkins:2.528.3-jdk21
+- kind: Deployment
+  namespace: jenkins
+  name: jenkins-vault-sync
+  labels:
+    app: jenkins-vault-sync
+  serviceAccountName: jenkins-vault-sync
+  nodeSelector:
+    kubernetes.io/arch: arm64
+    node-role.kubernetes.io/worker: 'true'
+  images:
+  - alpine:3.20
+- kind: DaemonSet
+  namespace: kube-system
+  name: ntp-sync
+  labels:
+    app: ntp-sync
+  serviceAccountName: null
+  nodeSelector: {}
+  images:
+  - public.ecr.aws/docker/library/busybox:1.36.1
 - kind: DaemonSet
   namespace: kube-system
   name: nvidia-device-plugin-jetson
@@ -427,6 +563,16 @@ workloads:
     kubernetes.io/os: linux
   images:
   - hashicorp/vault-csi-provider:1.7.0
+- kind: Deployment
+  namespace: kube-system
+  name: coredns
+  labels:
+    k8s-app: kube-dns
+  serviceAccountName: coredns
+  nodeSelector:
+    kubernetes.io/os: linux
+  images:
+  - registry.bstein.dev/infra/coredns:1.12.1
 - kind: DaemonSet
   namespace: logging
   name: node-image-gc-rpi4
@@ -457,22 +603,41 @@ workloads:
     hardware: rpi5
   images:
   - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
+- kind: Deployment
+  namespace: logging
+  name: logging-vault-sync
+  labels:
+    app: logging-vault-sync
+  serviceAccountName: logging-vault-sync
+  nodeSelector: {}
+  images:
+  - alpine:3.20
 - kind: Deployment
   namespace: logging
   name: oauth2-proxy-logs
   labels:
     app: oauth2-proxy-logs
-  serviceAccountName: null
+  serviceAccountName: logging-vault-sync
   nodeSelector:
     node-role.kubernetes.io/worker: 'true'
   images:
-  - quay.io/oauth2-proxy/oauth2-proxy:v7.6.0
+  - registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0
+- kind: Deployment
+  namespace: longhorn-system
+  name: longhorn-vault-sync
+  labels:
+    app: longhorn-vault-sync
+  serviceAccountName: longhorn-vault-sync
+  nodeSelector:
+    node-role.kubernetes.io/worker: 'true'
+  images:
+  - alpine:3.20
 - kind: Deployment
   namespace: longhorn-system
   name: oauth2-proxy-longhorn
   labels:
     app: oauth2-proxy-longhorn
-  serviceAccountName: null
+  serviceAccountName: longhorn-vault
   nodeSelector:
     node-role.kubernetes.io/worker: 'true'
   images:
@@ -489,13 +654,34 @@ workloads:
   - registry.bstein.dev/bstein/kubectl:1.35.0
 - kind: Deployment
   namespace: mailu-mailserver
-  name: mailu-sync-listener
+  name: mailu-vault-sync
   labels:
-    app: mailu-sync-listener
-  serviceAccountName: null
+    app: mailu-vault-sync
+  serviceAccountName: mailu-vault-sync
   nodeSelector: {}
   images:
-  - python:3.11-alpine
+  - alpine:3.20
+- kind: DaemonSet
+  namespace: maintenance
+  name: disable-k3s-traefik
+  labels:
+    app: disable-k3s-traefik
+  serviceAccountName: disable-k3s-traefik
+  nodeSelector:
+    node-role.kubernetes.io/control-plane: 'true'
+  images:
+  - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
+- kind: DaemonSet
+  namespace: maintenance
+  name: k3s-agent-restart
+  labels:
+    app: k3s-agent-restart
+  serviceAccountName: node-nofile
+  nodeSelector:
+    kubernetes.io/arch: arm64
+    node-role.kubernetes.io/worker: 'true'
+  images:
+  - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
 - kind: DaemonSet
   namespace: maintenance
   name: node-image-sweeper
@@ -515,6 +701,26 @@ workloads:
   nodeSelector: {}
   images:
   - bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
+- kind: Deployment
+  namespace: maintenance
+  name: ariadne
+  labels:
+    app: ariadne
+  serviceAccountName: ariadne
+  nodeSelector:
+    kubernetes.io/arch: arm64
+    node-role.kubernetes.io/worker: 'true'
+  images:
+  - registry.bstein.dev/bstein/ariadne:0.1.0-48
+- kind: Deployment
+  namespace: maintenance
+  name: maintenance-vault-sync
+  labels:
+    app: maintenance-vault-sync
+  serviceAccountName: maintenance-vault-sync
+  nodeSelector: {}
+  images:
+  - alpine:3.20
 - kind: DaemonSet
   namespace: monitoring
   name: dcgm-exporter
@@ -534,12 +740,21 @@ workloads:
     jetson: 'true'
   images:
   - python:3.10-slim
+- kind: Deployment
+  namespace: monitoring
+  name: monitoring-vault-sync
+  labels:
+    app: monitoring-vault-sync
+  serviceAccountName: monitoring-vault-sync
+  nodeSelector: {}
+  images:
+  - alpine:3.20
 - kind: Deployment
   namespace: monitoring
   name: postmark-exporter
   labels:
     app: postmark-exporter
-  serviceAccountName: null
+  serviceAccountName: monitoring-vault-sync
   nodeSelector: {}
   images:
   - python:3.12-alpine
@@ -558,7 +773,7 @@ workloads:
   name: nextcloud
   labels:
     app: nextcloud
-  serviceAccountName: null
+  serviceAccountName: nextcloud-vault
   nodeSelector:
     hardware: rpi5
   images:
@@ -568,7 +783,7 @@ workloads:
   name: outline
   labels:
     app: outline
-  serviceAccountName: null
+  serviceAccountName: outline-vault
   nodeSelector:
     node-role.kubernetes.io/worker: 'true'
   images:
@@ -588,7 +803,7 @@ workloads:
   name: planka
   labels:
     app: planka
-  serviceAccountName: null
+  serviceAccountName: planka-vault
   nodeSelector:
     node-role.kubernetes.io/worker: 'true'
   images:
@@ -603,13 +818,16 @@ workloads:
     node-role.kubernetes.io/worker: 'true'
   images:
   - postgres:15
+  - quay.io/prometheuscommunity/postgres-exporter:v0.15.0
 - kind: Deployment
   namespace: sso
   name: keycloak
   labels:
     app: keycloak
-  serviceAccountName: null
-  nodeSelector: {}
+  serviceAccountName: sso-vault
+  nodeSelector:
+    kubernetes.io/arch: arm64
+    node-role.kubernetes.io/worker: 'true'
   images:
   - quay.io/keycloak/keycloak:26.0.7
 - kind: Deployment
@@ -617,17 +835,26 @@ workloads:
   name: oauth2-proxy
   labels:
     app: oauth2-proxy
-  serviceAccountName: null
+  serviceAccountName: sso-vault
   nodeSelector:
     node-role.kubernetes.io/worker: 'true'
   images:
-  - quay.io/oauth2-proxy/oauth2-proxy:v7.6.0
+  - registry.bstein.dev/tools/oauth2-proxy-vault:v7.6.0
+- kind: Deployment
+  namespace: sso
+  name: sso-vault-sync
+  labels:
+    app: sso-vault-sync
+  serviceAccountName: sso-vault-sync
+  nodeSelector: {}
+  images:
+  - alpine:3.20
 - kind: StatefulSet
   namespace: sso
   name: openldap
   labels:
     app: openldap
-  serviceAccountName: null
+  serviceAccountName: sso-vault
   nodeSelector:
     kubernetes.io/arch: arm64
     node-role.kubernetes.io/worker: 'true'
@@ -640,7 +867,7 @@ workloads:
     app: sui-metrics
   serviceAccountName: sui-metrics
   nodeSelector:
-    kubernetes.io/hostname: titan-24
+    hardware: rpi5
   images:
   - victoriametrics/vmagent:v1.103.0
 - kind: Deployment
@@ -648,6 +875,8 @@ workloads:
   name: traefik
   labels:
     app: traefik
+    app.kubernetes.io/instance: traefik-kube-system
+    app.kubernetes.io/name: traefik
   serviceAccountName: traefik-ingress-controller
   nodeSelector:
     node-role.kubernetes.io/worker: 'true'
@@ -669,8 +898,10 @@ workloads:
   name: vaultwarden
   labels:
     app: vaultwarden
-  serviceAccountName: null
-  nodeSelector: {}
+  serviceAccountName: vaultwarden-vault
+  nodeSelector:
+    kubernetes.io/arch: arm64
+    node-role.kubernetes.io/worker: 'true'
   images:
   - vaultwarden/server:1.35.2
 services:
@@ -1040,6 +1271,36 @@ services:
     port: 3333
     targetPort: 3333
     protocol: TCP
+- namespace: crypto
+  name: wallet-monero-temp
+  type: ClusterIP
+  selector:
+    app: wallet-monero-temp
+  ports:
+  - name: rpc
+    port: 18083
+    targetPort: 18083
+    protocol: TCP
+- namespace: finance
+  name: actual-budget
+  type: ClusterIP
+  selector:
+    app: actual-budget
+  ports:
+  - name: http
+    port: 80
+    targetPort: 5006
+    protocol: TCP
+- namespace: finance
+  name: firefly
+  type: ClusterIP
+  selector:
+    app: firefly
+  ports:
+  - name: http
+    port: 80
+    targetPort: 8080
+    protocol: TCP
 - namespace: flux-system
   name: notification-controller
   type: ClusterIP
@@ -1082,7 +1343,7 @@ services:
     protocol: TCP
 - namespace: gitea
   name: gitea-ssh
-  type: NodePort
+  type: LoadBalancer
   selector:
     app: gitea
   ports:
@@ -1090,6 +1351,16 @@ services:
     port: 2242
     targetPort: 2242
     protocol: TCP
+- namespace: health
+  name: wger
+  type: ClusterIP
+  selector:
+    app: wger
+  ports:
+  - name: http
+    port: 80
+    targetPort: http
+    protocol: TCP
 - namespace: jellyfin
   name: jellyfin
   type: ClusterIP
@@ -1124,21 +1395,6 @@ services:
     port: 50000
     targetPort: 50000
     protocol: TCP
-- namespace: kube-system
-  name: traefik
-  type: LoadBalancer
-  selector:
-    app.kubernetes.io/instance: traefik-kube-system
-    app.kubernetes.io/name: traefik
-  ports:
-  - name: web
-    port: 80
-    targetPort: web
-    protocol: TCP
-  - name: websecure
-    port: 443
-    targetPort: websecure
-    protocol: TCP
 - namespace: logging
   name: oauth2-proxy-logs
   type: ClusterIP
@@ -1191,15 +1447,15 @@ services:
     port: 4190
     targetPort: 4190
     protocol: TCP
-- namespace: mailu-mailserver
-  name: mailu-sync-listener
+- namespace: maintenance
+  name: ariadne
   type: ClusterIP
   selector:
-    app: mailu-sync-listener
+    app: ariadne
   ports:
   - name: http
-    port: 8080
-    targetPort: 8080
+    port: 80
+    targetPort: http
     protocol: TCP
 - namespace: monitoring
   name: dcgm-exporter
@@ -1291,6 +1547,10 @@ services:
     port: 5432
     targetPort: 5432
     protocol: TCP
+  - name: metrics
+    port: 9187
+    targetPort: 9187
+    protocol: TCP
 - namespace: sso
   name: keycloak
   type: ClusterIP
@@ -1335,6 +1595,20 @@ services:
     port: 8429
     targetPort: 8429
     protocol: TCP
+- namespace: traefik
+  name: traefik
+  type: LoadBalancer
+  selector:
+    app: traefik
+  ports:
+  - name: web
+    port: 80
+    targetPort: web
+    protocol: TCP
+  - name: websecure
+    port: 443
+    targetPort: websecure
+    protocol: TCP
 - namespace: traefik
   name: traefik-metrics
   type: ClusterIP
@@ -1447,6 +1721,19 @@ http_endpoints:
     kind: Ingress
     name: bstein-dev-home
     source: bstein-dev-home
+- host: budget.bstein.dev
+  path: /
+  backend:
+    namespace: finance
+    service: actual-budget
+    port: 80
+    workloads:
+    - kind: Deployment
+      name: actual-budget
+  via:
+    kind: Ingress
+    name: actual-budget
+    source: finance
 - host: call.live.bstein.dev
   path: /
   backend:
@@ -1499,6 +1786,19 @@ http_endpoints:
     kind: Ingress
     name: nextcloud
     source: nextcloud
+- host: health.bstein.dev
+  path: /
+  backend:
+    namespace: health
+    service: wger
+    port: 80
+    workloads:
+    - kind: Deployment
+      name: wger
+  via:
+    kind: Ingress
+    name: wger
+    source: health
 - host: kit.live.bstein.dev
   path: /livekit/jwt
   backend:
@@ -1558,6 +1858,65 @@ http_endpoints:
     kind: Ingress
     name: matrix-routing
     source: comms
+- host: live.bstein.dev
+  path: /_matrix/client/r0/register
+  backend:
+    namespace: comms
+    service: matrix-guest-register
+    port: 8080
+    workloads: &id003
+    - kind: Deployment
+      name: matrix-guest-register
+  via:
+    kind: Ingress
+    name: matrix-routing
+    source: comms
+- host: live.bstein.dev
+  path: /_matrix/client/v3/login
+  backend:
+    namespace: comms
+    service: matrix-authentication-service
+    port: 8080
+    workloads: &id002
+    - kind: Deployment
+      name: matrix-authentication-service
+  via:
+    kind: Ingress
+    name: matrix-routing
+    source: comms
+- host: live.bstein.dev
+  path: /_matrix/client/v3/logout
+  backend:
+    namespace: comms
+    service: matrix-authentication-service
+    port: 8080
+    workloads: *id002
+  via:
+    kind: Ingress
+    name: matrix-routing
+    source: comms
+- host: live.bstein.dev
+  path: /_matrix/client/v3/refresh
+  backend:
+    namespace: comms
+    service: matrix-authentication-service
+    port: 8080
+    workloads: *id002
+  via:
+    kind: Ingress
+    name: matrix-routing
+    source: comms
+- host: live.bstein.dev
+  path: /_matrix/client/v3/register
+  backend:
+    namespace: comms
+    service: matrix-guest-register
+    port: 8080
+    workloads: *id003
+  via:
+    kind: Ingress
+    name: matrix-routing
+    source: comms
 - host: logs.bstein.dev
   path: /
   backend:
@@ -1601,9 +1960,7 @@ http_endpoints:
     namespace: comms
     service: matrix-authentication-service
     port: 8080
-    workloads: &id002
-    - kind: Deployment
-      name: matrix-authentication-service
+    workloads: *id002
   via:
     kind: Ingress
     name: matrix-routing
@@ -1647,9 +2004,7 @@ http_endpoints:
     namespace: comms
     service: matrix-guest-register
     port: 8080
-    workloads: &id003
-    - kind: Deployment
-      name: matrix-guest-register
+    workloads: *id003
   via:
     kind: Ingress
     name: matrix-routing
@@ -1722,6 +2077,19 @@ http_endpoints:
     kind: Ingress
     name: monerod
     source: monerod
+- host: money.bstein.dev
+  path: /
+  backend:
+    namespace: finance
+    service: firefly
+    port: 80
+    workloads:
+    - kind: Deployment
+      name: firefly
+  via:
+    kind: Ingress
+    name: firefly
+    source: finance
 - host: notes.bstein.dev
   path: /
   backend:
@@ -1845,7 +2213,6 @@ helmrelease_host_hints:
   - live.bstein.dev
   - matrix.live.bstein.dev
   comms:comms/othrys-synapse:
-  - bstein.dev
   - kit.live.bstein.dev
   - live.bstein.dev
   - matrix.live.bstein.dev
@@ -1856,6 +2223,8 @@ helmrelease_host_hints:
   - registry.bstein.dev
   logging:logging/data-prepper:
   - registry.bstein.dev
+  longhorn:longhorn-system/longhorn:
+  - registry.bstein.dev
   mailu:mailu-mailserver/mailu:
   - bstein.dev
   - mail.bstein.dev
@@ -1863,5 +2232,8 @@ helmrelease_host_hints:
   - alerts.bstein.dev
   monitoring:monitoring/grafana:
   - bstein.dev
+  - mail.bstein.dev
   - metrics.bstein.dev
   - sso.bstein.dev
+  monitoring:monitoring/kube-state-metrics:
+  - atlas.bstein.dev
diff --git a/services/comms/knowledge/catalog/runbooks.json b/services/comms/knowledge/catalog/runbooks.json
index d7356ca5..0718562b 100644
--- a/services/comms/knowledge/catalog/runbooks.json
+++ b/services/comms/knowledge/catalog/runbooks.json
@@ -20,6 +20,22 @@
     ],
     "body": "# CI: Gitea \u2192 Jenkins pipeline\n\n## What this is\nAtlas uses Gitea for source control and Jenkins for CI. Authentication is via Keycloak (SSO).\n\n## Where it is configured\n- Gitea manifests: `services/gitea/`\n- Jenkins manifests: `services/jenkins/`\n- Credential sync helpers: `scripts/gitea_cred_sync.sh`, `scripts/jenkins_cred_sync.sh`\n\n## What users do (typical flow)\n- Create a repo in Gitea.\n- Create/update a Jenkins job/pipeline that can fetch the repo.\n- Configure a webhook (or SCM polling) so pushes trigger builds.\n\n## Troubleshooting (common)\n- \u201cWebhook not firing\u201d: confirm ingress host, webhook URL, and Jenkins job is reachable.\n- \u201cAuth denied cloning\u201d: confirm Keycloak group membership and that Jenkins has a valid token/credential configured."
   },
+  {
+    "path": "runbooks/comms-verify.md",
+    "title": "Othrys verification checklist",
+    "tags": [
+      "comms",
+      "matrix",
+      "element",
+      "livekit"
+    ],
+    "entrypoints": [
+      "https://live.bstein.dev",
+      "https://matrix.live.bstein.dev"
+    ],
+    "source_paths": [],
+    "body": "1) Guest join:\n- Open a private window and visit:\n  `https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join`\n- Confirm the guest join flow works and the displayname becomes `<word>-<word>`.\n\n2) Keycloak login:\n- Log in from `https://live.bstein.dev` and confirm MAS -> Keycloak -> Element redirect.\n\n3) Video rooms:\n- Start an Element Call room and confirm audio/video with a second account.\n- Check that guests can read public rooms but cannot start calls.\n\n4) Well-known:\n- `https://live.bstein.dev/.well-known/matrix/client` returns JSON.\n- `https://matrix.live.bstein.dev/.well-known/matrix/client` returns JSON.\n\n5) TURN reachability:\n- Confirm `turn.live.bstein.dev:3478` and `turns:5349` are reachable from WAN."
+  },
   {
     "path": "runbooks/kb-authoring.md",
     "title": "KB authoring: what to write (and what not to)",
diff --git a/services/comms/knowledge/diagrams/atlas-http.mmd b/services/comms/knowledge/diagrams/atlas-http.mmd
index ab7c3621..1aa7ac80 100644
--- a/services/comms/knowledge/diagrams/atlas-http.mmd
+++ b/services/comms/knowledge/diagrams/atlas-http.mmd
@@ -17,6 +17,11 @@ flowchart LR
   host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend
   wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"]
   svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend
+  host_budget_bstein_dev["budget.bstein.dev"]
+  svc_finance_actual_budget["finance/actual-budget (Service)"]
+  host_budget_bstein_dev --> svc_finance_actual_budget
+  wl_finance_actual_budget["finance/actual-budget (Deployment)"]
+  svc_finance_actual_budget --> wl_finance_actual_budget
   host_call_live_bstein_dev["call.live.bstein.dev"]
   svc_comms_element_call["comms/element-call (Service)"]
   host_call_live_bstein_dev --> svc_comms_element_call
@@ -37,6 +42,11 @@ flowchart LR
   host_cloud_bstein_dev --> svc_nextcloud_nextcloud
   wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"]
   svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud
+  host_health_bstein_dev["health.bstein.dev"]
+  svc_health_wger["health/wger (Service)"]
+  host_health_bstein_dev --> svc_health_wger
+  wl_health_wger["health/wger (Deployment)"]
+  svc_health_wger --> wl_health_wger
   host_kit_live_bstein_dev["kit.live.bstein.dev"]
   svc_comms_livekit_token_service["comms/livekit-token-service (Service)"]
   host_kit_live_bstein_dev --> svc_comms_livekit_token_service
@@ -50,6 +60,14 @@ flowchart LR
   host_live_bstein_dev --> svc_comms_matrix_wellknown
   svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"]
   host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
+  svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
+  host_live_bstein_dev --> svc_comms_matrix_guest_register
+  wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
+  svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
+  svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
+  host_live_bstein_dev --> svc_comms_matrix_authentication_service
+  wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
+  svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
   host_logs_bstein_dev["logs.bstein.dev"]
   svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"]
   host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs
@@ -64,21 +82,20 @@ flowchart LR
   svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"]
   host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front
   host_matrix_live_bstein_dev["matrix.live.bstein.dev"]
-  svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
   host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service
-  wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
-  svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
   host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown
   host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
-  svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
   host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register
-  wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
-  svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
   host_monero_bstein_dev["monero.bstein.dev"]
   svc_crypto_monerod["crypto/monerod (Service)"]
   host_monero_bstein_dev --> svc_crypto_monerod
   wl_crypto_monerod["crypto/monerod (Deployment)"]
   svc_crypto_monerod --> wl_crypto_monerod
+  host_money_bstein_dev["money.bstein.dev"]
+  svc_finance_firefly["finance/firefly (Service)"]
+  host_money_bstein_dev --> svc_finance_firefly
+  wl_finance_firefly["finance/firefly (Deployment)"]
+  svc_finance_firefly --> wl_finance_firefly
   host_notes_bstein_dev["notes.bstein.dev"]
   svc_outline_outline["outline/outline (Service)"]
   host_notes_bstein_dev --> svc_outline_outline
@@ -143,19 +160,29 @@ flowchart LR
     svc_comms_livekit
     wl_comms_livekit
     svc_comms_othrys_synapse_matrix_synapse
-    svc_comms_matrix_authentication_service
-    wl_comms_matrix_authentication_service
     svc_comms_matrix_guest_register
     wl_comms_matrix_guest_register
+    svc_comms_matrix_authentication_service
+    wl_comms_matrix_authentication_service
   end
   subgraph crypto[crypto]
     svc_crypto_monerod
     wl_crypto_monerod
   end
+  subgraph finance[finance]
+    svc_finance_actual_budget
+    wl_finance_actual_budget
+    svc_finance_firefly
+    wl_finance_firefly
+  end
   subgraph gitea[gitea]
     svc_gitea_gitea
     wl_gitea_gitea
   end
+  subgraph health[health]
+    svc_health_wger
+    wl_health_wger
+  end
   subgraph jellyfin[jellyfin]
     svc_jellyfin_pegasus
     wl_jellyfin_pegasus
diff --git a/services/comms/knowledge/metis.md b/services/comms/knowledge/metis.md
new file mode 100644
index 00000000..5b0d06be
--- /dev/null
+++ b/services/comms/knowledge/metis.md
@@ -0,0 +1,26 @@
+# Metis (node recovery)
+
+## Node classes (current map)
+- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)
+- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)
+- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks)
+- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent)
+- amd64 agents: titan-22/24 (Debian 13, k3s agent)
+- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, future titan-20/21 (when added), plus any newcomers.
+
+## Longhorn disk UUIDs (critical nodes)
+- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4)
+- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4)
+- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4)
+- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4)
+
+## Metis repo (~/Development/metis)
+- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`).
+- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints).
+- `AGENTS.md` in repo is untracked and holds raw notes.
+
+## Next implementation steps
+- Add per-class golden image refs and checksums (Harbor or file://) when ready.
+- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths.
+- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection.
+- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited.
diff --git a/services/comms/knowledge/runbooks/comms-verify.md b/services/comms/knowledge/runbooks/comms-verify.md
new file mode 100644
index 00000000..8c09d0af
--- /dev/null
+++ b/services/comms/knowledge/runbooks/comms-verify.md
@@ -0,0 +1,30 @@
+---
+title: Othrys verification checklist
+tags:
+  - comms
+  - matrix
+  - element
+  - livekit
+entrypoints:
+  - https://live.bstein.dev
+  - https://matrix.live.bstein.dev
+---
+
+1) Guest join:
+- Open a private window and visit:
+  `https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join`
+- Confirm the guest join flow works and the displayname becomes `<word>-<word>`.
+
+2) Keycloak login:
+- Log in from `https://live.bstein.dev` and confirm MAS -> Keycloak -> Element redirect.
+
+3) Video rooms:
+- Start an Element Call room and confirm audio/video with a second account.
+- Check that guests can read public rooms but cannot start calls.
+
+4) Well-known:
+- `https://live.bstein.dev/.well-known/matrix/client` returns JSON.
+- `https://matrix.live.bstein.dev/.well-known/matrix/client` returns JSON.
+
+5) TURN reachability:
+- Confirm `turn.live.bstein.dev:3478` and `turns:5349` are reachable from WAN.
diff --git a/services/comms/knowledge/software/metis.md b/services/comms/knowledge/software/metis.md
new file mode 100644
index 00000000..7ca3b399
--- /dev/null
+++ b/services/comms/knowledge/software/metis.md
@@ -0,0 +1,73 @@
+# Metis (node recovery)
+
+## Node classes (current map)
+- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11 (Ubuntu 24.04.3, k3s agent); titan-20/21 are Jetson agents, described below.
+- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)
+- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks)
+- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent)
+- amd64 agents: titan-22/24 (Debian 13, k3s agent)
+- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, plus any newcomers.
+
+### Jetson nodes (titan-20/21)
+- Ubuntu 20.04.6 (Focal), kernel 5.10.104-tegra, CRI containerd 2.0.5-k3s2, arch arm64.
+- Storage: NVMe 232G at / (ext4); onboard mmc partitions present but root on NVMe; 1.9T sda present (unused).
+- k3s agent with drop-in 99-nofile.conf.
+
+## Longhorn disk UUIDs (critical nodes)
+- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4)
+- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4)
+- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4)
+- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4)
+
+## Metis repo (~/Development/metis)
+- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`).
+- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints).
+- `AGENTS.md` in repo is untracked and holds raw notes.
+
+## Next implementation steps
+- Add per-class golden image refs and checksums (Harbor or file://) when ready.
+- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths.
+- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection.
+- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited.
+
+## Node OS/Kernel/CRI snapshot (Jan 2026)
+- titan-04: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64
+- titan-05: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
+- titan-06: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
+- titan-07: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
+- titan-08: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
+- titan-09: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64
+- titan-0a: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64
+- titan-0b: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64
+- titan-0c: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64
+- titan-10: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
+- titan-11: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
+- titan-12: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
+- titan-13: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
+- titan-14: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
+- titan-15: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
+- titan-17: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
+- titan-18: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
+- titan-19: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
+- titan-20: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64
+- titan-21: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64
+- titan-22: Debian 13 (trixie), kernel 6.12.41+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64
+- titan-24: Debian 13 (trixie), kernel 6.12.57+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64
+
+
+### External hosts
+- titan-db: Ubuntu 24.10, kernel 6.11.0-1015-raspi, root on /dev/sda2 ext4 (465G), boot vfat /dev/sda1; PostgreSQL service enabled.
+- titan-jh: Arch Linux ARM (rolling), kernel 6.18.4-2-rpi, NVMe root ext4 238G (/), boot vfat 512M; ~495 packages installed (pacman -Q).
+- titan-23/oceanus: TODO audit (future).
+
+
+### Control plane Pis (titan-0a/0b/0c)
+- Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, containerd 2.0.5-k3s2.
+- Storage: 477G SSD root (/dev/sda2 ext4), /boot/firmware vfat (/dev/sda1). fstab uses LABEL=writable and LABEL=system-boot.
+- k3s server (control-plane taint expected); etcd snapshots not yet cataloged (TODO).
+
+
+## k3s versions
+- rpi5 workers/control-plane: k3s v1.33.3+k3s1 (crictl v1.31.0-k3s2)
+- rpi4 nodes: k3s v1.31.5+k3s1 (crictl v1.31.0-k3s2)
+- Jetson titan-20/21: k3s v1.33.3+k3s1 (per node info), crictl v1.31.0-k3s2
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 6fb6bff0..e0776203 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -19,6 +19,8 @@ API_KEY = os.environ.get("CHAT_API_KEY", "")
 
 KB_DIR = os.environ.get("KB_DIR", "")
 VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428")
+ARIADNE_STATE_URL = os.environ.get("ARIADNE_STATE_URL", "")
+ARIADNE_STATE_TOKEN = os.environ.get("ARIADNE_STATE_TOKEN", "")
 
 BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{USER},atlas")
 SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev")
@@ -297,6 +299,21 @@ def k8s_get(path: str, timeout: int = 8) -> dict:
         raw = resp.read()
         return json.loads(raw.decode()) if raw else {}
 
+def _ariadne_state(timeout: int = 5) -> dict | None:  # Best-effort GET of the Ariadne state document; returns None when unavailable.
+    if not ARIADNE_STATE_URL:  # an empty/unset URL disables the lookup entirely
+        return None
+    headers = {}
+    if ARIADNE_STATE_TOKEN:
+        headers["X-Internal-Token"] = ARIADNE_STATE_TOKEN  # header is sent only when a token is configured
+    r = request.Request(ARIADNE_STATE_URL, headers=headers, method="GET")
+    try:
+        with request.urlopen(r, timeout=timeout) as resp:
+            raw = resp.read()
+            payload = json.loads(raw.decode()) if raw else {}  # an empty body is treated as an empty object
+            return payload if isinstance(payload, dict) else None  # any non-object JSON payload is rejected
+    except Exception:  # deliberate: any failure here yields None instead of raising to the caller
+        return None
+
 def k8s_pods(namespace: str) -> list[dict]:
     data = k8s_get(f"/api/v1/namespaces/{parse.quote(namespace)}/pods?limit=500")
     items = data.get("items") or []
@@ -445,6 +462,17 @@ def vm_cluster_snapshot() -> str:
     return "\n".join(parts).strip()
 
 def nodes_summary(cluster_name: str) -> str:
+    state = _ariadne_state()
+    if state:
+        nodes = state.get("nodes") if isinstance(state.get("nodes"), dict) else {}
+        total = nodes.get("total")
+        ready = nodes.get("ready")
+        not_ready = nodes.get("not_ready")
+        if isinstance(total, int) and isinstance(ready, int):
+            not_ready = not_ready if isinstance(not_ready, int) else max(total - ready, 0)
+            if not_ready:
+                return f"{cluster_name} cluster has {total} nodes: {ready} Ready, {not_ready} NotReady."
+            return f"{cluster_name} cluster has {total} nodes, all Ready."
     try:
         data = k8s_get("/api/v1/nodes?limit=500")
     except Exception:
@@ -467,6 +495,16 @@ def nodes_summary(cluster_name: str) -> str:
     return f"{cluster_name} cluster has {total} nodes, all Ready."
 
 def nodes_names_summary(cluster_name: str) -> str:
+    state = _ariadne_state()
+    if state:
+        nodes = state.get("nodes") if isinstance(state.get("nodes"), dict) else {}
+        names = nodes.get("names")
+        if isinstance(names, list) and names:
+            cleaned = sorted({str(n) for n in names if n})
+            if len(cleaned) <= 30:
+                return f"{cluster_name} node names: {', '.join(cleaned)}."
+            shown = ", ".join(cleaned[:30])
+            return f"{cluster_name} node names: {shown}, … (+{len(cleaned) - 30} more)."
     try:
         data = k8s_get("/api/v1/nodes?limit=500")
     except Exception:
diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml
index 52d10f96..0356e060 100644
--- a/services/maintenance/ariadne-deployment.yaml
+++ b/services/maintenance/ariadne-deployment.yaml
@@ -311,10 +311,18 @@ spec:
               value: "0 0 1 1 *"
             - name: ARIADNE_SCHEDULE_COMMS_SEED_ROOM
               value: "*/10 * * * *"
+            - name: ARIADNE_SCHEDULE_CLUSTER_STATE
+              value: "*/15 * * * *"
+            - name: ARIADNE_CLUSTER_STATE_KEEP
+              value: "168"
             - name: WELCOME_EMAIL_ENABLED
               value: "true"
             - name: K8S_API_TIMEOUT_SEC
               value: "5"
+            - name: ARIADNE_VM_URL
+              value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428
+            - name: ARIADNE_CLUSTER_STATE_VM_TIMEOUT_SEC
+              value: "5"
             - name: OPENSEARCH_URL
               value: http://opensearch-master.logging.svc.cluster.local:9200
             - name: OPENSEARCH_LIMIT_BYTES
diff --git a/services/maintenance/ariadne-rbac.yaml b/services/maintenance/ariadne-rbac.yaml
index 88689cb6..33620d05 100644
--- a/services/maintenance/ariadne-rbac.yaml
+++ b/services/maintenance/ariadne-rbac.yaml
@@ -21,12 +21,27 @@ rules:
       - list
       - watch
       - delete
+  - apiGroups: [""]
+    resources:
+      - nodes
+      - namespaces
+    verbs:
+      - get
+      - list
+      - watch
   - apiGroups: [""]
     resources:
       - pods/exec
     verbs:
       - get
       - create
+  - apiGroups: ["kustomize.toolkit.fluxcd.io"]
+    resources:
+      - kustomizations
+    verbs:
+      - get
+      - list
+      - watch
 
 ---
 apiVersion: rbac.authorization.k8s.io/v1

From 733f420b9a3c1506b2e528265b65d6f3ff3b2a3f Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Mon, 26 Jan 2026 06:33:26 +0000
Subject: [PATCH 230/416] chore(maintenance): automated image update

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 4e261cbf..3933caf6 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,7 +26,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-48 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-49 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From a3b84a36fd525971197036545b285840c806a3a8 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 09:23:21 -0300
Subject: [PATCH 231/416] comms: inject chat ai keys for atlasbot

---
 services/comms/atlasbot-deployment.yaml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 70844ebf..aec7b790 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -25,6 +25,12 @@ spec:
         vault.hashicorp.com/agent-inject-secret-livekit-primary: "kv/data/atlas/comms/livekit-api"
         vault.hashicorp.com/agent-inject-template-livekit-primary: |
           {{- with secret "kv/data/atlas/comms/livekit-api" -}}{{ .Data.data.primary }}{{- end -}}
+        vault.hashicorp.com/agent-inject-secret-chat-matrix: "kv/data/atlas/shared/chat-ai-keys-runtime"
+        vault.hashicorp.com/agent-inject-template-chat-matrix: |
+          {{- with secret "kv/data/atlas/shared/chat-ai-keys-runtime" -}}{{ .Data.data.matrix }}{{- end -}}
+        vault.hashicorp.com/agent-inject-secret-chat-homepage: "kv/data/atlas/shared/chat-ai-keys-runtime"
+        vault.hashicorp.com/agent-inject-template-chat-homepage: |
+          {{- with secret "kv/data/atlas/shared/chat-ai-keys-runtime" -}}{{ .Data.data.homepage }}{{- end -}}
         vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
         vault.hashicorp.com/agent-inject-template-bot-pass: |
           {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}}

From 80e059f6bb650653a92baf9a7c49e705dae64602 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 09:29:28 -0300
Subject: [PATCH 232/416] comms: fix duplicate chat key annotations

---
 services/comms/atlasbot-deployment.yaml | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index aec7b790..70844ebf 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -25,12 +25,6 @@ spec:
         vault.hashicorp.com/agent-inject-secret-livekit-primary: "kv/data/atlas/comms/livekit-api"
         vault.hashicorp.com/agent-inject-template-livekit-primary: |
           {{- with secret "kv/data/atlas/comms/livekit-api" -}}{{ .Data.data.primary }}{{- end -}}
-        vault.hashicorp.com/agent-inject-secret-chat-matrix: "kv/data/atlas/shared/chat-ai-keys-runtime"
-        vault.hashicorp.com/agent-inject-template-chat-matrix: |
-          {{- with secret "kv/data/atlas/shared/chat-ai-keys-runtime" -}}{{ .Data.data.matrix }}{{- end -}}
-        vault.hashicorp.com/agent-inject-secret-chat-homepage: "kv/data/atlas/shared/chat-ai-keys-runtime"
-        vault.hashicorp.com/agent-inject-template-chat-homepage: |
-          {{- with secret "kv/data/atlas/shared/chat-ai-keys-runtime" -}}{{ .Data.data.homepage }}{{- end -}}
         vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
         vault.hashicorp.com/agent-inject-template-bot-pass: |
           {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}}

From 301fdb49173cc7fff04526bf318360aac679146b Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 09:36:08 -0300
Subject: [PATCH 233/416] comms: handle arch node counts and extend LLM timeout

---
 services/comms/scripts/atlasbot/bot.py | 39 +++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index e0776203..797b601d 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -16,6 +16,7 @@ ROOM_ALIAS = "#othrys:live.bstein.dev"
 OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/")
 MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b-instruct-q4_0")
 API_KEY = os.environ.get("CHAT_API_KEY", "")
+OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "90"))
 
 KB_DIR = os.environ.get("KB_DIR", "")
 VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428")
@@ -525,6 +526,29 @@ def nodes_names_summary(cluster_name: str) -> str:
     shown = ", ".join(names[:30])
     return f"{cluster_name} node names: {shown}, … (+{len(names) - 30} more)."
 
+
+def nodes_arch_summary(cluster_name: str, arch: str) -> str:
+    try:
+        data = k8s_get("/api/v1/nodes?limit=500")
+    except Exception:
+        return ""
+    items = data.get("items") or []
+    if not isinstance(items, list) or not items:
+        return ""
+    normalized = (arch or "").strip().lower()
+    if normalized in ("aarch64", "arm64"):
+        arch_label = "arm64"
+    elif normalized in ("x86_64", "x86-64", "amd64"):
+        arch_label = "amd64"
+    else:
+        arch_label = normalized
+    total = 0
+    for node in items:
+        labels = (node.get("metadata") or {}).get("labels") or {}
+        if labels.get("kubernetes.io/arch") == arch_label:
+            total += 1
+    return f"{cluster_name} cluster has {total} {arch_label} nodes."
+
 def _strip_code_fence(text: str) -> str:
     cleaned = (text or "").strip()
     match = CODE_FENCE_RE.match(cleaned)
@@ -622,7 +646,7 @@ def ollama_reply(hist_key, prompt: str, *, context: str) -> str:
         if API_KEY:
             headers["x-api-key"] = API_KEY
         r = request.Request(OLLAMA_URL, data=json.dumps(payload).encode(), headers=headers)
-        with request.urlopen(r, timeout=20) as resp:
+        with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp:
             data = json.loads(resp.read().decode())
             raw_reply = data.get("message") or data.get("response") or data.get("reply") or data
             reply = _normalize_reply(raw_reply) or "I'm here to help."
@@ -692,6 +716,19 @@ def sync_loop(token: str, room_id: str):
                             continue
                         send_msg(token, rid, summary)
                         continue
+                if "node" in lower_body and any(word in lower_body for word in ("arm64", "aarch64", "amd64", "x86_64", "x86-64")):
+                    if any(word in lower_body for word in ("cluster", "atlas", "titan")):
+                        arch = "arm64" if "arm64" in lower_body or "aarch64" in lower_body else "amd64"
+                        summary = nodes_arch_summary("Atlas", arch)
+                        if not summary:
+                            send_msg(
+                                token,
+                                rid,
+                                "I couldn’t reach the cluster API to count nodes by architecture. Try again in a moment.",
+                            )
+                            continue
+                        send_msg(token, rid, summary)
+                        continue
                 if re.search(r"\bnode names?\b|\bnodes? named\b|\bnaming\b", lower_body):
                     if any(word in lower_body for word in ("cluster", "atlas", "titan")):
                         names_summary = nodes_names_summary("Atlas")

From 9b09b939215992f40b3a8b0a7ec8666431843797 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 09:38:38 -0300
Subject: [PATCH 234/416] comms: bump atlasbot configmap checksum

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 70844ebf..a8a30092 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-9
+        checksum/atlasbot-configmap: manual-atlasbot-10
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From 6c84cf60c60a662e36a6ff986dfe23e2da93f2ee Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 11:44:28 -0300
Subject: [PATCH 235/416] ai-llm: tighten gpu placement and resources

---
 services/ai-llm/deployment.yaml | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/services/ai-llm/deployment.yaml b/services/ai-llm/deployment.yaml
index dfa1bdd1..4f34d866 100644
--- a/services/ai-llm/deployment.yaml
+++ b/services/ai-llm/deployment.yaml
@@ -21,8 +21,8 @@ spec:
         app: ollama
       annotations:
         ai.bstein.dev/model: qwen2.5-coder:7b-instruct-q4_0
-        ai.bstein.dev/gpu: GPU pool (titan-20/21/22/24)
-        ai.bstein.dev/restartedAt: "2026-01-25T19:10:00Z"
+        ai.bstein.dev/gpu: GPU pool (titan-22/24)
+        ai.bstein.dev/restartedAt: "2026-01-26T12:00:00Z"
     spec:
       affinity:
         nodeAffinity:
@@ -32,8 +32,6 @@ spec:
                   - key: kubernetes.io/hostname
                     operator: In
                     values:
-                      - titan-20
-                      - titan-21
                       - titan-22
                       - titan-24
       runtimeClassName: nvidia
@@ -69,8 +67,8 @@ spec:
               mountPath: /root/.ollama
           resources:
             requests:
-              cpu: 250m
-              memory: 1Gi
+              cpu: 500m
+              memory: 2Gi
               nvidia.com/gpu.shared: 1
             limits:
               nvidia.com/gpu.shared: 1
@@ -97,10 +95,10 @@ spec:
               mountPath: /root/.ollama
           resources:
             requests:
-              cpu: "2"
-              memory: 8Gi
+              cpu: "4"
+              memory: 16Gi
               nvidia.com/gpu.shared: 1
             limits:
-              cpu: "4"
-              memory: 12Gi
+              cpu: "8"
+              memory: 24Gi
               nvidia.com/gpu.shared: 1

From b5e8192731053cb6bfcfde861f729f01f75af7af Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 12:06:48 -0300
Subject: [PATCH 236/416] atlasbot: answer jetson nodes from knowledge

---
 knowledge/catalog/atlas.json                  |  2 +-
 knowledge/catalog/atlas.yaml                  |  2 +-
 knowledge/catalog/runbooks.json               |  8 ++++
 scripts/knowledge_render_atlas.py             |  9 ++++-
 services/comms/knowledge/catalog/atlas.json   |  2 +-
 services/comms/knowledge/catalog/atlas.yaml   |  2 +-
 .../comms/knowledge/catalog/runbooks.json     |  8 ++++
 services/comms/scripts/atlasbot/bot.py        | 39 +++++++++++++++++++
 8 files changed, 66 insertions(+), 6 deletions(-)

diff --git a/knowledge/catalog/atlas.json b/knowledge/catalog/atlas.json
index 21ac4073..951c8079 100644
--- a/knowledge/catalog/atlas.json
+++ b/knowledge/catalog/atlas.json
@@ -1057,7 +1057,7 @@
         "node-role.kubernetes.io/worker": "true"
       },
       "images": [
-        "registry.bstein.dev/bstein/ariadne:0.1.0-48"
+        "registry.bstein.dev/bstein/ariadne:0.1.0-49"
       ]
     },
     {
diff --git a/knowledge/catalog/atlas.yaml b/knowledge/catalog/atlas.yaml
index b3b0119f..637b5f97 100644
--- a/knowledge/catalog/atlas.yaml
+++ b/knowledge/catalog/atlas.yaml
@@ -711,7 +711,7 @@ workloads:
     kubernetes.io/arch: arm64
     node-role.kubernetes.io/worker: 'true'
   images:
-  - registry.bstein.dev/bstein/ariadne:0.1.0-48
+  - registry.bstein.dev/bstein/ariadne:0.1.0-49
 - kind: Deployment
   namespace: maintenance
   name: maintenance-vault-sync
diff --git a/knowledge/catalog/runbooks.json b/knowledge/catalog/runbooks.json
index 0718562b..960510d2 100644
--- a/knowledge/catalog/runbooks.json
+++ b/knowledge/catalog/runbooks.json
@@ -85,5 +85,13 @@
       "clusters/atlas/<...>"
     ],
     "body": "# <Short title>\n\n## What this is\n\n## For users (how to)\n\n## For operators (where configured)\n\n## Troubleshooting (symptoms \u2192 checks)"
+  },
+  {
+    "path": "software/metis.md",
+    "title": "metis",
+    "tags": [],
+    "entrypoints": [],
+    "source_paths": [],
+    "body": "# Metis (node recovery)\n\n## Node classes (current map)\n- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)\n- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)\n- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks)\n- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent)\n- amd64 agents: titan-22/24 (Debian 13, k3s agent)\n- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, plus any newcomers.\n\n### Jetson nodes (titan-20/21)\n- Ubuntu 20.04.6 (Focal), kernel 5.10.104-tegra, CRI containerd 2.0.5-k3s2, arch arm64.\n- Storage: NVMe 232G at / (ext4); onboard mmc partitions present but root on NVMe; 1.9T sda present (unused).\n- k3s agent with drop-in 99-nofile.conf.\n\n## Longhorn disk UUIDs (critical nodes)\n- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4)\n- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4)\n- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4)\n- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4)\n\n## Metis repo (~/Development/metis)\n- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`).\n- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints).\n- `AGENTS.md` in repo is untracked and holds raw notes.\n\n## Next implementation steps\n- Add per-class golden image refs and checksums (Harbor or file://) when ready.\n- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC 
drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths.\n- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection.\n- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited.\n\n## Node OS/Kernel/CRI snapshot (Jan 2026)\n- titan-04: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-05: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-06: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-07: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-08: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-09: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0a: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0b: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0c: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-10: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-11: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-12: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-13: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-14: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-15: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-17: Armbian 25.2.1 noble, kernel 
6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-18: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-19: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-20: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-21: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-22: Debian 13 (trixie), kernel 6.12.41+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64\n- titan-24: Debian 13 (trixie), kernel 6.12.57+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64\n\n\n### External hosts\n- titan-db: Ubuntu 24.10, kernel 6.11.0-1015-raspi, root on /dev/sda2 ext4 (465G), boot vfat /dev/sda1; PostgreSQL service enabled.\n- titan-jh: Arch Linux ARM (rolling), kernel 6.18.4-2-rpi, NVMe root ext4 238G (/), boot vfat 512M; ~495 packages installed (pacman -Q).\n- titan-23/oceanus: TODO audit (future).\n\n\n### Control plane Pis (titan-0a/0b/0c)\n- Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, containerd 2.0.5-k3s2.\n- Storage: 477G SSD root (/dev/sda2 ext4), /boot/firmware vfat (/dev/sda1). fstab uses LABEL=writable and LABEL=system-boot.\n- k3s server (control-plane taint expected); etcd snapshots not yet cataloged (TODO).\n\n\n## k3s versions\n- rpi5 workers/control-plane: k3s v1.33.3+k3s1 (crictl v1.31.0-k3s2)\n- rpi4 nodes: k3s v1.31.5+k3s1 (crictl v1.31.0-k3s2)\n- Jetson titan-20/21: k3s v1.33.3+k3s1 (per node info), crictl v1.31.0-k3s2"
   }
 ]
diff --git a/scripts/knowledge_render_atlas.py b/scripts/knowledge_render_atlas.py
index 34938e74..206dcd90 100644
--- a/scripts/knowledge_render_atlas.py
+++ b/scripts/knowledge_render_atlas.py
@@ -529,9 +529,14 @@ def main() -> int:
     diagram_path.write_text(diagram, encoding="utf-8")
 
     # Render runbooks into JSON for lightweight, dependency-free consumption in-cluster.
-    runbooks_dir = out_dir / "runbooks"
+    runbook_dirs = [
+        out_dir / "runbooks",
+        out_dir / "software",
+    ]
     runbooks: list[dict[str, Any]] = []
-    if runbooks_dir.exists():
+    for runbooks_dir in runbook_dirs:
+        if not runbooks_dir.exists():
+            continue
         for md_file in sorted(runbooks_dir.glob("*.md")):
             raw = md_file.read_text(encoding="utf-8")
             fm: dict[str, Any] = {}
diff --git a/services/comms/knowledge/catalog/atlas.json b/services/comms/knowledge/catalog/atlas.json
index 21ac4073..951c8079 100644
--- a/services/comms/knowledge/catalog/atlas.json
+++ b/services/comms/knowledge/catalog/atlas.json
@@ -1057,7 +1057,7 @@
         "node-role.kubernetes.io/worker": "true"
       },
       "images": [
-        "registry.bstein.dev/bstein/ariadne:0.1.0-48"
+        "registry.bstein.dev/bstein/ariadne:0.1.0-49"
       ]
     },
     {
diff --git a/services/comms/knowledge/catalog/atlas.yaml b/services/comms/knowledge/catalog/atlas.yaml
index b3b0119f..637b5f97 100644
--- a/services/comms/knowledge/catalog/atlas.yaml
+++ b/services/comms/knowledge/catalog/atlas.yaml
@@ -711,7 +711,7 @@ workloads:
     kubernetes.io/arch: arm64
     node-role.kubernetes.io/worker: 'true'
   images:
-  - registry.bstein.dev/bstein/ariadne:0.1.0-48
+  - registry.bstein.dev/bstein/ariadne:0.1.0-49
 - kind: Deployment
   namespace: maintenance
   name: maintenance-vault-sync
diff --git a/services/comms/knowledge/catalog/runbooks.json b/services/comms/knowledge/catalog/runbooks.json
index 0718562b..960510d2 100644
--- a/services/comms/knowledge/catalog/runbooks.json
+++ b/services/comms/knowledge/catalog/runbooks.json
@@ -85,5 +85,13 @@
       "clusters/atlas/<...>"
     ],
     "body": "# <Short title>\n\n## What this is\n\n## For users (how to)\n\n## For operators (where configured)\n\n## Troubleshooting (symptoms \u2192 checks)"
+  },
+  {
+    "path": "software/metis.md",
+    "title": "metis",
+    "tags": [],
+    "entrypoints": [],
+    "source_paths": [],
+    "body": "# Metis (node recovery)\n\n## Node classes (current map)\n- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)\n- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)\n- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks)\n- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent)\n- amd64 agents: titan-22/24 (Debian 13, k3s agent)\n- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, plus any newcomers.\n\n### Jetson nodes (titan-20/21)\n- Ubuntu 20.04.6 (Focal), kernel 5.10.104-tegra, CRI containerd 2.0.5-k3s2, arch arm64.\n- Storage: NVMe 232G at / (ext4); onboard mmc partitions present but root on NVMe; 1.9T sda present (unused).\n- k3s agent with drop-in 99-nofile.conf.\n\n## Longhorn disk UUIDs (critical nodes)\n- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4)\n- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4)\n- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4)\n- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4)\n\n## Metis repo (~/Development/metis)\n- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`).\n- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints).\n- `AGENTS.md` in repo is untracked and holds raw notes.\n\n## Next implementation steps\n- Add per-class golden image refs and checksums (Harbor or file://) when ready.\n- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC 
drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths.\n- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection.\n- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited.\n\n## Node OS/Kernel/CRI snapshot (Jan 2026)\n- titan-04: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-05: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-06: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-07: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-08: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-09: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0a: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0b: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-0c: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-10: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-11: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-12: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-13: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-14: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-15: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-17: Armbian 25.2.1 noble, kernel 
6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-18: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-19: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64\n- titan-20: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-21: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64\n- titan-22: Debian 13 (trixie), kernel 6.12.41+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64\n- titan-24: Debian 13 (trixie), kernel 6.12.57+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64\n\n\n### External hosts\n- titan-db: Ubuntu 24.10, kernel 6.11.0-1015-raspi, root on /dev/sda2 ext4 (465G), boot vfat /dev/sda1; PostgreSQL service enabled.\n- titan-jh: Arch Linux ARM (rolling), kernel 6.18.4-2-rpi, NVMe root ext4 238G (/), boot vfat 512M; ~495 packages installed (pacman -Q).\n- titan-23/oceanus: TODO audit (future).\n\n\n### Control plane Pis (titan-0a/0b/0c)\n- Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, containerd 2.0.5-k3s2.\n- Storage: 477G SSD root (/dev/sda2 ext4), /boot/firmware vfat (/dev/sda1). fstab uses LABEL=writable and LABEL=system-boot.\n- k3s server (control-plane taint expected); etcd snapshots not yet cataloged (TODO).\n\n\n## k3s versions\n- rpi5 workers/control-plane: k3s v1.33.3+k3s1 (crictl v1.31.0-k3s2)\n- rpi4 nodes: k3s v1.31.5+k3s1 (crictl v1.31.0-k3s2)\n- Jetson titan-20/21: k3s v1.33.3+k3s1 (per node info), crictl v1.31.0-k3s2"
   }
 ]
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 797b601d..18ec611a 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -75,6 +75,8 @@ METRIC_HINT_WORDS = {
 }
 
 CODE_FENCE_RE = re.compile(r"^```(?:json)?\\s*(.*?)\\s*```$", re.DOTALL)
+TITAN_NODE_RE = re.compile(r"\btitan-[0-9a-z]{2}\b", re.IGNORECASE)  # one node name, e.g. "titan-0a"; \b is a word boundary
+TITAN_RANGE_RE = re.compile(r"\btitan-([0-9a-z]{2})/([0-9a-z]{2})\b", re.IGNORECASE)  # slash pair, e.g. "titan-20/21" -> ("20", "21")
 
 def _tokens(text: str) -> list[str]:
     toks = [t.lower() for t in TOKEN_RE.findall(text or "")]
@@ -233,6 +235,35 @@ def kb_retrieve(query: str, *, limit: int = 3) -> str:
         used += len(chunk)
     return "\n".join(parts).strip()
 
+def _extract_titan_nodes(text: str) -> list[str]:
+    names = {n.lower() for n in TITAN_NODE_RE.findall(text or "") if n}
+    for match in TITAN_RANGE_RE.finditer(text or ""):
+        left, right = match.groups()
+        if left:
+            names.add(f"titan-{left.lower()}")
+        if right:
+            names.add(f"titan-{right.lower()}")
+    return sorted(names)
+
+def jetson_nodes_from_kb() -> list[str]:
+    for doc in KB.get("runbooks", []):
+        if not isinstance(doc, dict):
+            continue
+        body = str(doc.get("body") or "")
+        for line in body.splitlines():
+            if "jetson" not in line.lower():
+                continue
+            names = _extract_titan_nodes(line)
+            if names:
+                return names
+    return []
+
+def jetson_nodes_summary(cluster_name: str) -> str:
+    names = jetson_nodes_from_kb()
+    if names:
+        return f"{cluster_name} has {len(names)} Jetson nodes: {', '.join(names)}."
+    return ""
+
 def catalog_hints(query: str) -> tuple[str, list[tuple[str, str]]]:
     q = (query or "").strip()
     if not q or not KB.get("catalog"):
@@ -729,6 +760,14 @@ def sync_loop(token: str, room_id: str):
                             continue
                         send_msg(token, rid, summary)
                         continue
+                if "jetson" in lower_body:
+                    if any(word in lower_body for word in ("cluster", "atlas", "titan", "node", "nodes")):
+                        summary = jetson_nodes_summary("Atlas")
+                        if summary:
+                            send_msg(token, rid, summary)
+                        else:
+                            send_msg(token, rid, "Jetson inventory is not available in the knowledge base yet.")
+                        continue
                 if re.search(r"\bnode names?\b|\bnodes? named\b|\bnaming\b", lower_body):
                     if any(word in lower_body for word in ("cluster", "atlas", "titan")):
                         names_summary = nodes_names_summary("Atlas")

From a1494a75218ef893dc5b331b025957a249cb4a4f Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 12:08:33 -0300
Subject: [PATCH 237/416] comms: bump atlasbot config checksum

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index a8a30092..c96c79c4 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-10
+        checksum/atlasbot-configmap: manual-atlasbot-11
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From e7c3d25dfcbf97b06053fb42374569ca08a7e14e Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 12:36:51 -0300
Subject: [PATCH 238/416] atlasbot: ground node inventory and soften llm
 failures

---
 services/comms/atlasbot-deployment.yaml |   2 +-
 services/comms/scripts/atlasbot/bot.py  | 216 ++++++++++++++++++++----
 2 files changed, 181 insertions(+), 37 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index c96c79c4..2c08853d 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-11
+        checksum/atlasbot-configmap: manual-atlasbot-12
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 18ec611a..8edc28dc 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -3,6 +3,7 @@ import json
 import os
 import re
 import ssl
+import threading
 import time
 from typing import Any
 from urllib import error, parse, request
@@ -156,6 +157,13 @@ def send_msg(token: str, room: str, text: str):
 KB = {"catalog": {}, "runbooks": []}
 _HOST_INDEX: dict[str, list[dict]] = {}
 _NAME_INDEX: set[str] = set()
+_NODE_CLASS_INDEX: dict[str, list[str]] = {}
+_NODE_CLASS_RPI4: set[str] = set()
+_NODE_CLASS_RPI5: set[str] = set()
+_NODE_CLASS_AMD64: set[str] = set()
+_NODE_CLASS_JETSON: set[str] = set()
+_NODE_CLASS_EXTERNAL: set[str] = set()
+_NODE_CLASS_NON_RPI: set[str] = set()
 
 def _load_json_file(path: str) -> Any | None:
     try:
@@ -166,6 +174,8 @@ def _load_json_file(path: str) -> Any | None:
 
 def load_kb():
     global KB, _HOST_INDEX, _NAME_INDEX
+    global _NODE_CLASS_INDEX, _NODE_CLASS_RPI4, _NODE_CLASS_RPI5, _NODE_CLASS_AMD64, _NODE_CLASS_JETSON
+    global _NODE_CLASS_EXTERNAL, _NODE_CLASS_NON_RPI
     if not KB_DIR:
         return
     catalog = _load_json_file(os.path.join(KB_DIR, "catalog", "atlas.json")) or {}
@@ -188,6 +198,24 @@ def load_kb():
             names.add(str(w["name"]).lower())
     _NAME_INDEX = names
 
+    node_classes = _parse_node_classes(runbooks)
+    _NODE_CLASS_INDEX = node_classes
+    _NODE_CLASS_RPI4 = set(node_classes.get("rpi4", []))
+    _NODE_CLASS_RPI5 = set(node_classes.get("rpi5", []))
+    _NODE_CLASS_AMD64 = set(node_classes.get("amd64", []))
+    _NODE_CLASS_JETSON = set(node_classes.get("jetson", []))
+    _NODE_CLASS_EXTERNAL = set(node_classes.get("external", []))
+    _NODE_CLASS_NON_RPI = set(
+        sorted(
+            (
+                set().union(*node_classes.values())
+                - _NODE_CLASS_RPI4
+                - _NODE_CLASS_RPI5
+                - _NODE_CLASS_EXTERNAL
+            )
+        )
+    )
+
 def kb_retrieve(query: str, *, limit: int = 3) -> str:
     q = (query or "").strip()
     if not q or not KB.get("runbooks"):
@@ -237,6 +265,12 @@ def kb_retrieve(query: str, *, limit: int = 3) -> str:
 
 def _extract_titan_nodes(text: str) -> list[str]:
     names = {n.lower() for n in TITAN_NODE_RE.findall(text or "") if n}
+    for match in re.finditer(r"titan-([0-9a-z]{2}(?:[/,][0-9a-z]{2})+)", text or "", re.IGNORECASE):
+        tail = match.group(1)
+        for part in re.split(r"[/,]", tail):
+            part = part.strip()
+            if part:
+                names.add(f"titan-{part.lower()}")
     for match in TITAN_RANGE_RE.finditer(text or ""):
         left, right = match.groups()
         if left:
@@ -245,6 +279,83 @@ def _extract_titan_nodes(text: str) -> list[str]:
             names.add(f"titan-{right.lower()}")
     return sorted(names)
 
+def _parse_node_classes(runbooks: list[dict[str, Any]]) -> dict[str, list[str]]:
+    classes: dict[str, list[str]] = {}
+    for doc in runbooks:
+        if not isinstance(doc, dict):
+            continue
+        body = str(doc.get("body") or "")
+        for line in body.splitlines():
+            stripped = line.strip()
+            if "titan-" not in stripped.lower():
+                continue
+            label = ""
+            nodes: list[str] = []
+            if stripped.startswith("-") and ":" in stripped:
+                label, rest = stripped.lstrip("-").split(":", 1)
+                nodes = _extract_titan_nodes(rest)
+                label = label.strip().lower()
+            else:
+                nodes = _extract_titan_nodes(stripped)
+            if not nodes:
+                continue
+            if "jetson" in stripped.lower():
+                classes.setdefault("jetson", nodes)
+            if "amd64" in stripped.lower() or "x86" in stripped.lower():
+                classes.setdefault("amd64", nodes)
+            if "rpi4" in stripped.lower():
+                classes.setdefault("rpi4", nodes)
+            if "rpi5" in stripped.lower():
+                classes.setdefault("rpi5", nodes)
+            if "external" in stripped.lower() or "non-cluster" in stripped.lower():
+                classes.setdefault("external", nodes)
+            if label:
+                classes.setdefault(label, nodes)
+    return {k: sorted(set(v)) for k, v in classes.items()}
+
+def node_inventory_answer(cluster_name: str, query: str) -> str:
+    q = (query or "").lower()
+    if "jetson" in q and _NODE_CLASS_JETSON:
+        names = sorted(_NODE_CLASS_JETSON)
+        return f"{cluster_name} has {len(names)} Jetson nodes: {', '.join(names)}."
+    if "non-raspberry" in q or "non raspberry" in q or "not raspberry" in q:
+        names = sorted(_NODE_CLASS_NON_RPI)
+        if names:
+            return f"{cluster_name} non‑Raspberry Pi nodes: {', '.join(names)}."
+    if "raspberry" in q or "rpi" in q:
+        if "rpi4" in q and _NODE_CLASS_RPI4:
+            names = sorted(_NODE_CLASS_RPI4)
+            return f"{cluster_name} rpi4 nodes: {', '.join(names)}."
+        if "rpi5" in q and _NODE_CLASS_RPI5:
+            names = sorted(_NODE_CLASS_RPI5)
+            return f"{cluster_name} rpi5 nodes: {', '.join(names)}."
+        names = sorted(_NODE_CLASS_RPI4 | _NODE_CLASS_RPI5)
+        if names:
+            return f"{cluster_name} Raspberry Pi nodes: {', '.join(names)}."
+    if ("amd64" in q or "x86" in q) and _NODE_CLASS_AMD64:
+        names = sorted(_NODE_CLASS_AMD64)
+        return f"{cluster_name} amd64 nodes: {', '.join(names)}."
+    return ""
+
+def node_inventory_context(query: str) -> str:
+    q = (query or "").lower()
+    if not any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "x86", "cluster")):
+        return ""
+    lines: list[str] = ["Node inventory (KB):"]
+    if _NODE_CLASS_RPI5:
+        lines.append(f"- rpi5: {', '.join(sorted(_NODE_CLASS_RPI5))}")
+    if _NODE_CLASS_RPI4:
+        lines.append(f"- rpi4: {', '.join(sorted(_NODE_CLASS_RPI4))}")
+    if _NODE_CLASS_JETSON:
+        lines.append(f"- jetson: {', '.join(sorted(_NODE_CLASS_JETSON))}")
+    if _NODE_CLASS_AMD64:
+        lines.append(f"- amd64: {', '.join(sorted(_NODE_CLASS_AMD64))}")
+    if _NODE_CLASS_EXTERNAL:
+        lines.append(f"- external: {', '.join(sorted(_NODE_CLASS_EXTERNAL))}")
+    if len(lines) == 1:
+        return ""
+    return "\n".join(lines)
+
 def jetson_nodes_from_kb() -> list[str]:
     for doc in KB.get("runbooks", []):
         if not isinstance(doc, dict):
@@ -627,6 +738,10 @@ def build_context(prompt: str, *, allow_tools: bool, targets: list[tuple[str, st
     if endpoints:
         parts.append(endpoints)
 
+    inventory = node_inventory_context(prompt)
+    if inventory:
+        parts.append(inventory)
+
     if allow_tools:
         # Scope pod summaries to relevant namespaces/workloads when possible.
         prefixes_by_ns: dict[str, set[str]] = collections.defaultdict(set)
@@ -656,35 +771,58 @@ def build_context(prompt: str, *, allow_tools: bool, targets: list[tuple[str, st
 
     return "\n\n".join([p for p in parts if p]).strip()
 
-def ollama_reply(hist_key, prompt: str, *, context: str) -> str:
-    try:
-        system = (
-            "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. "
-            "Be helpful, direct, and concise. "
-            "Prefer answering with exact repo paths and Kubernetes resource names. "
-            "Never include or request secret values. "
-            "Respond in plain sentences; do not return JSON or code fences unless explicitly asked."
-        )
-        transcript_parts = [system]
-        if context:
-            transcript_parts.append("Context (grounded):\n" + context[:MAX_KB_CHARS])
-        transcript_parts.extend(history[hist_key][-24:])
-        transcript_parts.append(f"User: {prompt}")
-        transcript = "\n".join(transcript_parts)
+def _ollama_call(hist_key, prompt: str, *, context: str) -> str:
+    system = (
+        "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. "
+        "Be helpful, direct, and concise. "
+        "Prefer answering with exact repo paths and Kubernetes resource names. "
+        "Never include or request secret values. "
+        "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. "
+        "If the answer is not grounded in the provided context or tool data, say you do not know."
+    )
+    transcript_parts = [system]
+    if context:
+        transcript_parts.append("Context (grounded):\n" + context[:MAX_KB_CHARS])
+    transcript_parts.extend(history[hist_key][-24:])
+    transcript_parts.append(f"User: {prompt}")
+    transcript = "\n".join(transcript_parts)
 
-        payload = {"model": MODEL, "message": transcript}
-        headers = {"Content-Type": "application/json"}
-        if API_KEY:
-            headers["x-api-key"] = API_KEY
-        r = request.Request(OLLAMA_URL, data=json.dumps(payload).encode(), headers=headers)
-        with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp:
-            data = json.loads(resp.read().decode())
-            raw_reply = data.get("message") or data.get("response") or data.get("reply") or data
-            reply = _normalize_reply(raw_reply) or "I'm here to help."
-            history[hist_key].append(f"Atlas: {reply}")
-            return reply
+    payload = {"model": MODEL, "message": transcript}
+    headers = {"Content-Type": "application/json"}
+    if API_KEY:
+        headers["x-api-key"] = API_KEY
+    r = request.Request(OLLAMA_URL, data=json.dumps(payload).encode(), headers=headers)
+    with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp:
+        data = json.loads(resp.read().decode())
+        raw_reply = data.get("message") or data.get("response") or data.get("reply") or data
+        reply = _normalize_reply(raw_reply) or "I'm here to help."
+    history[hist_key].append(f"Atlas: {reply}")
+    return reply
+
+def ollama_reply(hist_key, prompt: str, *, context: str, fallback: str = "") -> str:
+    try:
+        return _ollama_call(hist_key, prompt, context=context)
     except Exception:
-        return "I’m here — but I couldn’t reach the model backend."
+        if fallback:
+            history[hist_key].append(f"Atlas: {fallback}")
+            return fallback
+        return "Model backend is busy. Try again in a moment."
+
+def ollama_reply_with_thinking(token: str, room: str, hist_key, prompt: str, *, context: str, fallback: str) -> str:
+    result: dict[str, str] = {"reply": ""}
+    done = threading.Event()
+
+    def worker():
+        result["reply"] = ollama_reply(hist_key, prompt, context=context, fallback=fallback)
+        done.set()
+
+    thread = threading.Thread(target=worker, daemon=True)
+    thread.start()
+    if not done.wait(2.0):
+        send_msg(token, room, "Thinking…")
+        done.wait()
+    thread.join(timeout=1)
+    return result["reply"] or fallback or "Model backend is busy. Try again in a moment."
 
 def sync_loop(token: str, room_id: str):
     since = None
@@ -747,6 +885,10 @@ def sync_loop(token: str, room_id: str):
                             continue
                         send_msg(token, rid, summary)
                         continue
+                inventory_answer = node_inventory_answer("Atlas", lower_body)
+                if inventory_answer:
+                    send_msg(token, rid, inventory_answer)
+                    continue
                 if "node" in lower_body and any(word in lower_body for word in ("arm64", "aarch64", "amd64", "x86_64", "x86-64")):
                     if any(word in lower_body for word in ("cluster", "atlas", "titan")):
                         arch = "arm64" if "arm64" in lower_body or "aarch64" in lower_body else "amd64"
@@ -760,14 +902,6 @@ def sync_loop(token: str, room_id: str):
                             continue
                         send_msg(token, rid, summary)
                         continue
-                if "jetson" in lower_body:
-                    if any(word in lower_body for word in ("cluster", "atlas", "titan", "node", "nodes")):
-                        summary = jetson_nodes_summary("Atlas")
-                        if summary:
-                            send_msg(token, rid, summary)
-                        else:
-                            send_msg(token, rid, "Jetson inventory is not available in the knowledge base yet.")
-                        continue
                 if re.search(r"\bnode names?\b|\bnodes? named\b|\bnaming\b", lower_body):
                     if any(word in lower_body for word in ("cluster", "atlas", "titan")):
                         names_summary = nodes_names_summary("Atlas")
@@ -803,7 +937,17 @@ def sync_loop(token: str, room_id: str):
                     rendered = vm_render_result(res, limit=15) or "(no results)"
                     extra = "VictoriaMetrics (PromQL result):\n" + rendered
                     context = (context + "\n\n" + extra).strip() if context else extra
-                reply = ollama_reply(hist_key, body, context=context)
+                fallback = ""
+                if "node" in lower_body or "cluster" in lower_body:
+                    fallback = node_inventory_answer("Atlas", lower_body)
+                reply = ollama_reply_with_thinking(
+                    token,
+                    rid,
+                    hist_key,
+                    body,
+                    context=context,
+                    fallback=fallback,
+                )
                 send_msg(token, rid, reply)
 
 def login_with_retry():

From 4f9479c7d568ed50582f17b38d4fe7eae1e8bcf9 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 14:08:11 -0300
Subject: [PATCH 239/416] atlasbot: add metrics kb and long timeout

---
 knowledge/catalog/metrics.json                | 1880 +++++++++++++++++
 scripts/knowledge_render_atlas.py             |   65 +
 .../bstein-dev-home/backend-deployment.yaml   |    4 +-
 .../chat-ai-gateway-deployment.yaml           |    2 +
 services/bstein-dev-home/scripts/gateway.py   |    3 +-
 services/comms/atlasbot-deployment.yaml       |    8 +-
 services/comms/knowledge/catalog/metrics.json | 1880 +++++++++++++++++
 services/comms/kustomization.yaml             |    1 +
 services/comms/scripts/atlasbot/bot.py        |   97 +-
 9 files changed, 3934 insertions(+), 6 deletions(-)
 create mode 100644 knowledge/catalog/metrics.json
 create mode 100644 services/comms/knowledge/catalog/metrics.json

diff --git a/knowledge/catalog/metrics.json b/knowledge/catalog/metrics.json
new file mode 100644
index 00000000..e929db58
--- /dev/null
+++ b/knowledge/catalog/metrics.json
@@ -0,0 +1,1880 @@
+[
+  {
+    "dashboard": "Atlas GPU",
+    "panel_title": "Namespace GPU Share",
+    "panel_id": 1,
+    "panel_type": "piechart",
+    "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.",
+    "tags": [
+      "atlas",
+      "gpu"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))"
+    ]
+  },
+  {
+    "dashboard": "Atlas GPU",
+    "panel_title": "GPU Util by Namespace",
+    "panel_id": 2,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "gpu"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)"
+    ]
+  },
+  {
+    "dashboard": "Atlas GPU",
+    "panel_title": "GPU Util by Node",
+    "panel_id": 3,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "gpu"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas GPU",
+    "panel_title": "Top Pods by GPU Util",
+    "panel_id": 4,
+    "panel_type": "table",
+    "description": "",
+    "tags": [
+      "atlas",
+      "gpu"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Ariadne Task Errors (range)",
+    "panel_id": 1,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Ariadne Attempts / Failures",
+    "panel_id": 2,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(increase(ariadne_task_runs_total[$__interval]))",
+      "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "One-off Job Pods (age hours)",
+    "panel_id": 3,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Glue Jobs Stale (>36h)",
+    "panel_id": 4,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Glue Jobs Missing Success",
+    "panel_id": 5,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Glue Jobs Suspended",
+    "panel_id": 6,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Ariadne Task Errors (1h)",
+    "panel_id": 7,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Ariadne Task Errors (24h)",
+    "panel_id": 8,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Ariadne Task Runs (1h)",
+    "panel_id": 9,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(increase(ariadne_task_runs_total[1h]))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Ariadne Schedule Last Error (hours ago)",
+    "panel_id": 10,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Ariadne Schedule Last Success (hours ago)",
+    "panel_id": 11,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Glue Jobs Last Success (hours ago)",
+    "panel_id": 12,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Glue Jobs Last Schedule (hours ago)",
+    "panel_id": 13,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Ariadne Task Errors (1h)",
+    "panel_id": 14,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Ariadne Task Errors (30d)",
+    "panel_id": 15,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Ariadne Access Requests",
+    "panel_id": 16,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc(ariadne_access_requests_total)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Ariadne CI Coverage (%)",
+    "panel_id": 17,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "ariadne_ci_coverage_percent{repo=\"ariadne\"}"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Ariadne CI Tests (latest)",
+    "panel_id": 18,
+    "panel_type": "table",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "ariadne_ci_tests_total{repo=\"ariadne\"}"
+    ]
+  },
+  {
+    "dashboard": "Atlas Mail",
+    "panel_title": "Sent (1d)",
+    "panel_id": 1,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "mail"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "max(postmark_outbound_sent{window=\"1d\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Mail",
+    "panel_title": "Sent (7d)",
+    "panel_id": 2,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "mail"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "max(postmark_outbound_sent{window=\"7d\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Mail",
+    "panel_title": "Mail Bounces (1d)",
+    "panel_id": 3,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "mail"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "max(postmark_outbound_bounce_rate{window=\"1d\"})",
+      "max(postmark_outbound_bounced{window=\"1d\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Mail",
+    "panel_title": "Success Rate (1d)",
+    "panel_id": 4,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "mail"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Mail",
+    "panel_title": "Limit Used (30d)",
+    "panel_id": 5,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "mail"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "max(postmark_sending_limit_used_percent)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Mail",
+    "panel_title": "Send Limit (30d)",
+    "panel_id": 6,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "mail"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "max(postmark_sending_limit)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Mail",
+    "panel_title": "Last Success",
+    "panel_id": 7,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "mail"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "max(postmark_last_success_timestamp_seconds)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Mail",
+    "panel_title": "Exporter Errors",
+    "panel_id": 8,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "mail"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(postmark_request_errors_total)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Mail",
+    "panel_title": "Bounce Rate (1d vs 7d)",
+    "panel_id": 13,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "mail"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "max by (window) (postmark_outbound_bounce_rate)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Mail",
+    "panel_title": "Bounced (1d vs 7d)",
+    "panel_id": 14,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "mail"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "max by (window) (postmark_outbound_bounced)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Mail",
+    "panel_title": "Sent (1d vs 7d)",
+    "panel_id": 15,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "mail"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "max by (window) (postmark_outbound_sent)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Mail",
+    "panel_title": "Exporter Errors",
+    "panel_id": 16,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "mail"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(postmark_request_errors_total)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Network",
+    "panel_title": "Ingress Success Rate (5m)",
+    "panel_id": 1,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "network"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Network",
+    "panel_title": "Error Budget Burn (1h)",
+    "panel_id": 2,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "network"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009"
+    ]
+  },
+  {
+    "dashboard": "Atlas Network",
+    "panel_title": "Error Budget Burn (6h)",
+    "panel_id": 3,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "network"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009"
+    ]
+  },
+  {
+    "dashboard": "Atlas Network",
+    "panel_title": "Edge P99 Latency (ms)",
+    "panel_id": 4,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "network"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000"
+    ]
+  },
+  {
+    "dashboard": "Atlas Network",
+    "panel_title": "Ingress Traffic",
+    "panel_id": 5,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "network"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Network",
+    "panel_title": "Egress Traffic",
+    "panel_id": 6,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "network"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Network",
+    "panel_title": "Intra-Cluster Traffic",
+    "panel_id": 7,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "network"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Network",
+    "panel_title": "Per-Node Throughput",
+    "panel_id": 8,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "network"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "avg by (node) ((sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0) + sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Network",
+    "panel_title": "Top Namespaces",
+    "panel_id": 9,
+    "panel_type": "table",
+    "description": "",
+    "tags": [
+      "atlas",
+      "network"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]) + rate(container_network_receive_bytes_total{namespace!=\"\"}[5m])) by (namespace))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Network",
+    "panel_title": "Top Pods",
+    "panel_id": 10,
+    "panel_type": "table",
+    "description": "",
+    "tags": [
+      "atlas",
+      "network"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "topk(10, sum(rate(container_network_transmit_bytes_total{pod!=\"\"}[5m]) + rate(container_network_receive_bytes_total{pod!=\"\"}[5m])) by (namespace,pod))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Network",
+    "panel_title": "Traefik Routers (req/s)",
+    "panel_id": 11,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "network"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "topk(10, sum by (router) (rate(traefik_router_requests_total[5m])))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Network",
+    "panel_title": "Traefik Entrypoints (req/s)",
+    "panel_id": 12,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "network"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Nodes",
+    "panel_title": "Worker Nodes Ready",
+    "panel_id": 1,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "nodes"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Nodes",
+    "panel_title": "Control Plane Ready",
+    "panel_id": 2,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "nodes"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Nodes",
+    "panel_title": "Control Plane Workloads",
+    "panel_id": 3,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "nodes"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Nodes",
+    "panel_title": "API Server 5xx rate",
+    "panel_id": 9,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "nodes"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Nodes",
+    "panel_title": "API Server P99 latency",
+    "panel_id": 10,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "nodes"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000"
+    ]
+  },
+  {
+    "dashboard": "Atlas Nodes",
+    "panel_title": "etcd P99 latency",
+    "panel_id": 11,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "nodes"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000"
+    ]
+  },
+  {
+    "dashboard": "Atlas Nodes",
+    "panel_title": "Node CPU",
+    "panel_id": 4,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "nodes"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Nodes",
+    "panel_title": "Node RAM",
+    "panel_id": 5,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "nodes"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Nodes",
+    "panel_title": "Control Plane (incl. titan-db) CPU",
+    "panel_id": 6,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "nodes"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Nodes",
+    "panel_title": "Control Plane (incl. titan-db) RAM",
+    "panel_id": 7,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "nodes"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Nodes",
+    "panel_title": "Root Filesystem Usage",
+    "panel_id": 8,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "nodes"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Control Plane Ready",
+    "panel_id": 2,
+    "panel_type": "gauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Control Plane Workloads",
+    "panel_id": 3,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"}) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Stuck Terminating",
+    "panel_id": 5,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Atlas Availability",
+    "panel_id": 27,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Problem Pods",
+    "panel_id": 4,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "CrashLoop / ImagePull",
+    "panel_id": 6,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Workers Ready",
+    "panel_id": 1,
+    "panel_type": "gauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Hottest node: CPU",
+    "panel_id": 7,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Hottest node: RAM",
+    "panel_id": 8,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Hottest node: NET (rx+tx)",
+    "panel_id": 9,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Hottest node: I/O (r+w)",
+    "panel_id": 10,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Mail Sent (1d)",
+    "panel_id": 30,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "max(postmark_outbound_sent{window=\"1d\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Mail Bounces (1d)",
+    "panel_id": 31,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "max(postmark_outbound_bounce_rate{window=\"1d\"})",
+      "max(postmark_outbound_bounced{window=\"1d\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Mail Success Rate (1d)",
+    "panel_id": 32,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Mail Limit Used (30d)",
+    "panel_id": 33,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "max(postmark_sending_limit_used_percent)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Postgres Connections Used",
+    "panel_id": 34,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Postgres Hottest Connections",
+    "panel_id": 35,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "topk(1, sum by (datname) (pg_stat_activity_count))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Astreae Usage",
+    "panel_id": 23,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Asteria Usage",
+    "panel_id": 24,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Astreae Free",
+    "panel_id": 25,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Asteria Free",
+    "panel_id": 26,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "One-off Job Pods (age hours)",
+    "panel_id": 40,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Ariadne Attempts / Failures",
+    "panel_id": 41,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(increase(ariadne_task_runs_total[$__interval]))",
+      "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Ariadne Test Success Rate",
+    "panel_id": 42,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Tests with Failures (24h)",
+    "panel_id": 43,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Namespace CPU Share",
+    "panel_id": 11,
+    "panel_type": "piechart",
+    "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ), 1)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Namespace GPU Share",
+    "panel_id": 12,
+    "panel_type": "piechart",
+    "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Namespace RAM Share",
+    "panel_id": 13,
+    "panel_type": "piechart",
+    "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ), 1)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Worker Node CPU",
+    "panel_id": 14,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Worker Node RAM",
+    "panel_id": 15,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Control plane CPU",
+    "panel_id": 16,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Control plane RAM",
+    "panel_id": 17,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Node Pod Share",
+    "panel_id": 28,
+    "panel_type": "piechart",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Top Nodes by Pod Count",
+    "panel_id": 29,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Cluster Ingress Throughput",
+    "panel_id": 18,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Cluster Egress Throughput",
+    "panel_id": 19,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Intra-Cluster Throughput",
+    "panel_id": 20,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Root Filesystem Usage",
+    "panel_id": 21,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Nodes Closest to Full Root Disks",
+    "panel_id": 22,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Pods",
+    "panel_title": "Problem Pods",
+    "panel_id": 1,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "pods"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Pods",
+    "panel_title": "CrashLoop / ImagePull",
+    "panel_id": 2,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "pods"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Pods",
+    "panel_title": "Stuck Terminating (>10m)",
+    "panel_id": 3,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "pods"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Pods",
+    "panel_title": "Control Plane Workloads",
+    "panel_id": 4,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "pods"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Pods",
+    "panel_title": "Pods Not Running",
+    "panel_id": 5,
+    "panel_type": "table",
+    "description": "",
+    "tags": [
+      "atlas",
+      "pods"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Pods",
+    "panel_title": "CrashLoop / ImagePull",
+    "panel_id": 6,
+    "panel_type": "table",
+    "description": "",
+    "tags": [
+      "atlas",
+      "pods"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) max by (namespace,pod,container,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Pods",
+    "panel_title": "Terminating >10m",
+    "panel_id": 7,
+    "panel_type": "table",
+    "description": "",
+    "tags": [
+      "atlas",
+      "pods"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(((time() - kube_pod_deletion_timestamp{pod!=\"\"}) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) * on(namespace,pod) group_left(node) kube_pod_info)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Pods",
+    "panel_title": "Node Pod Share",
+    "panel_id": 8,
+    "panel_type": "piechart",
+    "description": "",
+    "tags": [
+      "atlas",
+      "pods"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100"
+    ]
+  },
+  {
+    "dashboard": "Atlas Pods",
+    "panel_title": "Top Nodes by Pod Count",
+    "panel_id": 9,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "pods"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Pods",
+    "panel_title": "Namespace Plurality by Node v27",
+    "panel_id": 10,
+    "panel_type": "table",
+    "description": "",
+    "tags": [
+      "atlas",
+      "pods"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) 
(kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 
0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Storage",
+    "panel_title": "Astreae Usage",
+    "panel_id": 1,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "storage"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Storage",
+    "panel_title": "Asteria Usage",
+    "panel_id": 2,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "storage"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Storage",
+    "panel_title": "Astreae Free",
+    "panel_id": 3,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "storage"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Storage",
+    "panel_title": "Asteria Free",
+    "panel_id": 4,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "storage"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Storage",
+    "panel_title": "Astreae Per-Node Usage",
+    "panel_id": 5,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "storage"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Storage",
+    "panel_title": "Asteria Per-Node Usage",
+    "panel_id": 6,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "storage"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Storage",
+    "panel_title": "Astreae Usage History",
+    "panel_id": 7,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "storage"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Storage",
+    "panel_title": "Asteria Usage History",
+    "panel_id": 8,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "storage"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Storage",
+    "panel_title": "Maintenance Sweepers Ready",
+    "panel_id": 30,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "storage"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100"
+    ]
+  },
+  {
+    "dashboard": "Atlas Storage",
+    "panel_title": "Maintenance Cron Freshness (s)",
+    "panel_id": 31,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "storage"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=\"image-sweeper\"})"
+    ]
+  }
+]
diff --git a/scripts/knowledge_render_atlas.py b/scripts/knowledge_render_atlas.py
index 206dcd90..1e305cbb 100644
--- a/scripts/knowledge_render_atlas.py
+++ b/scripts/knowledge_render_atlas.py
@@ -26,6 +26,7 @@ from typing import Any, Iterable
 import yaml
 
 REPO_ROOT = Path(__file__).resolve().parents[1]
+DASHBOARD_DIR = REPO_ROOT / "services" / "monitoring" / "dashboards"
 
 CLUSTER_SCOPED_KINDS = {
     "Namespace",
@@ -67,6 +68,64 @@ def _sync_tree(source: Path, dest: Path) -> None:
     shutil.copytree(source, dest)
 
 
+def _iter_dashboard_panels(dashboard: dict[str, Any]) -> Iterable[dict[str, Any]]:  # yield panel dicts, expanding rows that nest panels
+    panels = dashboard.get("panels") if isinstance(dashboard.get("panels"), list) else []  # a missing or non-list "panels" yields nothing
+    for panel in panels:
+        if not isinstance(panel, dict):
+            continue  # skip entries that are not dicts
+        if panel.get("type") == "row" and isinstance(panel.get("panels"), list):
+            yield from _iter_dashboard_panels({"panels": panel.get("panels")})  # recurse into the row's nested panels
+            continue  # the row container itself is not yielded
+        yield panel
+
+
+def _extract_metrics_index(dashboard_dir: Path) -> list[dict[str, Any]]:  # build one index entry per panel that has query exprs
+    index: list[dict[str, Any]] = []
+    for path in sorted(dashboard_dir.glob("*.json")):  # visit dashboard files in sorted path order
+        try:
+            data = json.loads(path.read_text(encoding="utf-8"))
+        except json.JSONDecodeError:
+            continue  # NOTE(review): invalid JSON is skipped without any message; confirm this is intended
+        if not isinstance(data, dict):
+            continue  # only a top-level JSON object is treated as a dashboard
+        dash_title = data.get("title") or path.stem  # fall back to the file name without its suffix
+        dash_tags = data.get("tags") or []
+        for panel in _iter_dashboard_panels(data):
+            targets = panel.get("targets")
+            if not isinstance(targets, list):
+                continue  # panel has no list of targets
+            exprs: list[str] = []
+            for target in targets:
+                if not isinstance(target, dict):
+                    continue
+                expr = target.get("expr")
+                if isinstance(expr, str) and expr.strip():
+                    exprs.append(expr.strip())  # keep only non-blank string exprs, whitespace-trimmed
+            if not exprs:
+                continue  # panels without any usable expr are left out of the index
+            datasource = panel.get("datasource") or {}
+            if isinstance(datasource, dict):
+                ds_uid = datasource.get("uid")
+                ds_type = datasource.get("type")
+            else:
+                ds_uid = None  # a non-dict datasource value yields no uid/type
+                ds_type = None
+            index.append(
+                {
+                    "dashboard": dash_title,
+                    "panel_title": panel.get("title") or "",
+                    "panel_id": panel.get("id"),
+                    "panel_type": panel.get("type"),
+                    "description": panel.get("description") or "",
+                    "tags": dash_tags,  # dashboard-level tags, repeated on every panel entry
+                    "datasource_uid": ds_uid,
+                    "datasource_type": ds_type,
+                    "exprs": exprs,
+                }
+            )
+    return index
+
+
 def kustomize_build(path: Path) -> str:
     rel = path.relative_to(REPO_ROOT)
     try:
@@ -516,6 +575,7 @@ def main() -> int:
     summary_path = out_dir / "catalog" / "atlas-summary.json"
     diagram_path = out_dir / "diagrams" / "atlas-http.mmd"
     runbooks_json_path = out_dir / "catalog" / "runbooks.json"
+    metrics_json_path = out_dir / "catalog" / "metrics.json"
 
     catalog_rel = catalog_path.relative_to(REPO_ROOT).as_posix()
     catalog_path.write_text(
@@ -560,12 +620,17 @@ def main() -> int:
                 }
             )
     runbooks_json_path.write_text(json.dumps(runbooks, indent=2, sort_keys=False) + "\n", encoding="utf-8")
+    metrics_index = _extract_metrics_index(DASHBOARD_DIR)
+    metrics_json_path.write_text(
+        json.dumps(metrics_index, indent=2, sort_keys=False) + "\n", encoding="utf-8"
+    )
 
     print(f"Wrote {catalog_path.relative_to(REPO_ROOT)}")
     print(f"Wrote {catalog_json_path.relative_to(REPO_ROOT)}")
     print(f"Wrote {summary_path.relative_to(REPO_ROOT)}")
     print(f"Wrote {diagram_path.relative_to(REPO_ROOT)}")
     print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}")
+    print(f"Wrote {metrics_json_path.relative_to(REPO_ROOT)}")
 
     if args.sync_comms:
         comms_dir = REPO_ROOT / "services" / "comms" / "knowledge"
diff --git a/services/bstein-dev-home/backend-deployment.yaml b/services/bstein-dev-home/backend-deployment.yaml
index 2170396e..ecf478cc 100644
--- a/services/bstein-dev-home/backend-deployment.yaml
+++ b/services/bstein-dev-home/backend-deployment.yaml
@@ -58,14 +58,14 @@ spec:
           args:
             - >-
               . /vault/secrets/portal-env.sh
-              && exec gunicorn -b 0.0.0.0:8080 --workers 2 --timeout 180 app:app
+              && exec gunicorn -b 0.0.0.0:8080 --workers 2 --timeout 600 app:app
           env:
             - name: AI_CHAT_API
               value: http://ollama.ai.svc.cluster.local:11434
             - name: AI_CHAT_MODEL
               value: qwen2.5-coder:7b-instruct-q4_0
             - name: AI_CHAT_TIMEOUT_SEC
-              value: "60"
+              value: "480"
             - name: AI_NODE_NAME
               valueFrom:
                 fieldRef:
diff --git a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml
index 40d74fe1..7209da62 100644
--- a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml
+++ b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml
@@ -47,6 +47,8 @@ spec:
           env:
             - name: UPSTREAM_URL
               value: http://bstein-dev-home-backend/api/chat
+            - name: UPSTREAM_TIMEOUT_SEC
+              value: "600"
           ports:
             - name: http
               containerPort: 8080
diff --git a/services/bstein-dev-home/scripts/gateway.py b/services/bstein-dev-home/scripts/gateway.py
index 3ca2fa16..19d36062 100644
--- a/services/bstein-dev-home/scripts/gateway.py
+++ b/services/bstein-dev-home/scripts/gateway.py
@@ -6,6 +6,7 @@ from urllib import request, error
 UPSTREAM = os.environ.get("UPSTREAM_URL", "http://bstein-dev-home-backend/api/chat")
 KEY_MATRIX = os.environ.get("CHAT_KEY_MATRIX", "")
 KEY_HOMEPAGE = os.environ.get("CHAT_KEY_HOMEPAGE", "")
+UPSTREAM_TIMEOUT_SEC = float(os.environ.get("UPSTREAM_TIMEOUT_SEC", "90"))  # urlopen timeout; non-numeric env value raises ValueError at import
 
 ALLOWED = {k for k in (KEY_MATRIX, KEY_HOMEPAGE) if k}
 
@@ -41,7 +42,7 @@ class Handler(BaseHTTPRequestHandler):
                 headers={"Content-Type": "application/json"},
                 method="POST",
             )
-            with request.urlopen(upstream_req, timeout=90) as resp:
+            with request.urlopen(upstream_req, timeout=UPSTREAM_TIMEOUT_SEC) as resp:
                 data = resp.read()
                 self.send_response(resp.status)
                 for k, v in resp.headers.items():
diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 2c08853d..031abb8d 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-12
+        checksum/atlasbot-configmap: manual-atlasbot-13
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
@@ -83,6 +83,10 @@ spec:
               value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/
             - name: OLLAMA_MODEL
               value: qwen2.5-coder:7b-instruct-q4_0
+            - name: OLLAMA_TIMEOUT_SEC
+              value: "480"
+            - name: ATLASBOT_THINKING_INTERVAL_SEC
+              value: "120"
           resources:
             requests:
               cpu: 100m
@@ -114,6 +118,8 @@ spec:
                 path: catalog/atlas.json
               - key: atlas-summary.json
                 path: catalog/atlas-summary.json
+              - key: metrics.json
+                path: catalog/metrics.json
               - key: runbooks.json
                 path: catalog/runbooks.json
               - key: atlas-http.mmd
diff --git a/services/comms/knowledge/catalog/metrics.json b/services/comms/knowledge/catalog/metrics.json
new file mode 100644
index 00000000..e929db58
--- /dev/null
+++ b/services/comms/knowledge/catalog/metrics.json
@@ -0,0 +1,1880 @@
+[
+  {
+    "dashboard": "Atlas GPU",
+    "panel_title": "Namespace GPU Share",
+    "panel_id": 1,
+    "panel_type": "piechart",
+    "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.",
+    "tags": [
+      "atlas",
+      "gpu"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))"
+    ]
+  },
+  {
+    "dashboard": "Atlas GPU",
+    "panel_title": "GPU Util by Namespace",
+    "panel_id": 2,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "gpu"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)"
+    ]
+  },
+  {
+    "dashboard": "Atlas GPU",
+    "panel_title": "GPU Util by Node",
+    "panel_id": 3,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "gpu"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas GPU",
+    "panel_title": "Top Pods by GPU Util",
+    "panel_id": 4,
+    "panel_type": "table",
+    "description": "",
+    "tags": [
+      "atlas",
+      "gpu"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Ariadne Task Errors (range)",
+    "panel_id": 1,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Ariadne Attempts / Failures",
+    "panel_id": 2,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(increase(ariadne_task_runs_total[$__interval]))",
+      "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "One-off Job Pods (age hours)",
+    "panel_id": 3,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Glue Jobs Stale (>36h)",
+    "panel_id": 4,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Glue Jobs Missing Success",
+    "panel_id": 5,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Glue Jobs Suspended",
+    "panel_id": 6,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Ariadne Task Errors (1h)",
+    "panel_id": 7,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Ariadne Task Errors (24h)",
+    "panel_id": 8,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Ariadne Task Runs (1h)",
+    "panel_id": 9,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(increase(ariadne_task_runs_total[1h]))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Ariadne Schedule Last Error (hours ago)",
+    "panel_id": 10,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Ariadne Schedule Last Success (hours ago)",
+    "panel_id": 11,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Glue Jobs Last Success (hours ago)",
+    "panel_id": 12,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Glue Jobs Last Schedule (hours ago)",
+    "panel_id": 13,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Ariadne Task Errors (1h)",
+    "panel_id": 14,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Ariadne Task Errors (30d)",
+    "panel_id": 15,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Ariadne Access Requests",
+    "panel_id": 16,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc(ariadne_access_requests_total)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Ariadne CI Coverage (%)",
+    "panel_id": 17,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "ariadne_ci_coverage_percent{repo=\"ariadne\"}"
+    ]
+  },
+  {
+    "dashboard": "Atlas Jobs",
+    "panel_title": "Ariadne CI Tests (latest)",
+    "panel_id": 18,
+    "panel_type": "table",
+    "description": "",
+    "tags": [
+      "atlas",
+      "jobs",
+      "glue"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "ariadne_ci_tests_total{repo=\"ariadne\"}"
+    ]
+  },
+  {
+    "dashboard": "Atlas Mail",
+    "panel_title": "Sent (1d)",
+    "panel_id": 1,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "mail"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "max(postmark_outbound_sent{window=\"1d\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Mail",
+    "panel_title": "Sent (7d)",
+    "panel_id": 2,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "mail"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "max(postmark_outbound_sent{window=\"7d\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Mail",
+    "panel_title": "Mail Bounces (1d)",
+    "panel_id": 3,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "mail"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "max(postmark_outbound_bounce_rate{window=\"1d\"})",
+      "max(postmark_outbound_bounced{window=\"1d\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Mail",
+    "panel_title": "Success Rate (1d)",
+    "panel_id": 4,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "mail"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Mail",
+    "panel_title": "Limit Used (30d)",
+    "panel_id": 5,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "mail"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "max(postmark_sending_limit_used_percent)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Mail",
+    "panel_title": "Send Limit (30d)",
+    "panel_id": 6,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "mail"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "max(postmark_sending_limit)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Mail",
+    "panel_title": "Last Success",
+    "panel_id": 7,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "mail"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "max(postmark_last_success_timestamp_seconds)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Mail",
+    "panel_title": "Exporter Errors",
+    "panel_id": 8,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "mail"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(postmark_request_errors_total)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Mail",
+    "panel_title": "Bounce Rate (1d vs 7d)",
+    "panel_id": 13,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "mail"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "max by (window) (postmark_outbound_bounce_rate)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Mail",
+    "panel_title": "Bounced (1d vs 7d)",
+    "panel_id": 14,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "mail"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "max by (window) (postmark_outbound_bounced)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Mail",
+    "panel_title": "Sent (1d vs 7d)",
+    "panel_id": 15,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "mail"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "max by (window) (postmark_outbound_sent)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Mail",
+    "panel_title": "Exporter Errors",
+    "panel_id": 16,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "mail"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(postmark_request_errors_total)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Network",
+    "panel_title": "Ingress Success Rate (5m)",
+    "panel_id": 1,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "network"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Network",
+    "panel_title": "Error Budget Burn (1h)",
+    "panel_id": 2,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "network"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009"
+    ]
+  },
+  {
+    "dashboard": "Atlas Network",
+    "panel_title": "Error Budget Burn (6h)",
+    "panel_id": 3,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "network"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009"
+    ]
+  },
+  {
+    "dashboard": "Atlas Network",
+    "panel_title": "Edge P99 Latency (ms)",
+    "panel_id": 4,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "network"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000"
+    ]
+  },
+  {
+    "dashboard": "Atlas Network",
+    "panel_title": "Ingress Traffic",
+    "panel_id": 5,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "network"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Network",
+    "panel_title": "Egress Traffic",
+    "panel_id": 6,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "network"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Network",
+    "panel_title": "Intra-Cluster Traffic",
+    "panel_id": 7,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "network"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Network",
+    "panel_title": "Per-Node Throughput",
+    "panel_id": 8,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "network"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "avg by (node) ((sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0) + sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Network",
+    "panel_title": "Top Namespaces",
+    "panel_id": 9,
+    "panel_type": "table",
+    "description": "",
+    "tags": [
+      "atlas",
+      "network"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]) + rate(container_network_receive_bytes_total{namespace!=\"\"}[5m])) by (namespace))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Network",
+    "panel_title": "Top Pods",
+    "panel_id": 10,
+    "panel_type": "table",
+    "description": "",
+    "tags": [
+      "atlas",
+      "network"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "topk(10, sum(rate(container_network_transmit_bytes_total{pod!=\"\"}[5m]) + rate(container_network_receive_bytes_total{pod!=\"\"}[5m])) by (namespace,pod))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Network",
+    "panel_title": "Traefik Routers (req/s)",
+    "panel_id": 11,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "network"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "topk(10, sum by (router) (rate(traefik_router_requests_total[5m])))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Network",
+    "panel_title": "Traefik Entrypoints (req/s)",
+    "panel_id": 12,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "network"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Nodes",
+    "panel_title": "Worker Nodes Ready",
+    "panel_id": 1,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "nodes"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Nodes",
+    "panel_title": "Control Plane Ready",
+    "panel_id": 2,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "nodes"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Nodes",
+    "panel_title": "Control Plane Workloads",
+    "panel_id": 3,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "nodes"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Nodes",
+    "panel_title": "API Server 5xx rate",
+    "panel_id": 9,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "nodes"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Nodes",
+    "panel_title": "API Server P99 latency",
+    "panel_id": 10,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "nodes"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000"
+    ]
+  },
+  {
+    "dashboard": "Atlas Nodes",
+    "panel_title": "etcd P99 latency",
+    "panel_id": 11,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "nodes"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000"
+    ]
+  },
+  {
+    "dashboard": "Atlas Nodes",
+    "panel_title": "Node CPU",
+    "panel_id": 4,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "nodes"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Nodes",
+    "panel_title": "Node RAM",
+    "panel_id": 5,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "nodes"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Nodes",
+    "panel_title": "Control Plane (incl. titan-db) CPU",
+    "panel_id": 6,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "nodes"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Nodes",
+    "panel_title": "Control Plane (incl. titan-db) RAM",
+    "panel_id": 7,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "nodes"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Nodes",
+    "panel_title": "Root Filesystem Usage",
+    "panel_id": 8,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "nodes"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Control Plane Ready",
+    "panel_id": 2,
+    "panel_type": "gauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Control Plane Workloads",
+    "panel_id": 3,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"}) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Stuck Terminating",
+    "panel_id": 5,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Atlas Availability",
+    "panel_id": 27,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Problem Pods",
+    "panel_id": 4,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "CrashLoop / ImagePull",
+    "panel_id": 6,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Workers Ready",
+    "panel_id": 1,
+    "panel_type": "gauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Hottest node: CPU",
+    "panel_id": 7,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Hottest node: RAM",
+    "panel_id": 8,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Hottest node: NET (rx+tx)",
+    "panel_id": 9,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Hottest node: I/O (r+w)",
+    "panel_id": 10,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Mail Sent (1d)",
+    "panel_id": 30,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "max(postmark_outbound_sent{window=\"1d\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Mail Bounces (1d)",
+    "panel_id": 31,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "max(postmark_outbound_bounce_rate{window=\"1d\"})",
+      "max(postmark_outbound_bounced{window=\"1d\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Mail Success Rate (1d)",
+    "panel_id": 32,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Mail Limit Used (30d)",
+    "panel_id": 33,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "max(postmark_sending_limit_used_percent)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Postgres Connections Used",
+    "panel_id": 34,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Postgres Hottest Connections",
+    "panel_id": 35,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "topk(1, sum by (datname) (pg_stat_activity_count))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Astreae Usage",
+    "panel_id": 23,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Asteria Usage",
+    "panel_id": 24,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Astreae Free",
+    "panel_id": 25,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Asteria Free",
+    "panel_id": 26,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "One-off Job Pods (age hours)",
+    "panel_id": 40,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Ariadne Attempts / Failures",
+    "panel_id": 41,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(increase(ariadne_task_runs_total[$__interval]))",
+      "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Ariadne Test Success Rate",
+    "panel_id": 42,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Tests with Failures (24h)",
+    "panel_id": 43,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Namespace CPU Share",
+    "panel_id": 11,
+    "panel_type": "piechart",
+    "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ), 1)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Namespace GPU Share",
+    "panel_id": 12,
+    "panel_type": "piechart",
+    "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Namespace RAM Share",
+    "panel_id": 13,
+    "panel_type": "piechart",
+    "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ), 1)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Worker Node CPU",
+    "panel_id": 14,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Worker Node RAM",
+    "panel_id": 15,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Control plane CPU",
+    "panel_id": 16,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Control plane RAM",
+    "panel_id": 17,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Node Pod Share",
+    "panel_id": 28,
+    "panel_type": "piechart",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Top Nodes by Pod Count",
+    "panel_id": 29,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Cluster Ingress Throughput",
+    "panel_id": 18,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Cluster Egress Throughput",
+    "panel_id": 19,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Intra-Cluster Throughput",
+    "panel_id": 20,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Root Filesystem Usage",
+    "panel_id": 21,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Overview",
+    "panel_title": "Nodes Closest to Full Root Disks",
+    "panel_id": 22,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "overview"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Pods",
+    "panel_title": "Problem Pods",
+    "panel_id": 1,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "pods"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Pods",
+    "panel_title": "CrashLoop / ImagePull",
+    "panel_id": 2,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "pods"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Pods",
+    "panel_title": "Stuck Terminating (>10m)",
+    "panel_id": 3,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "pods"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Pods",
+    "panel_title": "Control Plane Workloads",
+    "panel_id": 4,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "pods"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Pods",
+    "panel_title": "Pods Not Running",
+    "panel_id": 5,
+    "panel_type": "table",
+    "description": "",
+    "tags": [
+      "atlas",
+      "pods"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Pods",
+    "panel_title": "CrashLoop / ImagePull",
+    "panel_id": 6,
+    "panel_type": "table",
+    "description": "",
+    "tags": [
+      "atlas",
+      "pods"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) max by (namespace,pod,container,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Pods",
+    "panel_title": "Terminating >10m",
+    "panel_id": 7,
+    "panel_type": "table",
+    "description": "",
+    "tags": [
+      "atlas",
+      "pods"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(((time() - kube_pod_deletion_timestamp{pod!=\"\"}) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) * on(namespace,pod) group_left(node) kube_pod_info)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Pods",
+    "panel_title": "Node Pod Share",
+    "panel_id": 8,
+    "panel_type": "piechart",
+    "description": "",
+    "tags": [
+      "atlas",
+      "pods"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100"
+    ]
+  },
+  {
+    "dashboard": "Atlas Pods",
+    "panel_title": "Top Nodes by Pod Count",
+    "panel_id": 9,
+    "panel_type": "bargauge",
+    "description": "",
+    "tags": [
+      "atlas",
+      "pods"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Pods",
+    "panel_title": "Namespace Plurality by Node v27",
+    "panel_id": 10,
+    "panel_type": "table",
+    "description": "",
+    "tags": [
+      "atlas",
+      "pods"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) 
(kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 
0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))"
+    ]
+  },
+  {
+    "dashboard": "Atlas Storage",
+    "panel_title": "Astreae Usage",
+    "panel_id": 1,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "storage"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Storage",
+    "panel_title": "Asteria Usage",
+    "panel_id": 2,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "storage"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Storage",
+    "panel_title": "Astreae Free",
+    "panel_id": 3,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "storage"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Storage",
+    "panel_title": "Asteria Free",
+    "panel_id": 4,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "storage"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})"
+    ]
+  },
+  {
+    "dashboard": "Atlas Storage",
+    "panel_title": "Astreae Per-Node Usage",
+    "panel_id": 5,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "storage"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Storage",
+    "panel_title": "Asteria Per-Node Usage",
+    "panel_id": 6,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "storage"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
+    ]
+  },
+  {
+    "dashboard": "Atlas Storage",
+    "panel_title": "Astreae Usage History",
+    "panel_id": 7,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "storage"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Storage",
+    "panel_title": "Asteria Usage History",
+    "panel_id": 8,
+    "panel_type": "timeseries",
+    "description": "",
+    "tags": [
+      "atlas",
+      "storage"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)"
+    ]
+  },
+  {
+    "dashboard": "Atlas Storage",
+    "panel_title": "Maintenance Sweepers Ready",
+    "panel_id": 30,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "storage"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100"
+    ]
+  },
+  {
+    "dashboard": "Atlas Storage",
+    "panel_title": "Maintenance Cron Freshness (s)",
+    "panel_id": 31,
+    "panel_type": "stat",
+    "description": "",
+    "tags": [
+      "atlas",
+      "storage"
+    ],
+    "datasource_uid": "atlas-vm",
+    "datasource_type": "prometheus",
+    "exprs": [
+      "time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=\"image-sweeper\"})"
+    ]
+  }
+]
diff --git a/services/comms/kustomization.yaml b/services/comms/kustomization.yaml
index 33600676..37f681de 100644
--- a/services/comms/kustomization.yaml
+++ b/services/comms/kustomization.yaml
@@ -73,5 +73,6 @@ configMapGenerator:
       - INDEX.md=knowledge/INDEX.md
       - atlas.json=knowledge/catalog/atlas.json
       - atlas-summary.json=knowledge/catalog/atlas-summary.json
+      - metrics.json=knowledge/catalog/metrics.json
       - runbooks.json=knowledge/catalog/runbooks.json
       - atlas-http.mmd=knowledge/diagrams/atlas-http.mmd
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 8edc28dc..e604e65f 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -17,7 +17,7 @@ ROOM_ALIAS = "#othrys:live.bstein.dev"
 OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/")
 MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b-instruct-q4_0")
 API_KEY = os.environ.get("CHAT_API_KEY", "")
-OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "90"))
+OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480"))
 
 KB_DIR = os.environ.get("KB_DIR", "")
 VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428")
@@ -29,6 +29,7 @@ SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev")
 
 MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500"))
 MAX_TOOL_CHARS = int(os.environ.get("ATLASBOT_MAX_TOOL_CHARS", "2500"))
+THINKING_INTERVAL_SEC = int(os.environ.get("ATLASBOT_THINKING_INTERVAL_SEC", "120"))
 
 TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9_.-]{1,}", re.IGNORECASE)
 HOST_RE = re.compile(r"(?i)([a-z0-9-]+(?:\\.[a-z0-9-]+)+)")
@@ -59,8 +60,21 @@ STOPWORDS = {
 }
 
 METRIC_HINT_WORDS = {
+    "bandwidth",
+    "connections",
+    "cpu",
+    "database",
+    "db",
+    "disk",
     "health",
+    "memory",
+    "network",
+    "node",
+    "nodes",
+    "postgres",
     "status",
+    "storage",
+    "usage",
     "down",
     "slow",
     "error",
@@ -157,6 +171,7 @@ def send_msg(token: str, room: str, text: str):
 KB = {"catalog": {}, "runbooks": []}
 _HOST_INDEX: dict[str, list[dict]] = {}
 _NAME_INDEX: set[str] = set()
+_METRIC_INDEX: list[dict[str, Any]] = []
 _NODE_CLASS_INDEX: dict[str, list[str]] = {}
 _NODE_CLASS_RPI4: set[str] = set()
 _NODE_CLASS_RPI5: set[str] = set()
@@ -180,6 +195,7 @@ def load_kb():
         return
     catalog = _load_json_file(os.path.join(KB_DIR, "catalog", "atlas.json")) or {}
     runbooks = _load_json_file(os.path.join(KB_DIR, "catalog", "runbooks.json")) or []
+    metrics = _load_json_file(os.path.join(KB_DIR, "catalog", "metrics.json")) or []
     KB = {"catalog": catalog, "runbooks": runbooks}
 
     host_index: dict[str, list[dict]] = collections.defaultdict(list)
@@ -197,6 +213,7 @@ def load_kb():
         if isinstance(w, dict) and w.get("name"):
             names.add(str(w["name"]).lower())
     _NAME_INDEX = names
+    _METRIC_INDEX = metrics if isinstance(metrics, list) else []
 
     node_classes = _parse_node_classes(runbooks)
     _NODE_CLASS_INDEX = node_classes
@@ -356,6 +373,65 @@ def node_inventory_context(query: str) -> str:
         return ""
     return "\n".join(lines)
 
+def _metric_tokens(entry: dict[str, Any]) -> str:
+    parts: list[str] = []
+    for key in ("panel_title", "dashboard", "description"):
+        val = entry.get(key)
+        if isinstance(val, str) and val:
+            parts.append(val.lower())
+    tags = entry.get("tags")
+    if isinstance(tags, list):
+        parts.extend(str(t).lower() for t in tags if t)
+    return " ".join(parts)
+
+def metrics_lookup(query: str, limit: int = 3) -> list[dict[str, Any]]:
+    q_tokens = _tokens(query)
+    if not q_tokens or not _METRIC_INDEX:
+        return []
+    scored: list[tuple[int, dict[str, Any]]] = []
+    for entry in _METRIC_INDEX:
+        if not isinstance(entry, dict):
+            continue
+        hay = _metric_tokens(entry)
+        if not hay:
+            continue
+        score = 0
+        for t in set(q_tokens):
+            if t in hay:
+                score += 2 if t in (entry.get("panel_title") or "").lower() else 1
+        if score:
+            scored.append((score, entry))
+    scored.sort(key=lambda item: item[0], reverse=True)
+    return [entry for _, entry in scored[:limit]]
+
+def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]:
+    if not allow_tools:
+        return "", ""
+    lower = (prompt or "").lower()
+    if not any(word in lower for word in METRIC_HINT_WORDS):
+        return "", ""
+    matches = metrics_lookup(prompt, limit=1)
+    if not matches:
+        return "", ""
+    entry = matches[0]
+    exprs = entry.get("exprs") if isinstance(entry.get("exprs"), list) else []
+    if not exprs:
+        return "", ""
+    rendered_parts: list[str] = []
+    for expr in exprs[:2]:
+        res = vm_query(expr, timeout=20)
+        rendered = vm_render_result(res, limit=10)
+        if rendered:
+            rendered_parts.append(rendered)
+    if not rendered_parts:
+        return "", ""
+    dashboard = entry.get("dashboard") or "dashboard"
+    panel = entry.get("panel_title") or "panel"
+    summary = "\n".join(rendered_parts)
+    context = f"Metrics (from {dashboard} / {panel}):\n{summary}"
+    fallback = f"{panel}: {summary}"
+    return context, fallback
+
 def jetson_nodes_from_kb() -> list[str]:
     for doc in KB.get("runbooks", []):
         if not isinstance(doc, dict):
@@ -777,6 +853,7 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str:
         "Be helpful, direct, and concise. "
         "Prefer answering with exact repo paths and Kubernetes resource names. "
         "Never include or request secret values. "
+        "Do not suggest commands unless explicitly asked. "
         "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. "
         "If the answer is not grounded in the provided context or tool data, say you do not know."
     )
@@ -820,7 +897,17 @@ def ollama_reply_with_thinking(token: str, room: str, hist_key, prompt: str, *,
     thread.start()
     if not done.wait(2.0):
         send_msg(token, room, "Thinking…")
-        done.wait()
+        prompt_hint = " ".join((prompt or "").split())
+        if len(prompt_hint) > 160:
+            prompt_hint = prompt_hint[:157] + "…"
+        heartbeat = max(10, THINKING_INTERVAL_SEC)
+        next_heartbeat = time.monotonic() + heartbeat
+        while not done.wait(max(0, next_heartbeat - time.monotonic())):
+            if prompt_hint:
+                send_msg(token, room, f"Still thinking about: {prompt_hint} (gathering context)")
+            else:
+                send_msg(token, room, "Still thinking (gathering context)…")
+            next_heartbeat += heartbeat
     thread.join(timeout=1)
     return result["reply"] or fallback or "Model backend is busy. Try again in a moment."
 
@@ -937,9 +1024,15 @@ def sync_loop(token: str, room_id: str):
                     rendered = vm_render_result(res, limit=15) or "(no results)"
                     extra = "VictoriaMetrics (PromQL result):\n" + rendered
                     context = (context + "\n\n" + extra).strip() if context else extra
+                metrics_context, metrics_fallback = metrics_query_context(body, allow_tools=allow_tools)
+                if metrics_context:
+                    context = (context + "\n\n" + metrics_context).strip() if context else metrics_context
+
                 fallback = ""
                 if "node" in lower_body or "cluster" in lower_body:
                     fallback = node_inventory_answer("Atlas", lower_body)
+                if metrics_fallback and not fallback:
+                    fallback = metrics_fallback
                 reply = ollama_reply_with_thinking(
                     token,
                     rid,

From 349688529409927eddef4d30a35131a7573b0136 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 15:34:52 -0300
Subject: [PATCH 240/416] atlasbot: load metrics index and answer in rooms

---
 services/comms/scripts/atlasbot/bot.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index e604e65f..ff9019e7 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -188,7 +188,7 @@ def _load_json_file(path: str) -> Any | None:
         return None
 
 def load_kb():
-    global KB, _HOST_INDEX, _NAME_INDEX
+    global KB, _HOST_INDEX, _NAME_INDEX, _METRIC_INDEX
     global _NODE_CLASS_INDEX, _NODE_CLASS_RPI4, _NODE_CLASS_RPI5, _NODE_CLASS_AMD64, _NODE_CLASS_JETSON
     global _NODE_CLASS_EXTERNAL, _NODE_CLASS_NON_RPI
     if not KB_DIR:
@@ -414,6 +414,8 @@ def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]:
     if not matches:
         return "", ""
     entry = matches[0]
+    dashboard = entry.get("dashboard") or "dashboard"
+    panel = entry.get("panel_title") or "panel"
     exprs = entry.get("exprs") if isinstance(entry.get("exprs"), list) else []
     if not exprs:
         return "", ""
@@ -424,9 +426,7 @@ def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]:
         if rendered:
             rendered_parts.append(rendered)
     if not rendered_parts:
-        return "", ""
-    dashboard = entry.get("dashboard") or "dashboard"
-    panel = entry.get("panel_title") or "panel"
+        return "", f"{panel}: matched dashboard panel but VictoriaMetrics did not return data."
     summary = "\n".join(rendered_parts)
     context = f"Metrics (from {dashboard} / {panel}):\n{summary}"
     fallback = f"{panel}: {summary}"
@@ -998,8 +998,9 @@ def sync_loop(token: str, room_id: str):
                         send_msg(token, rid, names_summary)
                         continue
 
-                # Only do live cluster/metrics introspection in DMs.
+                # Only do live cluster introspection in DMs; metrics can be answered when mentioned.
                 allow_tools = is_dm
+                allow_metrics = is_dm or mentioned
 
                 promql = ""
                 if allow_tools:
@@ -1024,7 +1025,7 @@ def sync_loop(token: str, room_id: str):
                     rendered = vm_render_result(res, limit=15) or "(no results)"
                     extra = "VictoriaMetrics (PromQL result):\n" + rendered
                     context = (context + "\n\n" + extra).strip() if context else extra
-                metrics_context, metrics_fallback = metrics_query_context(body, allow_tools=allow_tools)
+                metrics_context, metrics_fallback = metrics_query_context(body, allow_tools=allow_metrics)
                 if metrics_context:
                     context = (context + "\n\n" + metrics_context).strip() if context else metrics_context
 

From b92bd79c98e41ad8853f46e037a86e223330f8e9 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 15:54:00 -0300
Subject: [PATCH 241/416] atlasbot: recognize prefix mentions

---
 services/comms/atlasbot-deployment.yaml |  2 +-
 services/comms/scripts/atlasbot/bot.py  | 12 ++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 031abb8d..aa91dcb1 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -78,7 +78,7 @@ spec:
             - name: BOT_USER
               value: atlasbot
             - name: BOT_MENTIONS
-              value: atlasbot
+              value: atlasbot,aatlasbot
             - name: OLLAMA_URL
               value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/
             - name: OLLAMA_MODEL
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index ff9019e7..f4182cd4 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -119,9 +119,21 @@ def normalize_user_id(token: str) -> str:
 
 MENTION_USER_IDS = {normalize_user_id(t).lower() for t in MENTION_TOKENS if normalize_user_id(t)}
 
+def _body_mentions_token(body: str) -> bool:
+    lower = (body or "").strip().lower()
+    if not lower:
+        return False
+    for token in MENTION_LOCALPARTS:
+        for prefix in (token, f"@{token}"):
+            if lower.startswith(prefix + ":") or lower.startswith(prefix + ",") or lower.startswith(prefix + " "):
+                return True
+    return False
+
 def is_mentioned(content: dict, body: str) -> bool:
     if MENTION_RE.search(body or "") is not None:
         return True
+    if _body_mentions_token(body or ""):
+        return True
     mentions = content.get("m.mentions", {})
     user_ids = mentions.get("user_ids", [])
     if not isinstance(user_ids, list):

From 2398e287537b8c8da7c6e11ec7a1b09c3e1060f9 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 18:16:14 -0300
Subject: [PATCH 242/416] atlasbot: improve worker readiness and metrics
 replies

---
 services/comms/atlasbot-deployment.yaml |   2 +-
 services/comms/scripts/atlasbot/bot.py  | 140 +++++++++++++++++++++++-
 2 files changed, 140 insertions(+), 2 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index aa91dcb1..d5d8f06f 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-13
+        checksum/atlasbot-configmap: manual-atlasbot-14
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index f4182cd4..57549b37 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -441,7 +441,7 @@ def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]:
         return "", f"{panel}: matched dashboard panel but VictoriaMetrics did not return data."
     summary = "\n".join(rendered_parts)
     context = f"Metrics (from {dashboard} / {panel}):\n{summary}"
-    fallback = f"{panel}: {summary}"
+    fallback = _metrics_fallback_summary(panel, summary)
     return context, fallback
 
 def jetson_nodes_from_kb() -> list[str]:
@@ -654,6 +654,115 @@ def vm_render_result(res: dict | None, limit: int = 12) -> str:
         out.append(f"- {labels}: {val}")
     return "\n".join(out)
 
+def _parse_metric_lines(summary: str) -> dict[str, str]:
+    parsed: dict[str, str] = {}
+    for line in (summary or "").splitlines():
+        line = line.strip()
+        if not line.startswith("-"):
+            continue
+        try:
+            label, value = line.lstrip("-").split(":", 1)
+        except ValueError:
+            continue
+        parsed[label.strip()] = value.strip()
+    return parsed
+
+def _metrics_fallback_summary(panel: str, summary: str) -> str:
+    parsed = _parse_metric_lines(summary)
+    panel_l = (panel or "").lower()
+    if panel_l.startswith("postgres connections"):
+        used = parsed.get("conn=used")
+        maxv = parsed.get("conn=max")
+        if used and maxv:
+            try:
+                used_i = int(float(used))
+                max_i = int(float(maxv))
+            except ValueError:
+                return f"Postgres connections: {summary}"
+            free = max_i - used_i
+            return f"Postgres connections: {used_i}/{max_i} used ({free} free)."
+    if panel_l.startswith("postgres hottest"):
+        if parsed:
+            label, value = next(iter(parsed.items()))
+            return f"Most Postgres connections: {label} = {value}."
+    return f"{panel}: {summary}"
+
+def _node_ready_status(node: dict) -> bool | None:
+    conditions = node.get("status", {}).get("conditions") or []
+    for cond in conditions if isinstance(conditions, list) else []:
+        if cond.get("type") == "Ready":
+            if cond.get("status") == "True":
+                return True
+            if cond.get("status") == "False":
+                return False
+            return None
+    return None
+
+def _node_is_worker(node: dict) -> bool:
+    labels = (node.get("metadata") or {}).get("labels") or {}
+    if labels.get("node-role.kubernetes.io/control-plane") is not None:
+        return False
+    if labels.get("node-role.kubernetes.io/master") is not None:
+        return False
+    if labels.get("node-role.kubernetes.io/worker") is not None:
+        return True
+    return True
+
+def worker_nodes_status() -> tuple[list[str], list[str]]:
+    try:
+        data = k8s_get("/api/v1/nodes?limit=500")
+    except Exception:
+        return ([], [])
+    items = data.get("items") or []
+    ready_nodes: list[str] = []
+    not_ready_nodes: list[str] = []
+    for node in items if isinstance(items, list) else []:
+        if not _node_is_worker(node):
+            continue
+        name = (node.get("metadata") or {}).get("name") or ""
+        if not name:
+            continue
+        ready = _node_ready_status(node)
+        if ready is True:
+            ready_nodes.append(name)
+        elif ready is False:
+            not_ready_nodes.append(name)
+    return (sorted(ready_nodes), sorted(not_ready_nodes))
+
+def expected_nodes_from_kb() -> set[str]:
+    if not _NODE_CLASS_INDEX:
+        return set()
+    nodes = set().union(*_NODE_CLASS_INDEX.values())
+    return {n for n in nodes if n and n not in _NODE_CLASS_EXTERNAL}
+
+def missing_nodes_answer(cluster_name: str) -> str:
+    expected = expected_nodes_from_kb()
+    if not expected:
+        return ""
+    current = set()
+    try:
+        data = k8s_get("/api/v1/nodes?limit=500")
+        items = data.get("items") or []
+        for node in items if isinstance(items, list) else []:
+            name = (node.get("metadata") or {}).get("name") or ""
+            if name:
+                current.add(name)
+    except Exception:
+        return ""
+    missing = sorted(expected - current)
+    if not missing:
+        return f"{cluster_name}: no missing nodes versus KB inventory."
+    return f"{cluster_name} missing nodes versus KB inventory: {', '.join(missing)}."
+
+def _should_short_circuit(prompt: str, fallback: str) -> bool:
+    if not fallback:
+        return False
+    lower = (prompt or "").lower()
+    for word in ("why", "explain", "architecture", "breakdown", "root cause", "plan"):
+        if word in lower:
+            return False
+    return True
+
 def vm_top_restarts(hours: int = 1) -> str:
     q = f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{hours}h])))"
     res = vm_query(q)
@@ -984,6 +1093,32 @@ def sync_loop(token: str, room_id: str):
                             continue
                         send_msg(token, rid, summary)
                         continue
+                if "worker" in lower_body and "node" in lower_body:
+                    ready_nodes, not_ready_nodes = worker_nodes_status()
+                    total = len(ready_nodes) + len(not_ready_nodes)
+                    if total:
+                        if any(word in lower_body for word in ("ready", "not ready", "unready")):
+                            if not_ready_nodes:
+                                send_msg(
+                                    token,
+                                    rid,
+                                    f"Worker nodes not Ready: {', '.join(not_ready_nodes)}.",
+                                )
+                            else:
+                                send_msg(token, rid, f"All {len(ready_nodes)} worker nodes are Ready.")
+                            continue
+                        if any(word in lower_body for word in ("how many", "should")):
+                            send_msg(
+                                token,
+                                rid,
+                                f"Atlas has {total} worker nodes; {len(ready_nodes)} Ready, {len(not_ready_nodes)} NotReady.",
+                            )
+                            continue
+                if "missing" in lower_body and "node" in lower_body:
+                    missing = missing_nodes_answer("Atlas")
+                    if missing:
+                        send_msg(token, rid, missing)
+                        continue
                 inventory_answer = node_inventory_answer("Atlas", lower_body)
                 if inventory_answer:
                     send_msg(token, rid, inventory_answer)
@@ -1046,6 +1181,9 @@ def sync_loop(token: str, room_id: str):
                     fallback = node_inventory_answer("Atlas", lower_body)
                 if metrics_fallback and not fallback:
                     fallback = metrics_fallback
+                if _should_short_circuit(body, fallback):
+                    send_msg(token, rid, fallback)
+                    continue
                 reply = ollama_reply_with_thinking(
                     token,
                     rid,

From afa5d3bd2928dfce7aea05239c95a57f45795e55 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 18:18:42 -0300
Subject: [PATCH 243/416] atlasbot: improve worker node answers

---
 services/comms/atlasbot-deployment.yaml |  2 +-
 services/comms/scripts/atlasbot/bot.py  | 16 +++++++++++-----
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index d5d8f06f..69aef2f8 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-14
+        checksum/atlasbot-configmap: manual-atlasbot-15
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 57549b37..3b9082d8 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -1097,6 +1097,7 @@ def sync_loop(token: str, room_id: str):
                     ready_nodes, not_ready_nodes = worker_nodes_status()
                     total = len(ready_nodes) + len(not_ready_nodes)
                     if total:
+                        missing_hint = missing_nodes_answer("Atlas")
                         if any(word in lower_body for word in ("ready", "not ready", "unready")):
                             if not_ready_nodes:
                                 send_msg(
@@ -1105,14 +1106,19 @@ def sync_loop(token: str, room_id: str):
                                     f"Worker nodes not Ready: {', '.join(not_ready_nodes)}.",
                                 )
                             else:
-                                send_msg(token, rid, f"All {len(ready_nodes)} worker nodes are Ready.")
+                                msg = f"All {len(ready_nodes)} worker nodes are Ready."
+                                if missing_hint and "no missing" not in missing_hint:
+                                    msg += f" {missing_hint}"
+                                send_msg(token, rid, msg)
                             continue
                         if any(word in lower_body for word in ("how many", "should")):
-                            send_msg(
-                                token,
-                                rid,
-                                f"Atlas has {total} worker nodes; {len(ready_nodes)} Ready, {len(not_ready_nodes)} NotReady.",
+                            msg = (
+                                f"Atlas has {total} worker nodes; "
+                                f"{len(ready_nodes)} Ready, {len(not_ready_nodes)} NotReady."
                             )
+                            if missing_hint and "no missing" not in missing_hint:
+                                msg += f" {missing_hint}"
+                            send_msg(token, rid, msg)
                             continue
                 if "missing" in lower_body and "node" in lower_body:
                     missing = missing_nodes_answer("Atlas")

From 70a095f5d06f0a4ffc05d391a48a6ef5ae77886c Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 18:21:17 -0300
Subject: [PATCH 244/416] atlasbot: clarify worker count limits

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 services/comms/scripts/atlasbot/bot.py  | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 69aef2f8..802021f8 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-15
+        checksum/atlasbot-configmap: manual-atlasbot-16
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 3b9082d8..71537238 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -1118,6 +1118,8 @@ def sync_loop(token: str, room_id: str):
                             )
                             if missing_hint and "no missing" not in missing_hint:
                                 msg += f" {missing_hint}"
+                            elif "should" in lower_body:
+                                msg += " I don’t have an expected worker inventory in the KB; this is the current cluster state."
                             send_msg(token, rid, msg)
                             continue
                 if "missing" in lower_body and "node" in lower_body:

From dce37f403512bac057c9c3427f486ff72cb7741d Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 18:50:23 -0300
Subject: [PATCH 245/416] atlasbot: infer worker expected count from metrics

---
 services/comms/atlasbot-deployment.yaml |  2 +-
 services/comms/scripts/atlasbot/bot.py  | 33 +++++++++++++++++++++++--
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 802021f8..b7843abd 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-16
+        checksum/atlasbot-configmap: manual-atlasbot-17
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 71537238..bd40a9f9 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -191,6 +191,7 @@ _NODE_CLASS_AMD64: set[str] = set()
 _NODE_CLASS_JETSON: set[str] = set()
 _NODE_CLASS_EXTERNAL: set[str] = set()
 _NODE_CLASS_NON_RPI: set[str] = set()
+NODE_REGEX = re.compile(r'node=~"([^"]+)"')
 
 def _load_json_file(path: str) -> Any | None:
     try:
@@ -735,6 +736,23 @@ def expected_nodes_from_kb() -> set[str]:
     nodes = set().union(*_NODE_CLASS_INDEX.values())
     return {n for n in nodes if n and n not in _NODE_CLASS_EXTERNAL}
 
+def expected_worker_nodes_from_metrics() -> list[str]:
+    for entry in _METRIC_INDEX:
+        panel = (entry.get("panel_title") or "").lower()
+        if "worker nodes ready" not in panel:
+            continue
+        exprs = entry.get("exprs") if isinstance(entry.get("exprs"), list) else []
+        for expr in exprs:
+            if not isinstance(expr, str):
+                continue
+            match = NODE_REGEX.search(expr)
+            if not match:
+                continue
+            raw = match.group(1)
+            nodes = [n.strip() for n in raw.split("|") if n.strip()]
+            return sorted(nodes)
+    return []
+
 def missing_nodes_answer(cluster_name: str) -> str:
     expected = expected_nodes_from_kb()
     if not expected:
@@ -1098,6 +1116,8 @@ def sync_loop(token: str, room_id: str):
                     total = len(ready_nodes) + len(not_ready_nodes)
                     if total:
                         missing_hint = missing_nodes_answer("Atlas")
+                        expected_workers = expected_worker_nodes_from_metrics()
+                        expected_total = len(expected_workers) if expected_workers else 0
                         if any(word in lower_body for word in ("ready", "not ready", "unready")):
                             if not_ready_nodes:
                                 send_msg(
@@ -1107,7 +1127,11 @@ def sync_loop(token: str, room_id: str):
                                 )
                             else:
                                 msg = f"All {len(ready_nodes)} worker nodes are Ready."
-                                if missing_hint and "no missing" not in missing_hint:
+                                if expected_total and len(ready_nodes) != expected_total:
+                                    missing = sorted(set(expected_workers) - set(ready_nodes))
+                                    if missing:
+                                        msg += f" Missing: {', '.join(missing)}."
+                                elif missing_hint and "no missing" not in missing_hint:
                                     msg += f" {missing_hint}"
                                 send_msg(token, rid, msg)
                             continue
@@ -1116,7 +1140,12 @@ def sync_loop(token: str, room_id: str):
                                 f"Atlas has {total} worker nodes; "
                                 f"{len(ready_nodes)} Ready, {len(not_ready_nodes)} NotReady."
                             )
-                            if missing_hint and "no missing" not in missing_hint:
+                            if expected_total:
+                                msg += f" Grafana inventory expects {expected_total} workers."
+                                missing = sorted(set(expected_workers) - set(ready_nodes))
+                                if missing:
+                                    msg += f" Missing: {', '.join(missing)}."
+                            elif missing_hint and "no missing" not in missing_hint:
                                 msg += f" {missing_hint}"
                             elif "should" in lower_body:
                                 msg += " I don’t have an expected worker inventory in the KB; this is the current cluster state."

From 7cefb603e10232f77f8e97d410a8063d94d72a32 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 19:01:26 -0300
Subject: [PATCH 246/416] atlasbot: improve missing node inference

---
 services/comms/scripts/atlasbot/bot.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index bd40a9f9..7eb6dc77 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -754,6 +754,15 @@ def expected_worker_nodes_from_metrics() -> list[str]:
     return []
 
 def missing_nodes_answer(cluster_name: str) -> str:
+    expected_workers = expected_worker_nodes_from_metrics()
+    if expected_workers:
+        ready_nodes, not_ready_nodes = worker_nodes_status()
+        current_workers = set(ready_nodes + not_ready_nodes)
+        missing = sorted(set(expected_workers) - current_workers)
+        if not missing:
+            return f"{cluster_name}: no missing worker nodes versus Grafana inventory."
+        return f"{cluster_name} missing worker nodes versus Grafana inventory: {', '.join(missing)}."
+
     expected = expected_nodes_from_kb()
     if not expected:
         return ""
@@ -1173,7 +1182,7 @@ def sync_loop(token: str, room_id: str):
                             continue
                         send_msg(token, rid, summary)
                         continue
-                if re.search(r"\bnode names?\b|\bnodes? named\b|\bnaming\b", lower_body):
+                if re.search(r"\bnode names?\b|\bnodes?\b.*\bnamed\b|\bnaming\b", lower_body):
                     if any(word in lower_body for word in ("cluster", "atlas", "titan")):
                         names_summary = nodes_names_summary("Atlas")
                         if not names_summary:
@@ -1181,6 +1190,14 @@ def sync_loop(token: str, room_id: str):
                             continue
                         send_msg(token, rid, names_summary)
                         continue
+                if re.search(r"\bwhich nodes are ready\b|\bnodes ready\b", lower_body):
+                    ready_nodes, not_ready_nodes = worker_nodes_status()
+                    if ready_nodes:
+                        msg = f"Ready worker nodes ({len(ready_nodes)}): {', '.join(ready_nodes)}."
+                        if not_ready_nodes:
+                            msg += f" Not Ready: {', '.join(not_ready_nodes)}."
+                        send_msg(token, rid, msg)
+                        continue
 
                 # Only do live cluster introspection in DMs; metrics can be answered when mentioned.
                 allow_tools = is_dm

From eae4521a44f74b8cdbf766aecc9b1270dbf80165 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 19:02:54 -0300
Subject: [PATCH 247/416] atlasbot: roll deployment

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index b7843abd..e45d9f3d 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-17
+        checksum/atlasbot-configmap: manual-atlasbot-18
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From f7a73dd9e3bce1b8dfe4dc1f0b986f7af042fa76 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 19:22:28 -0300
Subject: [PATCH 248/416] atlasbot: use live node inventory context

---
 services/comms/scripts/atlasbot/bot.py | 320 +++++++------------------
 1 file changed, 89 insertions(+), 231 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 7eb6dc77..e070eadd 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -184,13 +184,6 @@ KB = {"catalog": {}, "runbooks": []}
 _HOST_INDEX: dict[str, list[dict]] = {}
 _NAME_INDEX: set[str] = set()
 _METRIC_INDEX: list[dict[str, Any]] = []
-_NODE_CLASS_INDEX: dict[str, list[str]] = {}
-_NODE_CLASS_RPI4: set[str] = set()
-_NODE_CLASS_RPI5: set[str] = set()
-_NODE_CLASS_AMD64: set[str] = set()
-_NODE_CLASS_JETSON: set[str] = set()
-_NODE_CLASS_EXTERNAL: set[str] = set()
-_NODE_CLASS_NON_RPI: set[str] = set()
 NODE_REGEX = re.compile(r'node=~"([^"]+)"')
 
 def _load_json_file(path: str) -> Any | None:
@@ -202,8 +195,6 @@ def _load_json_file(path: str) -> Any | None:
 
 def load_kb():
     global KB, _HOST_INDEX, _NAME_INDEX, _METRIC_INDEX
-    global _NODE_CLASS_INDEX, _NODE_CLASS_RPI4, _NODE_CLASS_RPI5, _NODE_CLASS_AMD64, _NODE_CLASS_JETSON
-    global _NODE_CLASS_EXTERNAL, _NODE_CLASS_NON_RPI
     if not KB_DIR:
         return
     catalog = _load_json_file(os.path.join(KB_DIR, "catalog", "atlas.json")) or {}
@@ -228,24 +219,6 @@ def load_kb():
     _NAME_INDEX = names
     _METRIC_INDEX = metrics if isinstance(metrics, list) else []
 
-    node_classes = _parse_node_classes(runbooks)
-    _NODE_CLASS_INDEX = node_classes
-    _NODE_CLASS_RPI4 = set(node_classes.get("rpi4", []))
-    _NODE_CLASS_RPI5 = set(node_classes.get("rpi5", []))
-    _NODE_CLASS_AMD64 = set(node_classes.get("amd64", []))
-    _NODE_CLASS_JETSON = set(node_classes.get("jetson", []))
-    _NODE_CLASS_EXTERNAL = set(node_classes.get("external", []))
-    _NODE_CLASS_NON_RPI = set(
-        sorted(
-            (
-                set().union(*node_classes.values())
-                - _NODE_CLASS_RPI4
-                - _NODE_CLASS_RPI5
-                - _NODE_CLASS_EXTERNAL
-            )
-        )
-    )
-
 def kb_retrieve(query: str, *, limit: int = 3) -> str:
     q = (query or "").strip()
     if not q or not KB.get("runbooks"):
@@ -309,81 +282,92 @@ def _extract_titan_nodes(text: str) -> list[str]:
             names.add(f"titan-{right.lower()}")
     return sorted(names)
 
-def _parse_node_classes(runbooks: list[dict[str, Any]]) -> dict[str, list[str]]:
-    classes: dict[str, list[str]] = {}
-    for doc in runbooks:
-        if not isinstance(doc, dict):
-            continue
-        body = str(doc.get("body") or "")
-        for line in body.splitlines():
-            stripped = line.strip()
-            if "titan-" not in stripped.lower():
-                continue
-            label = ""
-            nodes: list[str] = []
-            if stripped.startswith("-") and ":" in stripped:
-                label, rest = stripped.lstrip("-").split(":", 1)
-                nodes = _extract_titan_nodes(rest)
-                label = label.strip().lower()
-            else:
-                nodes = _extract_titan_nodes(stripped)
-            if not nodes:
-                continue
-            if "jetson" in stripped.lower():
-                classes.setdefault("jetson", nodes)
-            if "amd64" in stripped.lower() or "x86" in stripped.lower():
-                classes.setdefault("amd64", nodes)
-            if "rpi4" in stripped.lower():
-                classes.setdefault("rpi4", nodes)
-            if "rpi5" in stripped.lower():
-                classes.setdefault("rpi5", nodes)
-            if "external" in stripped.lower() or "non-cluster" in stripped.lower():
-                classes.setdefault("external", nodes)
-            if label:
-                classes.setdefault(label, nodes)
-    return {k: sorted(set(v)) for k, v in classes.items()}
+def _node_roles(labels: dict[str, Any]) -> list[str]:
+    roles: list[str] = []
+    for key in labels.keys():
+        if key.startswith("node-role.kubernetes.io/"):
+            role = key.split("/", 1)[-1]
+            if role:
+                roles.append(role)
+    return sorted(set(roles))
 
-def node_inventory_answer(cluster_name: str, query: str) -> str:
-    q = (query or "").lower()
-    if "jetson" in q and _NODE_CLASS_JETSON:
-        names = sorted(_NODE_CLASS_JETSON)
-        return f"{cluster_name} has {len(names)} Jetson nodes: {', '.join(names)}."
-    if "non-raspberry" in q or "non raspberry" in q or "not raspberry" in q:
-        names = sorted(_NODE_CLASS_NON_RPI)
-        if names:
-            return f"{cluster_name} non‑Raspberry Pi nodes: {', '.join(names)}."
-    if "raspberry" in q or "rpi" in q:
-        if "rpi4" in q and _NODE_CLASS_RPI4:
-            names = sorted(_NODE_CLASS_RPI4)
-            return f"{cluster_name} rpi4 nodes: {', '.join(names)}."
-        if "rpi5" in q and _NODE_CLASS_RPI5:
-            names = sorted(_NODE_CLASS_RPI5)
-            return f"{cluster_name} rpi5 nodes: {', '.join(names)}."
-        names = sorted(_NODE_CLASS_RPI4 | _NODE_CLASS_RPI5)
-        if names:
-            return f"{cluster_name} Raspberry Pi nodes: {', '.join(names)}."
-    if ("amd64" in q or "x86" in q) and _NODE_CLASS_AMD64:
-        names = sorted(_NODE_CLASS_AMD64)
-        return f"{cluster_name} amd64 nodes: {', '.join(names)}."
-    return ""
+def _hardware_class(labels: dict[str, Any]) -> str:
+    if str(labels.get("jetson") or "").lower() == "true":
+        return "jetson"
+    hardware = (labels.get("hardware") or "").strip().lower()
+    if hardware in ("rpi4", "rpi5"):
+        return hardware
+    arch = labels.get("kubernetes.io/arch") or labels.get("beta.kubernetes.io/arch") or ""
+    if arch == "amd64":
+        return "amd64"
+    if arch == "arm64":
+        return "arm64-unknown"
+    return "unknown"
+
+def node_inventory_live() -> list[dict[str, Any]]:
+    try:
+        data = k8s_get("/api/v1/nodes?limit=500")
+    except Exception:
+        return []
+    items = data.get("items") or []
+    inventory: list[dict[str, Any]] = []
+    for node in items if isinstance(items, list) else []:
+        meta = node.get("metadata") or {}
+        labels = meta.get("labels") or {}
+        name = meta.get("name") or ""
+        if not name:
+            continue
+        inventory.append(
+            {
+                "name": name,
+                "arch": labels.get("kubernetes.io/arch") or labels.get("beta.kubernetes.io/arch") or "",
+                "hardware": _hardware_class(labels),
+                "roles": _node_roles(labels),
+                "ready": _node_ready_status(node),
+            }
+        )
+    return sorted(inventory, key=lambda item: item["name"])
+
+def _group_nodes(inventory: list[dict[str, Any]]) -> dict[str, list[str]]:
+    grouped: dict[str, list[str]] = collections.defaultdict(list)
+    for node in inventory:
+        grouped[node.get("hardware") or "unknown"].append(node["name"])
+    return {k: sorted(v) for k, v in grouped.items()}
 
 def node_inventory_context(query: str) -> str:
     q = (query or "").lower()
-    if not any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "x86", "cluster")):
+    if not any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster")):
         return ""
-    lines: list[str] = ["Node inventory (KB):"]
-    if _NODE_CLASS_RPI5:
-        lines.append(f"- rpi5: {', '.join(sorted(_NODE_CLASS_RPI5))}")
-    if _NODE_CLASS_RPI4:
-        lines.append(f"- rpi4: {', '.join(sorted(_NODE_CLASS_RPI4))}")
-    if _NODE_CLASS_JETSON:
-        lines.append(f"- jetson: {', '.join(sorted(_NODE_CLASS_JETSON))}")
-    if _NODE_CLASS_AMD64:
-        lines.append(f"- amd64: {', '.join(sorted(_NODE_CLASS_AMD64))}")
-    if _NODE_CLASS_EXTERNAL:
-        lines.append(f"- external: {', '.join(sorted(_NODE_CLASS_EXTERNAL))}")
-    if len(lines) == 1:
+    inventory = node_inventory_live()
+    if not inventory:
         return ""
+    groups = _group_nodes(inventory)
+    total = len(inventory)
+    ready = sum(1 for node in inventory if node.get("ready") is True)
+    not_ready = sum(1 for node in inventory if node.get("ready") is False)
+    lines: list[str] = [
+        "Node inventory (live):",
+        f"- total: {total}, ready: {ready}, not ready: {not_ready}",
+    ]
+    for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"):
+        if key in groups:
+            lines.append(f"- {key}: {', '.join(groups[key])}")
+    non_rpi = sorted(set(groups.get("jetson", [])) | set(groups.get("amd64", [])))
+    if non_rpi:
+        lines.append(f"- non_raspberry_pi (derived): {', '.join(non_rpi)}")
+    unknowns = groups.get("arm64-unknown", []) + groups.get("unknown", [])
+    if unknowns:
+        lines.append("- note: nodes labeled arm64-unknown/unknown may still be Raspberry Pi unless tagged.")
+    expected_workers = expected_worker_nodes_from_metrics()
+    if expected_workers:
+        ready_workers, not_ready_workers = worker_nodes_status()
+        missing = sorted(set(expected_workers) - set(ready_workers + not_ready_workers))
+        lines.append(f"- expected_workers (grafana): {', '.join(expected_workers)}")
+        lines.append(f"- workers_ready: {', '.join(ready_workers)}")
+        if not_ready_workers:
+            lines.append(f"- workers_not_ready: {', '.join(not_ready_workers)}")
+        if missing:
+            lines.append(f"- workers_missing (derived): {', '.join(missing)}")
     return "\n".join(lines)
 
 def _metric_tokens(entry: dict[str, Any]) -> str:
@@ -730,12 +714,6 @@ def worker_nodes_status() -> tuple[list[str], list[str]]:
             not_ready_nodes.append(name)
     return (sorted(ready_nodes), sorted(not_ready_nodes))
 
-def expected_nodes_from_kb() -> set[str]:
-    if not _NODE_CLASS_INDEX:
-        return set()
-    nodes = set().union(*_NODE_CLASS_INDEX.values())
-    return {n for n in nodes if n and n not in _NODE_CLASS_EXTERNAL}
-
 def expected_worker_nodes_from_metrics() -> list[str]:
     for entry in _METRIC_INDEX:
         panel = (entry.get("panel_title") or "").lower()
@@ -753,42 +731,13 @@ def expected_worker_nodes_from_metrics() -> list[str]:
             return sorted(nodes)
     return []
 
-def missing_nodes_answer(cluster_name: str) -> str:
-    expected_workers = expected_worker_nodes_from_metrics()
-    if expected_workers:
-        ready_nodes, not_ready_nodes = worker_nodes_status()
-        current_workers = set(ready_nodes + not_ready_nodes)
-        missing = sorted(set(expected_workers) - current_workers)
-        if not missing:
-            return f"{cluster_name}: no missing worker nodes versus Grafana inventory."
-        return f"{cluster_name} missing worker nodes versus Grafana inventory: {', '.join(missing)}."
-
-    expected = expected_nodes_from_kb()
-    if not expected:
+def _context_fallback(context: str) -> str:
+    if not context:
         return ""
-    current = set()
-    try:
-        data = k8s_get("/api/v1/nodes?limit=500")
-        items = data.get("items") or []
-        for node in items if isinstance(items, list) else []:
-            name = (node.get("metadata") or {}).get("name") or ""
-            if name:
-                current.add(name)
-    except Exception:
-        return ""
-    missing = sorted(expected - current)
-    if not missing:
-        return f"{cluster_name}: no missing nodes versus KB inventory."
-    return f"{cluster_name} missing nodes versus KB inventory: {', '.join(missing)}."
-
-def _should_short_circuit(prompt: str, fallback: str) -> bool:
-    if not fallback:
-        return False
-    lower = (prompt or "").lower()
-    for word in ("why", "explain", "architecture", "breakdown", "root cause", "plan"):
-        if word in lower:
-            return False
-    return True
+    trimmed = context.strip()
+    if len(trimmed) > MAX_TOOL_CHARS:
+        trimmed = trimmed[: MAX_TOOL_CHARS - 3].rstrip() + "..."
+    return "I couldn’t reach the model backend. Here is the data I found:\n" + trimmed
 
 def vm_top_restarts(hours: int = 1) -> str:
     q = f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{hours}h])))"
@@ -1112,92 +1061,6 @@ def sync_loop(token: str, room_id: str):
                     continue
 
                 lower_body = body.lower()
-                if re.search(r"\bhow many nodes\b|\bnode count\b|\bnumber of nodes\b", lower_body):
-                    if any(word in lower_body for word in ("cluster", "atlas", "titan")):
-                        summary = nodes_summary("Atlas")
-                        if not summary:
-                            send_msg(token, rid, "I couldn’t reach the cluster API to count nodes. Try again in a moment.")
-                            continue
-                        send_msg(token, rid, summary)
-                        continue
-                if "worker" in lower_body and "node" in lower_body:
-                    ready_nodes, not_ready_nodes = worker_nodes_status()
-                    total = len(ready_nodes) + len(not_ready_nodes)
-                    if total:
-                        missing_hint = missing_nodes_answer("Atlas")
-                        expected_workers = expected_worker_nodes_from_metrics()
-                        expected_total = len(expected_workers) if expected_workers else 0
-                        if any(word in lower_body for word in ("ready", "not ready", "unready")):
-                            if not_ready_nodes:
-                                send_msg(
-                                    token,
-                                    rid,
-                                    f"Worker nodes not Ready: {', '.join(not_ready_nodes)}.",
-                                )
-                            else:
-                                msg = f"All {len(ready_nodes)} worker nodes are Ready."
-                                if expected_total and len(ready_nodes) != expected_total:
-                                    missing = sorted(set(expected_workers) - set(ready_nodes))
-                                    if missing:
-                                        msg += f" Missing: {', '.join(missing)}."
-                                elif missing_hint and "no missing" not in missing_hint:
-                                    msg += f" {missing_hint}"
-                                send_msg(token, rid, msg)
-                            continue
-                        if any(word in lower_body for word in ("how many", "should")):
-                            msg = (
-                                f"Atlas has {total} worker nodes; "
-                                f"{len(ready_nodes)} Ready, {len(not_ready_nodes)} NotReady."
-                            )
-                            if expected_total:
-                                msg += f" Grafana inventory expects {expected_total} workers."
-                                missing = sorted(set(expected_workers) - set(ready_nodes))
-                                if missing:
-                                    msg += f" Missing: {', '.join(missing)}."
-                            elif missing_hint and "no missing" not in missing_hint:
-                                msg += f" {missing_hint}"
-                            elif "should" in lower_body:
-                                msg += " I don’t have an expected worker inventory in the KB; this is the current cluster state."
-                            send_msg(token, rid, msg)
-                            continue
-                if "missing" in lower_body and "node" in lower_body:
-                    missing = missing_nodes_answer("Atlas")
-                    if missing:
-                        send_msg(token, rid, missing)
-                        continue
-                inventory_answer = node_inventory_answer("Atlas", lower_body)
-                if inventory_answer:
-                    send_msg(token, rid, inventory_answer)
-                    continue
-                if "node" in lower_body and any(word in lower_body for word in ("arm64", "aarch64", "amd64", "x86_64", "x86-64")):
-                    if any(word in lower_body for word in ("cluster", "atlas", "titan")):
-                        arch = "arm64" if "arm64" in lower_body or "aarch64" in lower_body else "amd64"
-                        summary = nodes_arch_summary("Atlas", arch)
-                        if not summary:
-                            send_msg(
-                                token,
-                                rid,
-                                "I couldn’t reach the cluster API to count nodes by architecture. Try again in a moment.",
-                            )
-                            continue
-                        send_msg(token, rid, summary)
-                        continue
-                if re.search(r"\bnode names?\b|\bnodes?\b.*\bnamed\b|\bnaming\b", lower_body):
-                    if any(word in lower_body for word in ("cluster", "atlas", "titan")):
-                        names_summary = nodes_names_summary("Atlas")
-                        if not names_summary:
-                            send_msg(token, rid, "I couldn’t reach the cluster API to list node names. Try again in a moment.")
-                            continue
-                        send_msg(token, rid, names_summary)
-                        continue
-                if re.search(r"\bwhich nodes are ready\b|\bnodes ready\b", lower_body):
-                    ready_nodes, not_ready_nodes = worker_nodes_status()
-                    if ready_nodes:
-                        msg = f"Ready worker nodes ({len(ready_nodes)}): {', '.join(ready_nodes)}."
-                        if not_ready_nodes:
-                            msg += f" Not Ready: {', '.join(not_ready_nodes)}."
-                        send_msg(token, rid, msg)
-                        continue
 
                 # Only do live cluster introspection in DMs; metrics can be answered when mentioned.
                 allow_tools = is_dm
@@ -1230,14 +1093,9 @@ def sync_loop(token: str, room_id: str):
                 if metrics_context:
                     context = (context + "\n\n" + metrics_context).strip() if context else metrics_context
 
-                fallback = ""
-                if "node" in lower_body or "cluster" in lower_body:
-                    fallback = node_inventory_answer("Atlas", lower_body)
-                if metrics_fallback and not fallback:
-                    fallback = metrics_fallback
-                if _should_short_circuit(body, fallback):
-                    send_msg(token, rid, fallback)
-                    continue
+                fallback = metrics_fallback or ""
+                if not fallback and context:
+                    fallback = _context_fallback(context)
                 reply = ollama_reply_with_thinking(
                     token,
                     rid,

From e8ff0a5c223d7685edea42e0bcb7bc718d6e9002 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 19:24:03 -0300
Subject: [PATCH 249/416] atlasbot: reload for live inventory

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index e45d9f3d..4d5b31cc 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-18
+        checksum/atlasbot-configmap: manual-atlasbot-19
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From fa08bd34f36fdde4727ec22b82728d20285d3ba1 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 19:29:26 -0300
Subject: [PATCH 250/416] atlasbot: answer from live inventory

---
 services/comms/scripts/atlasbot/bot.py | 123 +++++++++++++++++++++++--
 1 file changed, 116 insertions(+), 7 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index e070eadd..6fc654bd 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -334,11 +334,12 @@ def _group_nodes(inventory: list[dict[str, Any]]) -> dict[str, list[str]]:
         grouped[node.get("hardware") or "unknown"].append(node["name"])
     return {k: sorted(v) for k, v in grouped.items()}
 
-def node_inventory_context(query: str) -> str:
+def node_inventory_context(query: str, inventory: list[dict[str, Any]] | None = None) -> str:
     q = (query or "").lower()
     if not any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster")):
         return ""
-    inventory = node_inventory_live()
+    if inventory is None:
+        inventory = node_inventory_live()
     if not inventory:
         return ""
     groups = _group_nodes(inventory)
@@ -370,6 +371,101 @@ def node_inventory_context(query: str) -> str:
             lines.append(f"- workers_missing (derived): {', '.join(missing)}")
     return "\n".join(lines)
 
+def node_inventory_for_prompt(prompt: str) -> list[dict[str, Any]]:
+    q = (prompt or "").lower()
+    if any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster", "worker")):
+        return node_inventory_live()
+    return []
+
+def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]:
+    names = [node["name"] for node in inventory]
+    ready = [node["name"] for node in inventory if node.get("ready") is True]
+    not_ready = [node["name"] for node in inventory if node.get("ready") is False]
+    groups = _group_nodes(inventory)
+    return {
+        "names": sorted(names),
+        "ready": sorted(ready),
+        "not_ready": sorted(not_ready),
+        "groups": groups,
+    }
+
+def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_summary: str) -> str:
+    q = (prompt or "").lower()
+    if metrics_summary and any(word in q for word in ("postgres", "connection", "connections", "db")):
+        return metrics_summary
+
+    if not inventory:
+        return ""
+
+    sets = _inventory_sets(inventory)
+    names = sets["names"]
+    ready = sets["ready"]
+    not_ready = sets["not_ready"]
+    groups = sets["groups"]
+    total = len(names)
+
+    for node in _extract_titan_nodes(q):
+        if node and ("is" in q or "part of" in q or "in atlas" in q or "in cluster" in q):
+            if node in names:
+                return f"Yes. {node} is in the Atlas cluster."
+            return f"No. {node} is not in the Atlas cluster."
+
+    if any(word in q for word in ("how many", "count", "number")) and "node" in q and "worker" not in q:
+        return f"Atlas has {total} nodes; {len(ready)} ready, {len(not_ready)} not ready."
+
+    if "node names" in q or ("nodes" in q and "named" in q) or "naming" in q:
+        return "Atlas node names: " + ", ".join(names) + "."
+
+    if "ready" in q and "node" in q and "worker" in q:
+        if "not ready" in q or "unready" in q or "down" in q:
+            return "Worker nodes not ready: " + (", ".join(not_ready) if not_ready else "none") + "."
+        return "Ready worker nodes ({}): {}.".format(len(ready), ", ".join(ready))
+
+    if "worker" in q and any(word in q for word in ("missing", "expected", "should")):
+        expected_workers = expected_worker_nodes_from_metrics()
+        missing = sorted(set(expected_workers) - set(ready + not_ready)) if expected_workers else []
+        if "missing" in q and missing:
+            return "Missing worker nodes: " + ", ".join(missing) + "."
+        if expected_workers:
+            msg = f"Grafana inventory expects {len(expected_workers)} workers."
+            if missing:
+                msg += f" Missing: {', '.join(missing)}."
+            return msg
+        return "No expected worker inventory found; using live cluster state."
+
+    if "worker" in q and "node" in q and "ready" not in q and "missing" not in q:
+        return f"Worker nodes: {len(ready)} ready, {len(not_ready)} not ready."
+
+    if "jetson" in q:
+        jets = groups.get("jetson", [])
+        return f"Jetson nodes: {', '.join(jets)}." if jets else "No Jetson nodes found."
+
+    if "amd64" in q or "x86" in q:
+        amd = groups.get("amd64", [])
+        return f"amd64 nodes: {', '.join(amd)}." if amd else "No amd64 nodes found."
+
+    if "rpi4" in q:
+        rpi4 = groups.get("rpi4", [])
+        return f"rpi4 nodes: {', '.join(rpi4)}." if rpi4 else "No rpi4 nodes found."
+
+    if "rpi5" in q:
+        rpi5 = groups.get("rpi5", [])
+        return f"rpi5 nodes: {', '.join(rpi5)}." if rpi5 else "No rpi5 nodes found."
+
+    if "raspberry" in q or "rpi" in q:
+        rpi = sorted(set(groups.get("rpi4", [])) | set(groups.get("rpi5", [])))
+        return f"Raspberry Pi nodes: {', '.join(rpi)}." if rpi else "No Raspberry Pi nodes found."
+
+    if "non-raspberry" in q or "non raspberry" in q or "not raspberry" in q:
+        non_rpi = sorted(set(groups.get("jetson", [])) | set(groups.get("amd64", [])))
+        return f"Non‑Raspberry Pi nodes: {', '.join(non_rpi)}." if non_rpi else "No non‑Raspberry Pi nodes found."
+
+    if "arm64-unknown" in q or "unknown" in q:
+        unknown = sorted(set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", [])))
+        return f"Unknown hardware nodes: {', '.join(unknown)}." if unknown else "No unknown hardware labels."
+
+    return ""
+
 def _metric_tokens(entry: dict[str, Any]) -> str:
     parts: list[str] = []
     for key in ("panel_title", "dashboard", "description"):
@@ -900,7 +996,13 @@ history = collections.defaultdict(list)  # (room_id, sender|None) -> list[str] (
 def key_for(room_id: str, sender: str, is_dm: bool):
     return (room_id, None) if is_dm else (room_id, sender)
 
-def build_context(prompt: str, *, allow_tools: bool, targets: list[tuple[str, str]]) -> str:
+def build_context(
+    prompt: str,
+    *,
+    allow_tools: bool,
+    targets: list[tuple[str, str]],
+    inventory: list[dict[str, Any]] | None = None,
+) -> str:
     parts: list[str] = []
 
     kb = kb_retrieve(prompt)
@@ -911,9 +1013,9 @@ def build_context(prompt: str, *, allow_tools: bool, targets: list[tuple[str, st
     if endpoints:
         parts.append(endpoints)
 
-    inventory = node_inventory_context(prompt)
-    if inventory:
-        parts.append(inventory)
+    node_ctx = node_inventory_context(prompt, inventory)
+    if node_ctx:
+        parts.append(node_ctx)
 
     if allow_tools:
         # Scope pod summaries to relevant namespaces/workloads when possible.
@@ -1083,7 +1185,8 @@ def sync_loop(token: str, room_id: str):
                             if isinstance(w, dict) and w.get("name"):
                                 targets.append((ns, str(w["name"])))
 
-                context = build_context(body, allow_tools=allow_tools, targets=targets)
+                inventory = node_inventory_for_prompt(body)
+                context = build_context(body, allow_tools=allow_tools, targets=targets, inventory=inventory)
                 if allow_tools and promql:
                     res = vm_query(promql, timeout=20)
                     rendered = vm_render_result(res, limit=15) or "(no results)"
@@ -1096,6 +1199,12 @@ def sync_loop(token: str, room_id: str):
                 fallback = metrics_fallback or ""
                 if not fallback and context:
                     fallback = _context_fallback(context)
+
+                structured = structured_answer(body, inventory=inventory, metrics_summary=metrics_fallback or "")
+                if structured:
+                    send_msg(token, rid, structured)
+                    continue
+
                 reply = ollama_reply_with_thinking(
                     token,
                     rid,

From f09035e900a79f11c20742aea4bc25e0c8cd8aff Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 19:31:07 -0300
Subject: [PATCH 251/416] atlasbot: reload inventory answers

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 4d5b31cc..57705ecc 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-19
+        checksum/atlasbot-configmap: manual-atlasbot-20
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From a7ff5093da485007184eaadce6afc165486a0c0c Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 19:34:19 -0300
Subject: [PATCH 252/416] atlasbot: generalize inventory answers

---
 services/comms/scripts/atlasbot/bot.py | 80 ++++++++++++++++----------
 1 file changed, 50 insertions(+), 30 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 6fc654bd..d06645a5 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -382,11 +382,18 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]:
     ready = [node["name"] for node in inventory if node.get("ready") is True]
     not_ready = [node["name"] for node in inventory if node.get("ready") is False]
     groups = _group_nodes(inventory)
+    workers = [node for node in inventory if "worker" in (node.get("roles") or [])]
+    worker_names = [node["name"] for node in workers]
+    worker_ready = [node["name"] for node in workers if node.get("ready") is True]
+    worker_not_ready = [node["name"] for node in workers if node.get("ready") is False]
     return {
         "names": sorted(names),
         "ready": sorted(ready),
         "not_ready": sorted(not_ready),
         "groups": groups,
+        "worker_names": sorted(worker_names),
+        "worker_ready": sorted(worker_ready),
+        "worker_not_ready": sorted(worker_not_ready),
     }
 
 def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_summary: str) -> str:
@@ -402,6 +409,9 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s
     ready = sets["ready"]
     not_ready = sets["not_ready"]
     groups = sets["groups"]
+    worker_names = sets["worker_names"]
+    worker_ready = sets["worker_ready"]
+    worker_not_ready = sets["worker_not_ready"]
     total = len(names)
 
     for node in _extract_titan_nodes(q):
@@ -410,31 +420,12 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s
                 return f"Yes. {node} is in the Atlas cluster."
             return f"No. {node} is not in the Atlas cluster."
 
-    if any(word in q for word in ("how many", "count", "number")) and "node" in q and "worker" not in q:
-        return f"Atlas has {total} nodes; {len(ready)} ready, {len(not_ready)} not ready."
-
-    if "node names" in q or ("nodes" in q and "named" in q) or "naming" in q:
-        return "Atlas node names: " + ", ".join(names) + "."
-
-    if "ready" in q and "node" in q and "worker" in q:
-        if "not ready" in q or "unready" in q or "down" in q:
-            return "Worker nodes not ready: " + (", ".join(not_ready) if not_ready else "none") + "."
-        return "Ready worker nodes ({}): {}.".format(len(ready), ", ".join(ready))
-
-    if "worker" in q and any(word in q for word in ("missing", "expected", "should")):
-        expected_workers = expected_worker_nodes_from_metrics()
-        missing = sorted(set(expected_workers) - set(ready + not_ready)) if expected_workers else []
-        if "missing" in q and missing:
-            return "Missing worker nodes: " + ", ".join(missing) + "."
-        if expected_workers:
-            msg = f"Grafana inventory expects {len(expected_workers)} workers."
-            if missing:
-                msg += f" Missing: {', '.join(missing)}."
-            return msg
-        return "No expected worker inventory found; using live cluster state."
-
-    if "worker" in q and "node" in q and "ready" not in q and "missing" not in q:
-        return f"Worker nodes: {len(ready)} ready, {len(not_ready)} not ready."
+    if "non-raspberry" in q or "non raspberry" in q or "not raspberry" in q:
+        non_rpi = sorted(set(groups.get("jetson", [])) | set(groups.get("amd64", [])))
+        if "besides" in q:
+            amd = groups.get("amd64", [])
+            return f"Non‑Raspberry Pi nodes (excluding Jetson): {', '.join(amd)}." if amd else "No non‑Raspberry Pi nodes outside Jetson."
+        return f"Non‑Raspberry Pi nodes: {', '.join(non_rpi)}." if non_rpi else "No non‑Raspberry Pi nodes found."
 
     if "jetson" in q:
         jets = groups.get("jetson", [])
@@ -446,24 +437,53 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s
 
     if "rpi4" in q:
         rpi4 = groups.get("rpi4", [])
+        if any(word in q for word in ("how many", "count", "number")):
+            return f"Atlas has {len(rpi4)} rpi4 nodes."
         return f"rpi4 nodes: {', '.join(rpi4)}." if rpi4 else "No rpi4 nodes found."
 
     if "rpi5" in q:
         rpi5 = groups.get("rpi5", [])
+        if any(word in q for word in ("how many", "count", "number")):
+            return f"Atlas has {len(rpi5)} rpi5 nodes."
         return f"rpi5 nodes: {', '.join(rpi5)}." if rpi5 else "No rpi5 nodes found."
 
     if "raspberry" in q or "rpi" in q:
         rpi = sorted(set(groups.get("rpi4", [])) | set(groups.get("rpi5", [])))
+        if any(word in q for word in ("how many", "count", "number")):
+            return f"Atlas has {len(rpi)} Raspberry Pi nodes."
         return f"Raspberry Pi nodes: {', '.join(rpi)}." if rpi else "No Raspberry Pi nodes found."
 
-    if "non-raspberry" in q or "non raspberry" in q or "not raspberry" in q:
-        non_rpi = sorted(set(groups.get("jetson", [])) | set(groups.get("amd64", [])))
-        return f"Non‑Raspberry Pi nodes: {', '.join(non_rpi)}." if non_rpi else "No non‑Raspberry Pi nodes found."
-
-    if "arm64-unknown" in q or "unknown" in q:
+    if "arm64-unknown" in q or "unknown" in q or "no hardware" in q:
         unknown = sorted(set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", [])))
         return f"Unknown hardware nodes: {', '.join(unknown)}." if unknown else "No unknown hardware labels."
 
+    if "worker" in q and "node" in q:
+        if any(word in q for word in ("missing", "expected", "should")):
+            expected_workers = expected_worker_nodes_from_metrics()
+            missing = sorted(set(expected_workers) - set(worker_ready + worker_not_ready)) if expected_workers else []
+            if "missing" in q and missing:
+                return "Missing worker nodes: " + ", ".join(missing) + "."
+            if expected_workers:
+                msg = f"Grafana inventory expects {len(expected_workers)} workers."
+                if missing:
+                    msg += f" Missing: {', '.join(missing)}."
+                return msg
+            return "No expected worker inventory found; using live cluster state."
+        if "not ready" in q or "unready" in q or "down" in q:
+            return "Worker nodes not ready: " + (", ".join(worker_not_ready) if worker_not_ready else "none") + "."
+        if any(word in q for word in ("how many", "count", "number")):
+            return f"Worker nodes: {len(worker_ready)} ready, {len(worker_not_ready)} not ready."
+        return "Ready worker nodes ({}): {}.".format(len(worker_ready), ", ".join(worker_ready))
+
+    if any(word in q for word in ("how many", "count", "number")) and "node" in q:
+        return f"Atlas has {total} nodes; {len(ready)} ready, {len(not_ready)} not ready."
+
+    if "node names" in q or ("nodes" in q and "named" in q) or "naming" in q:
+        return "Atlas node names: " + ", ".join(names) + "."
+
+    if "ready" in q and "node" in q:
+        return f"Ready nodes ({len(ready)}): {', '.join(ready)}."
+
     return ""
 
 def _metric_tokens(entry: dict[str, Any]) -> str:

From d372bc10fbda956719aed002bae046013c2e1b46 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 19:34:42 -0300
Subject: [PATCH 253/416] atlasbot: reload structured answers

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 57705ecc..c723d22e 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-20
+        checksum/atlasbot-configmap: manual-atlasbot-21
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From 74c37832121201078f6d44cdf9d2c3c5e63fcf57 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 19:53:11 -0300
Subject: [PATCH 254/416] atlasbot: improve node inventory reasoning

---
 services/comms/atlasbot-deployment.yaml |   2 +-
 services/comms/scripts/atlasbot/bot.py  | 154 +++++++++++++++++++-----
 2 files changed, 122 insertions(+), 34 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index c723d22e..7cc66b32 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-21
+        checksum/atlasbot-configmap: manual-atlasbot-22
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index d06645a5..6993db25 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -89,9 +89,17 @@ METRIC_HINT_WORDS = {
     "latency",
 }
 
-CODE_FENCE_RE = re.compile(r"^```(?:json)?\\s*(.*?)\\s*```$", re.DOTALL)
-TITAN_NODE_RE = re.compile(r"\\btitan-[0-9a-z]{2}\\b", re.IGNORECASE)
-TITAN_RANGE_RE = re.compile(r"\\btitan-([0-9a-z]{2})/([0-9a-z]{2})\\b", re.IGNORECASE)
+CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL)
+TITAN_NODE_RE = re.compile(r"\btitan-[0-9a-z]{2}\b", re.IGNORECASE)
+TITAN_RANGE_RE = re.compile(r"\btitan-([0-9a-z]{2})/([0-9a-z]{2})\b", re.IGNORECASE)
+_DASH_CHARS = "\u2010\u2011\u2012\u2013\u2014\u2015\u2212\uFE63\uFF0D"
+
+def normalize_query(text: str) -> str:
+    cleaned = (text or "").lower()
+    for ch in _DASH_CHARS:
+        cleaned = cleaned.replace(ch, "-")
+    cleaned = re.sub(r"\s+", " ", cleaned).strip()
+    return cleaned
 
 def _tokens(text: str) -> list[str]:
     toks = [t.lower() for t in TOKEN_RE.findall(text or "")]
@@ -267,14 +275,15 @@ def kb_retrieve(query: str, *, limit: int = 3) -> str:
     return "\n".join(parts).strip()
 
 def _extract_titan_nodes(text: str) -> list[str]:
-    names = {n.lower() for n in TITAN_NODE_RE.findall(text or "") if n}
-    for match in re.finditer(r"titan-([0-9a-z]{2}(?:[/,][0-9a-z]{2})+)", text or "", re.IGNORECASE):
+    cleaned = normalize_query(text)
+    names = {n.lower() for n in TITAN_NODE_RE.findall(cleaned) if n}
+    for match in re.finditer(r"titan-([0-9a-z]{2}(?:[/,][0-9a-z]{2})+)", cleaned, re.IGNORECASE):
         tail = match.group(1)
         for part in re.split(r"[/,]", tail):
             part = part.strip()
             if part:
                 names.add(f"titan-{part.lower()}")
-    for match in TITAN_RANGE_RE.finditer(text or ""):
+    for match in TITAN_RANGE_RE.finditer(cleaned):
         left, right = match.groups()
         if left:
             names.add(f"titan-{left.lower()}")
@@ -323,6 +332,7 @@ def node_inventory_live() -> list[dict[str, Any]]:
                 "arch": labels.get("kubernetes.io/arch") or labels.get("beta.kubernetes.io/arch") or "",
                 "hardware": _hardware_class(labels),
                 "roles": _node_roles(labels),
+                "is_worker": _node_is_worker(node),
                 "ready": _node_ready_status(node),
             }
         )
@@ -335,7 +345,7 @@ def _group_nodes(inventory: list[dict[str, Any]]) -> dict[str, list[str]]:
     return {k: sorted(v) for k, v in grouped.items()}
 
 def node_inventory_context(query: str, inventory: list[dict[str, Any]] | None = None) -> str:
-    q = (query or "").lower()
+    q = normalize_query(query)
     if not any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster")):
         return ""
     if inventory is None:
@@ -372,7 +382,7 @@ def node_inventory_context(query: str, inventory: list[dict[str, Any]] | None =
     return "\n".join(lines)
 
 def node_inventory_for_prompt(prompt: str) -> list[dict[str, Any]]:
-    q = (prompt or "").lower()
+    q = normalize_query(prompt)
     if any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster", "worker")):
         return node_inventory_live()
     return []
@@ -382,10 +392,14 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]:
     ready = [node["name"] for node in inventory if node.get("ready") is True]
     not_ready = [node["name"] for node in inventory if node.get("ready") is False]
     groups = _group_nodes(inventory)
-    workers = [node for node in inventory if "worker" in (node.get("roles") or [])]
+    workers = [node for node in inventory if node.get("is_worker") is True]
     worker_names = [node["name"] for node in workers]
     worker_ready = [node["name"] for node in workers if node.get("ready") is True]
     worker_not_ready = [node["name"] for node in workers if node.get("ready") is False]
+    expected_workers = expected_worker_nodes_from_metrics()
+    expected_ready = [n for n in expected_workers if n in ready] if expected_workers else []
+    expected_not_ready = [n for n in expected_workers if n in not_ready] if expected_workers else []
+    expected_missing = [n for n in expected_workers if n not in names] if expected_workers else []
     return {
         "names": sorted(names),
         "ready": sorted(ready),
@@ -394,10 +408,14 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]:
         "worker_names": sorted(worker_names),
         "worker_ready": sorted(worker_ready),
         "worker_not_ready": sorted(worker_not_ready),
+        "expected_workers": expected_workers,
+        "expected_ready": sorted(expected_ready),
+        "expected_not_ready": sorted(expected_not_ready),
+        "expected_missing": sorted(expected_missing),
     }
 
 def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_summary: str) -> str:
-    q = (prompt or "").lower()
+    q = normalize_query(prompt)
     if metrics_summary and any(word in q for word in ("postgres", "connection", "connections", "db")):
         return metrics_summary
 
@@ -412,29 +430,75 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s
     worker_names = sets["worker_names"]
     worker_ready = sets["worker_ready"]
     worker_not_ready = sets["worker_not_ready"]
+    expected_workers = sets["expected_workers"]
+    expected_ready = sets["expected_ready"]
+    expected_not_ready = sets["expected_not_ready"]
+    expected_missing = sets["expected_missing"]
     total = len(names)
+    nodes_in_query = _extract_titan_nodes(q)
+    rpi_nodes = set(groups.get("rpi4", [])) | set(groups.get("rpi5", []))
+    non_rpi = set(groups.get("jetson", [])) | set(groups.get("amd64", []))
+    unknown_hw = set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", []))
 
-    for node in _extract_titan_nodes(q):
-        if node and ("is" in q or "part of" in q or "in atlas" in q or "in cluster" in q):
+    if nodes_in_query and ("raspberry" in q or "rpi" in q):
+        parts: list[str] = []
+        for node in nodes_in_query:
+            if node in rpi_nodes:
+                parts.append(f"{node} is a Raspberry Pi node.")
+            elif node in non_rpi:
+                parts.append(f"{node} is not a Raspberry Pi node.")
+            elif node in names:
+                parts.append(f"{node} is in Atlas but hardware is unknown.")
+            else:
+                parts.append(f"{node} is not in the Atlas cluster.")
+        return " ".join(parts)
+
+    if nodes_in_query and "jetson" in q:
+        jets = set(groups.get("jetson", []))
+        parts = []
+        for node in nodes_in_query:
+            if node in jets:
+                parts.append(f"{node} is a Jetson node.")
+            elif node in names:
+                parts.append(f"{node} is not a Jetson node.")
+            else:
+                parts.append(f"{node} is not in the Atlas cluster.")
+        return " ".join(parts)
+
+    if nodes_in_query and ("is" in q or "part of" in q or "in atlas" in q or "in cluster" in q or "present" in q or "exist" in q):
+        parts: list[str] = []
+        for node in nodes_in_query:
             if node in names:
-                return f"Yes. {node} is in the Atlas cluster."
-            return f"No. {node} is not in the Atlas cluster."
+                parts.append(f"Yes. {node} is in the Atlas cluster.")
+            else:
+                parts.append(f"No. {node} is not in the Atlas cluster.")
+        return " ".join(parts)
 
-    if "non-raspberry" in q or "non raspberry" in q or "not raspberry" in q:
-        non_rpi = sorted(set(groups.get("jetson", [])) | set(groups.get("amd64", [])))
-        if "besides" in q:
-            amd = groups.get("amd64", [])
+    if any(term in q for term in ("non-raspberry", "non raspberry", "not raspberry", "non-rpi", "non rpi")):
+        non_rpi_sorted = sorted(non_rpi)
+        if any(word in q for word in ("how many", "count", "number")):
+            return f"Atlas has {len(non_rpi_sorted)} non‑Raspberry Pi nodes."
+        if any(phrase in q for phrase in ("besides jetson", "excluding jetson", "without jetson", "non jetson")):
+            amd = sorted(groups.get("amd64", []))
             return f"Non‑Raspberry Pi nodes (excluding Jetson): {', '.join(amd)}." if amd else "No non‑Raspberry Pi nodes outside Jetson."
-        return f"Non‑Raspberry Pi nodes: {', '.join(non_rpi)}." if non_rpi else "No non‑Raspberry Pi nodes found."
+        return f"Non‑Raspberry Pi nodes: {', '.join(non_rpi_sorted)}." if non_rpi_sorted else "No non‑Raspberry Pi nodes found."
 
     if "jetson" in q:
         jets = groups.get("jetson", [])
+        if any(word in q for word in ("how many", "count", "number")):
+            return f"Atlas has {len(jets)} Jetson nodes."
         return f"Jetson nodes: {', '.join(jets)}." if jets else "No Jetson nodes found."
 
     if "amd64" in q or "x86" in q:
         amd = groups.get("amd64", [])
+        if any(word in q for word in ("how many", "count", "number")):
+            return f"Atlas has {len(amd)} amd64 nodes."
         return f"amd64 nodes: {', '.join(amd)}." if amd else "No amd64 nodes found."
 
+    if "arm64" in q and "node" in q and any(word in q for word in ("how many", "count", "number")):
+        count = sum(1 for node in inventory if node.get("arch") == "arm64")
+        return f"Atlas has {count} arm64 nodes."
+
     if "rpi4" in q:
         rpi4 = groups.get("rpi4", [])
         if any(word in q for word in ("how many", "count", "number")):
@@ -448,28 +512,52 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s
         return f"rpi5 nodes: {', '.join(rpi5)}." if rpi5 else "No rpi5 nodes found."
 
     if "raspberry" in q or "rpi" in q:
-        rpi = sorted(set(groups.get("rpi4", [])) | set(groups.get("rpi5", [])))
+        rpi = sorted(rpi_nodes)
         if any(word in q for word in ("how many", "count", "number")):
             return f"Atlas has {len(rpi)} Raspberry Pi nodes."
         return f"Raspberry Pi nodes: {', '.join(rpi)}." if rpi else "No Raspberry Pi nodes found."
 
     if "arm64-unknown" in q or "unknown" in q or "no hardware" in q:
-        unknown = sorted(set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", [])))
+        unknown = sorted(unknown_hw)
         return f"Unknown hardware nodes: {', '.join(unknown)}." if unknown else "No unknown hardware labels."
 
-    if "worker" in q and "node" in q:
-        if any(word in q for word in ("missing", "expected", "should")):
-            expected_workers = expected_worker_nodes_from_metrics()
-            missing = sorted(set(expected_workers) - set(worker_ready + worker_not_ready)) if expected_workers else []
-            if "missing" in q and missing:
-                return "Missing worker nodes: " + ", ".join(missing) + "."
-            if expected_workers:
-                msg = f"Grafana inventory expects {len(expected_workers)} workers."
-                if missing:
-                    msg += f" Missing: {', '.join(missing)}."
+    if ("notready" in q or "not ready" in q or "unready" in q) and ("node" in q or "nodes" in q):
+        return "Not ready nodes: " + (", ".join(not_ready) if not_ready else "none") + "."
+
+    if "worker" in q and ("node" in q or "nodes" in q or "workers" in q):
+        not_ready_query = "not ready" in q or "unready" in q or "down" in q or ("not" in q and "ready" in q)
+        if expected_workers:
+            if "missing" in q:
+                return "Missing worker nodes: " + (", ".join(expected_missing) if expected_missing else "none") + "."
+            if "ready" in q and ("not ready" in q or "vs" in q or "versus" in q):
+                return (
+                    f"Expected workers: {len(expected_ready)} ready, "
+                    f"{len(expected_not_ready)} not ready (expected {len(expected_workers)})."
+                )
+            if any(word in q for word in ("how many", "count", "number")) and ("expect" in q or "expected" in q or "should" in q):
+                msg = f"Grafana inventory expects {len(expected_workers)} worker nodes."
+                if expected_missing:
+                    msg += f" Missing: {', '.join(expected_missing)}."
                 return msg
-            return "No expected worker inventory found; using live cluster state."
-        if "not ready" in q or "unready" in q or "down" in q:
+            if not_ready_query:
+                if expected_not_ready or expected_missing:
+                    detail = []
+                    if expected_not_ready:
+                        detail.append(f"Not ready: {', '.join(expected_not_ready)}")
+                    if expected_missing:
+                        detail.append(f"Missing: {', '.join(expected_missing)}")
+                    return "Worker nodes needing attention. " + " ".join(detail) + "."
+                return "All expected worker nodes are Ready."
+            if any(word in q for word in ("expected", "expect", "should")):
+                msg = f"Grafana inventory expects {len(expected_workers)} worker nodes."
+                if expected_missing:
+                    msg += f" Missing: {', '.join(expected_missing)}."
+                return msg
+            if any(word in q for word in ("how many", "count", "number")):
+                return f"Worker nodes: {len(expected_ready)} ready, {len(expected_not_ready)} not ready (expected {len(expected_workers)})."
+            if "ready" in q:
+                return f"Ready worker nodes ({len(expected_ready)}): {', '.join(expected_ready)}."
+        if not_ready_query:
             return "Worker nodes not ready: " + (", ".join(worker_not_ready) if worker_not_ready else "none") + "."
         if any(word in q for word in ("how many", "count", "number")):
             return f"Worker nodes: {len(worker_ready)} ready, {len(worker_not_ready)} not ready."

From f7d4425740fec1d957651e5a51b8a1d47364f0e0 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 20:54:33 -0300
Subject: [PATCH 255/416] ariadne: reduce comms noise, fix gpu labels

---
 scripts/dashboards_render_atlas.py                  | 4 ++--
 services/comms/mas-local-users-ensure-job.yaml      | 2 +-
 services/comms/synapse-seeder-admin-ensure-job.yaml | 2 +-
 services/maintenance/ariadne-deployment.yaml        | 2 +-
 services/monitoring/dashboards/atlas-overview.json  | 2 +-
 services/monitoring/dcgm-exporter.yaml              | 2 ++
 services/monitoring/grafana-dashboard-overview.yaml | 2 +-
 7 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index 11479d9d..5aa77dc1 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -364,9 +364,9 @@ ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}'
 ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}'
 ARIADNE_TEST_SUCCESS_RATE = (
     "100 * "
-    'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[1h])) '
+    'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[30d])) '
     "/ clamp_min("
-    'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[1h])), 1)'
+    'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[30d])), 1)'
 )
 ARIADNE_TEST_FAILURES_24H = (
     'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))'
diff --git a/services/comms/mas-local-users-ensure-job.yaml b/services/comms/mas-local-users-ensure-job.yaml
index 5802009a..c8cf5f04 100644
--- a/services/comms/mas-local-users-ensure-job.yaml
+++ b/services/comms/mas-local-users-ensure-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: mas-local-users-ensure-15
+  name: mas-local-users-ensure-16
   namespace: comms
 spec:
   backoffLimit: 1
diff --git a/services/comms/synapse-seeder-admin-ensure-job.yaml b/services/comms/synapse-seeder-admin-ensure-job.yaml
index 99056586..ce8ccd35 100644
--- a/services/comms/synapse-seeder-admin-ensure-job.yaml
+++ b/services/comms/synapse-seeder-admin-ensure-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: synapse-seeder-admin-ensure-7
+  name: synapse-seeder-admin-ensure-8
   namespace: comms
 spec:
   backoffLimit: 2
diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml
index 0356e060..33b8a12a 100644
--- a/services/maintenance/ariadne-deployment.yaml
+++ b/services/maintenance/ariadne-deployment.yaml
@@ -306,7 +306,7 @@ spec:
             - name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME
               value: "*/5 * * * *"
             - name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE
-              value: "*/30 * * * *"
+              value: "0 0 1 * *"
             - name: ARIADNE_SCHEDULE_COMMS_RESET_ROOM
               value: "0 0 1 1 *"
             - name: ARIADNE_SCHEDULE_COMMS_SEED_ROOM
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 2d7f3e51..486cd611 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -1690,7 +1690,7 @@
       },
       "targets": [
         {
-          "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)",
+          "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)",
           "refId": "A"
         }
       ],
diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml
index 8760c9f0..3e8d1a60 100644
--- a/services/monitoring/dcgm-exporter.yaml
+++ b/services/monitoring/dcgm-exporter.yaml
@@ -50,6 +50,8 @@ spec:
           env:
             - name: DCGM_EXPORTER_KUBERNETES
               value: "true"
+            - name: KUBERNETES_VIRTUAL_GPUS
+              value: "true"
           securityContext:
             privileged: true
           resources:
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 53361345..afc1e1fb 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -1699,7 +1699,7 @@ data:
           },
           "targets": [
             {
-              "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)",
+              "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)",
               "refId": "A"
             }
           ],

From a68594972e50e36b0f3236be9f1f2d1272efa3b7 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Mon, 26 Jan 2026 23:54:53 +0000
Subject: [PATCH 256/416] chore(maintenance): automated image update

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 3933caf6..2678a46b 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,7 +26,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-49 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-50 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From 6a5c9fb0e60c9d196d0e62397a8df4bfa5c7ae43 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 20:57:47 -0300
Subject: [PATCH 257/416] monitoring: map dcgm to shared gpu resources

---
 services/monitoring/dcgm-exporter.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml
index 3e8d1a60..ff5aed5a 100644
--- a/services/monitoring/dcgm-exporter.yaml
+++ b/services/monitoring/dcgm-exporter.yaml
@@ -52,6 +52,8 @@ spec:
               value: "true"
             - name: KUBERNETES_VIRTUAL_GPUS
               value: "true"
+            - name: NVIDIA_RESOURCE_NAMES
+              value: nvidia.com/gpu.shared
           securityContext:
             privileged: true
           resources:

From 332c6bb6ba5e9972ff919e871424542598fd69ef Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 22:13:04 -0300
Subject: [PATCH 258/416] atlasbot: answer hottest node queries via metrics

---
 services/comms/scripts/atlasbot/bot.py | 94 ++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 6993db25..233b25e9 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -18,6 +18,8 @@ OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/")
 MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b-instruct-q4_0")
 API_KEY = os.environ.get("CHAT_API_KEY", "")
 OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480"))
+ATLASBOT_HTTP_PORT = int(os.environ.get("ATLASBOT_HTTP_PORT", "8090"))
+ATLASBOT_INTERNAL_TOKEN = os.environ.get("ATLASBOT_INTERNAL_TOKEN") or os.environ.get("CHAT_API_HOMEPAGE", "")
 
 KB_DIR = os.environ.get("KB_DIR", "")
 VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428")
@@ -93,6 +95,12 @@ CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL)
 TITAN_NODE_RE = re.compile(r"\btitan-[0-9a-z]{2}\b", re.IGNORECASE)
 TITAN_RANGE_RE = re.compile(r"\btitan-([0-9a-z]{2})/([0-9a-z]{2})\b", re.IGNORECASE)
 _DASH_CHARS = "\u2010\u2011\u2012\u2013\u2014\u2015\u2212\uFE63\uFF0D"
+HOTTEST_QUERIES = {
+    "cpu": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
+    "ram": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
+    "net": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
+    "io": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
+}
 
 def normalize_query(text: str) -> str:
     cleaned = (text or "").lower()
@@ -291,6 +299,77 @@ def _extract_titan_nodes(text: str) -> list[str]:
             names.add(f"titan-{right.lower()}")
     return sorted(names)
 
+def _humanize_rate(value: str, *, unit: str) -> str:
+    try:
+        val = float(value)
+    except (TypeError, ValueError):
+        return value
+    if unit == "%":
+        return f"{val:.1f}%"
+    if val >= 1024 * 1024:
+        return f"{val / (1024 * 1024):.2f} MB/s"
+    if val >= 1024:
+        return f"{val / 1024:.2f} KB/s"
+    return f"{val:.2f} B/s"
+
+def _hottest_query(metric: str, node_regex: str | None) -> str:
+    expr = HOTTEST_QUERIES[metric]
+    if node_regex:
+        needle = 'node_uname_info{nodename!=""}'
+        replacement = f'node_uname_info{{nodename!=\"\",nodename=~\"{node_regex}\"}}'
+        return expr.replace(needle, replacement)
+    return expr
+
+def _vm_hottest(metric: str, node_regex: str | None) -> tuple[str, str] | None:
+    expr = _hottest_query(metric, node_regex)
+    res = vm_query(expr)
+    series = _vm_value_series(res)
+    if not series:
+        return None
+    first = series[0]
+    labels = first.get("metric") or {}
+    value = first.get("value") or []
+    val = value[1] if isinstance(value, list) and len(value) > 1 else ""
+    node = labels.get("node") or labels.get("__name__") or ""
+    if not node:
+        return None
+    return (str(node), str(val))
+
+def _hottest_answer(q: str, *, nodes: list[str] | None) -> str:
+    metric = None
+    assumed_cpu = False
+    if "cpu" in q:
+        metric = "cpu"
+    elif "ram" in q or "memory" in q:
+        metric = "ram"
+    elif "net" in q or "network" in q:
+        metric = "net"
+    elif "io" in q or "disk" in q or "storage" in q:
+        metric = "io"
+    if metric is None:
+        metric = "cpu"
+        assumed_cpu = True
+    if nodes is not None and not nodes:
+        return "No nodes match the requested hardware class."
+
+    node_regex = "|".join(nodes) if nodes else None
+    metrics = [metric]
+    lines: list[str] = []
+    for m in metrics:
+        picked = _vm_hottest(m, node_regex)
+        if not picked:
+            continue
+        node, val = picked
+        unit = "%" if m in ("cpu", "ram") else "B/s"
+        val_str = _humanize_rate(val, unit=unit)
+        label = {"cpu": "CPU", "ram": "RAM", "net": "NET", "io": "I/O"}[m]
+        lines.append(f"{label}: {node} ({val_str})")
+    if not lines:
+        return ""
+    label = metric.upper()
+    suffix = " (defaulting to CPU)" if assumed_cpu else ""
+    return f"Hottest node by {label}: {lines[0].split(': ', 1)[1]}.{suffix}"
+
 def _node_roles(labels: dict[str, Any]) -> list[str]:
     roles: list[str] = []
     for key in labels.keys():
@@ -440,6 +519,21 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s
     non_rpi = set(groups.get("jetson", [])) | set(groups.get("amd64", []))
     unknown_hw = set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", []))
 
+    if "hottest" in q or "hot" in q:
+        filter_nodes: list[str] | None = None
+        if "amd64" in q or "x86" in q:
+            filter_nodes = sorted(groups.get("amd64", []))
+        elif "jetson" in q:
+            filter_nodes = sorted(groups.get("jetson", []))
+        elif "raspberry" in q or "rpi" in q:
+            filter_nodes = sorted(rpi_nodes)
+        elif "arm64" in q:
+            filter_nodes = sorted([n for n in names if n not in groups.get("amd64", [])])
+        hottest = _hottest_answer(q, nodes=filter_nodes)
+        if hottest:
+            return hottest
+        return "Unable to determine hottest nodes right now (metrics unavailable)."
+
     if nodes_in_query and ("raspberry" in q or "rpi" in q):
         parts: list[str] = []
         for node in nodes_in_query:

From b5e5507ff04f47d58d80a775abfb2c79cd6b9cb6 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 22:13:53 -0300
Subject: [PATCH 259/416] comms: restart atlasbot for hottest node fix

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 7cc66b32..d5ad62eb 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-22
+        checksum/atlasbot-configmap: manual-atlasbot-23
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From 6062e266aa496d9f27451d0d702b8861744f5fcd Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 22:26:13 -0300
Subject: [PATCH 260/416] vault: allow ariadne to use vault-admin role

---
 services/vault/scripts/vault_k8s_auth_configure.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh
index a956e0e5..21132c79 100644
--- a/services/vault/scripts/vault_k8s_auth_configure.sh
+++ b/services/vault/scripts/vault_k8s_auth_configure.sh
@@ -193,8 +193,8 @@ path "kv/data/atlas/shared/*" {
 write_raw_policy "dev-kv" "${dev_kv_policy}"
 log "writing role vault-admin"
 vault_cmd write "auth/kubernetes/role/vault-admin" \
-  bound_service_account_names="vault-admin" \
-  bound_service_account_namespaces="vault" \
+  bound_service_account_names="vault-admin,ariadne" \
+  bound_service_account_namespaces="vault,maintenance" \
   policies="vault-admin" \
   ttl="${role_ttl}"
 

From 995050f544383412a5cb6544c7039d0119c2fded Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 22:26:24 -0300
Subject: [PATCH 261/416] monitoring: unify jetson gpu metrics

---
 scripts/dashboards_render_atlas.py            | 33 ++++++++++++++++++-
 services/monitoring/dashboards/atlas-gpu.json |  4 +--
 .../monitoring/dashboards/atlas-overview.json |  2 +-
 .../monitoring/grafana-dashboard-gpu.yaml     |  4 +--
 .../grafana-dashboard-overview.yaml           |  2 +-
 .../jetson-tegrastats-exporter.yaml           |  4 +++
 .../scripts/jetson_tegrastats_exporter.py     |  4 ++-
 7 files changed, 45 insertions(+), 8 deletions(-)

diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index 5aa77dc1..675fec52 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -208,7 +208,38 @@ def namespace_ram_raw(scope_var):
 
 
 def namespace_gpu_usage_instant(scope_var):
-    return f"sum(DCGM_FI_DEV_GPU_UTIL{{{namespace_gpu_selector(scope_var)}}}) by (namespace)"
+    dcgm = f"sum(DCGM_FI_DEV_GPU_UTIL{{{namespace_gpu_selector(scope_var)}}}) by (namespace)"
+    jetson = jetson_gpu_usage_by_namespace(scope_var)
+    merged = (
+        f'label_replace({dcgm}, "source", "dcgm", "", "") '
+        f'or label_replace({jetson}, "source", "jetson", "", "")'
+    )
+    return f"sum by (namespace) ({merged})"
+
+
+def jetson_gpu_util_by_node():
+    return 'max by (node) (jetson_gr3d_freq_percent{node!=""})'
+
+
+def jetson_gpu_requests(scope_var):
+    return (
+        "sum by (namespace,node) ("
+        f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} '
+        "* on(namespace,pod) group_left(node) kube_pod_info "
+        '* on(node) group_left(label_jetson) kube_node_labels{label_jetson="true"}'
+        ")"
+    )
+
+
+def jetson_gpu_usage_by_namespace(scope_var):
+    requests_by_ns = jetson_gpu_requests(scope_var)
+    total_by_node = f"sum by (node) ({requests_by_ns})"
+    return (
+        "sum by (namespace) ("
+        f"({requests_by_ns}) / clamp_min({total_by_node}, 1) "
+        f"* on(node) group_left() {jetson_gpu_util_by_node()}"
+        ")"
+    )
 
 
 def namespace_share_expr(resource_expr):
diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json
index af8a1c5b..6b76a5c2 100644
--- a/services/monitoring/dashboards/atlas-gpu.json
+++ b/services/monitoring/dashboards/atlas-gpu.json
@@ -20,7 +20,7 @@
       },
       "targets": [
         {
-          "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))",
+          "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == 0))",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -89,7 +89,7 @@
       },
       "targets": [
         {
-          "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)",
+          "expr": "sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 486cd611..04352f93 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -1901,7 +1901,7 @@
       },
       "targets": [
         {
-          "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))",
+          "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == 0))",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml
index d7950f2b..46b25cd0 100644
--- a/services/monitoring/grafana-dashboard-gpu.yaml
+++ b/services/monitoring/grafana-dashboard-gpu.yaml
@@ -29,7 +29,7 @@ data:
           },
           "targets": [
             {
-              "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))",
+              "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum 
by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == 0))",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -98,7 +98,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)",
+              "expr": "sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index afc1e1fb..9495647f 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -1910,7 +1910,7 @@ data:
           },
           "targets": [
             {
-              "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))",
+              "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum 
by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == 0))",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml
index 8788b206..a8354014 100644
--- a/services/monitoring/jetson-tegrastats-exporter.yaml
+++ b/services/monitoring/jetson-tegrastats-exporter.yaml
@@ -44,6 +44,10 @@ spec:
           env:
             - name: JETSON_EXPORTER_PORT
               value: "9100"
+            - name: NODE_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
           volumeMounts:
             - name: script
               mountPath: /etc/tegrastats-exporter
diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py
index cd557e7c..c4d3fa2a 100644
--- a/services/monitoring/scripts/jetson_tegrastats_exporter.py
+++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py
@@ -7,6 +7,7 @@ import threading
 from time import time
 
 PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100"))
+NODE_NAME = os.environ.get("NODE_NAME") or os.uname().nodename
 METRICS = {
     "gr3d_freq_percent": 0.0,
     "gpu_temp_c": 0.0,
@@ -60,9 +61,10 @@ class Handler(http.server.BaseHTTPRequestHandler):
         with LOCK:
             metrics = METRICS.copy()
         out = []
+        label = f'{{node="{NODE_NAME}"}}'
         for k, v in metrics.items():
             out.append(f"# TYPE jetson_{k} gauge")
-            out.append(f"jetson_{k} {v}")
+            out.append(f"jetson_{k}{label} {v}")
         body = "\\n".join(out) + "\\n"
         self.send_response(200)
         self.send_header("Content-Type", "text/plain; version=0.0.4")

From 094b2aede691c792b53dd29294c5ac8b53d6d835 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Tue, 27 Jan 2026 01:27:02 +0000
Subject: [PATCH 262/416] chore(maintenance): automated image update

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 2678a46b..7528f6f3 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,7 +26,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-50 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-51 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From 25046d0c86faf2833027f1e3ac58bf4bd2e6b34b Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 22:38:18 -0300
Subject: [PATCH 263/416] atlasbot: replace targeted handlers with generic
 planner

---
 services/comms/scripts/atlasbot/bot.py | 573 ++++++++++---------------
 1 file changed, 235 insertions(+), 338 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 233b25e9..987df7a1 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -95,11 +95,29 @@ CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL)
 TITAN_NODE_RE = re.compile(r"\btitan-[0-9a-z]{2}\b", re.IGNORECASE)
 TITAN_RANGE_RE = re.compile(r"\btitan-([0-9a-z]{2})/([0-9a-z]{2})\b", re.IGNORECASE)
 _DASH_CHARS = "\u2010\u2011\u2012\u2013\u2014\u2015\u2212\uFE63\uFF0D"
-HOTTEST_QUERIES = {
-    "cpu": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
-    "ram": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
-    "net": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
-    "io": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
+
+OPERATION_HINTS = {
+    "count": ("how many", "count", "number", "total"),
+    "list": ("list", "which", "what are", "show", "names"),
+    "top": ("top", "hottest", "highest", "most", "largest", "max", "maximum"),
+    "status": ("ready", "not ready", "unready", "down", "missing", "status"),
+}
+
+METRIC_HINTS = {
+    "cpu": ("cpu",),
+    "ram": ("ram", "memory", "mem"),
+    "net": ("net", "network", "bandwidth", "throughput"),
+    "io": ("io", "disk", "storage"),
+    "connections": ("connections", "conn", "postgres", "database", "db"),
+}
+
+HARDWARE_HINTS = {
+    "amd64": ("amd64", "x86", "x86_64", "x86-64"),
+    "jetson": ("jetson",),
+    "rpi4": ("rpi4",),
+    "rpi5": ("rpi5",),
+    "rpi": ("rpi", "raspberry"),
+    "arm64": ("arm64", "aarch64"),
 }
 
 def normalize_query(text: str) -> str:
@@ -312,63 +330,127 @@ def _humanize_rate(value: str, *, unit: str) -> str:
         return f"{val / 1024:.2f} KB/s"
     return f"{val:.2f} B/s"
 
-def _hottest_query(metric: str, node_regex: str | None) -> str:
-    expr = HOTTEST_QUERIES[metric]
-    if node_regex:
-        needle = 'node_uname_info{nodename!=""}'
-        replacement = f'node_uname_info{{nodename!=\"\",nodename=~\"{node_regex}\"}}'
-        return expr.replace(needle, replacement)
-    return expr
+def _has_any(text: str, phrases: tuple[str, ...]) -> bool:
+    return any(p in text for p in phrases)
 
-def _vm_hottest(metric: str, node_regex: str | None) -> tuple[str, str] | None:
-    expr = _hottest_query(metric, node_regex)
-    res = vm_query(expr)
-    series = _vm_value_series(res)
-    if not series:
-        return None
-    first = series[0]
-    labels = first.get("metric") or {}
-    value = first.get("value") or []
-    val = value[1] if isinstance(value, list) and len(value) > 1 else ""
-    node = labels.get("node") or labels.get("__name__") or ""
-    if not node:
-        return None
-    return (str(node), str(val))
+def _detect_operation(q: str) -> str | None:
+    for op, phrases in OPERATION_HINTS.items():
+        if _has_any(q, phrases):
+            return op
+    return None
 
-def _hottest_answer(q: str, *, nodes: list[str] | None) -> str:
-    metric = None
-    assumed_cpu = False
-    if "cpu" in q:
-        metric = "cpu"
-    elif "ram" in q or "memory" in q:
-        metric = "ram"
-    elif "net" in q or "network" in q:
-        metric = "net"
-    elif "io" in q or "disk" in q or "storage" in q:
-        metric = "io"
-    if metric is None:
-        metric = "cpu"
-        assumed_cpu = True
-    if nodes is not None and not nodes:
-        return "No nodes match the requested hardware class."
+def _detect_metric(q: str) -> str | None:
+    for metric, phrases in METRIC_HINTS.items():
+        if _has_any(q, phrases):
+            return metric
+    return None
 
-    node_regex = "|".join(nodes) if nodes else None
-    metrics = [metric]
-    lines: list[str] = []
-    for m in metrics:
-        picked = _vm_hottest(m, node_regex)
-        if not picked:
+def _detect_hardware_filters(q: str) -> tuple[set[str], set[str]]:
+    include: set[str] = set()
+    exclude: set[str] = set()
+    for hardware, phrases in HARDWARE_HINTS.items():
+        for phrase in phrases:
+            if f"non {phrase}" in q or f"non-{phrase}" in q or f"not {phrase}" in q:
+                exclude.add(hardware)
+            elif phrase in q:
+                include.add(hardware)
+    return include, exclude
+
+def _detect_entity(q: str) -> str | None:
+    if "node" in q or "nodes" in q or "worker" in q or TITAN_NODE_RE.search(q):
+        return "node"
+    if "pod" in q or "pods" in q:
+        return "pod"
+    if "namespace" in q or "namespaces" in q:
+        return "namespace"
+    return None
+
+def _metric_entry_score(entry: dict[str, Any], tokens: list[str], *, metric: str | None, op: str | None) -> int:
+    hay = _metric_tokens(entry)
+    score = 0
+    for t in set(tokens):
+        if t in hay:
+            score += 2 if t in (entry.get("panel_title") or "").lower() else 1
+    if metric:
+        for phrase in METRIC_HINTS.get(metric, (metric,)):
+            if phrase in hay:
+                score += 3
+    if op == "top" and ("hottest" in hay or "top" in hay):
+        score += 3
+    if "node" in hay:
+        score += 1
+    return score
+
+def _select_metric_entry(tokens: list[str], *, metric: str | None, op: str | None) -> dict[str, Any] | None:
+    scored: list[tuple[int, dict[str, Any]]] = []
+    for entry in _METRIC_INDEX:
+        if not isinstance(entry, dict):
             continue
-        node, val = picked
-        unit = "%" if m in ("cpu", "ram") else "B/s"
-        val_str = _humanize_rate(val, unit=unit)
-        label = {"cpu": "CPU", "ram": "RAM", "net": "NET", "io": "I/O"}[m]
-        lines.append(f"{label}: {node} ({val_str})")
-    if not lines:
+        score = _metric_entry_score(entry, tokens, metric=metric, op=op)
+        if score:
+            scored.append((score, entry))
+    if not scored:
+        return None
+    scored.sort(key=lambda item: item[0], reverse=True)
+    return scored[0][1]
+
+def _apply_node_filter(expr: str, node_regex: str | None) -> str:
+    if not node_regex:
+        return expr
+    needle = 'node_uname_info{nodename!=""}'
+    replacement = f'node_uname_info{{nodename!=\"\",nodename=~\"{node_regex}\"}}'
+    return expr.replace(needle, replacement)
+
+def _format_metric_answer(entry: dict[str, Any], res: dict | None) -> str:
+    series = _vm_value_series(res)
+    panel = entry.get("panel_title") or "Metric"
+    if not series:
         return ""
-    label = metric.upper()
-    suffix = " (defaulting to CPU)" if assumed_cpu else ""
-    return f"Hottest node by {label}: {lines[0].split(': ', 1)[1]}.{suffix}"
+    rendered = vm_render_result(res, limit=5)
+    if not rendered:
+        return ""
+    lines = [line.lstrip("-").strip() for line in rendered.splitlines() if line.strip().startswith("-")]
+    if len(lines) == 1:
+        return f"{panel}: {lines[0]}."
+    return f"{panel}:\n" + "\n".join(f"- {line}" for line in lines)
+
+def _inventory_filter(
+    inventory: list[dict[str, Any]],
+    *,
+    include_hw: set[str],
+    exclude_hw: set[str],
+    only_workers: bool,
+    only_ready: bool | None,
+    nodes_in_query: list[str],
+) -> list[dict[str, Any]]:
+    results = inventory
+    if nodes_in_query:
+        results = [node for node in results if node.get("name") in nodes_in_query]
+    if only_workers:
+        results = [node for node in results if node.get("is_worker") is True]
+    if only_ready is True:
+        results = [node for node in results if node.get("ready") is True]
+    if only_ready is False:
+        results = [node for node in results if node.get("ready") is False]
+    if include_hw:
+        results = [node for node in results if _hardware_match(node, include_hw)]
+    if exclude_hw:
+        results = [node for node in results if not _hardware_match(node, exclude_hw)]
+    return results
+
+def _hardware_match(node: dict[str, Any], filters: set[str]) -> bool:
+    hw = node.get("hardware") or ""
+    arch = node.get("arch") or ""
+    for f in filters:
+        if f == "rpi" and hw in ("rpi4", "rpi5"):
+            return True
+        if f == "arm64" and arch == "arm64":
+            return True
+        if hw == f:
+            return True
+        if f == "amd64" and arch == "amd64":
+            return True
+    return False
 
 def _node_roles(labels: dict[str, Any]) -> list[str]:
     roles: list[str] = []
@@ -495,176 +577,103 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]:
 
 def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_summary: str) -> str:
     q = normalize_query(prompt)
-    if metrics_summary and any(word in q for word in ("postgres", "connection", "connections", "db")):
-        return metrics_summary
-
-    if not inventory:
+    if not q:
         return ""
 
-    sets = _inventory_sets(inventory)
-    names = sets["names"]
-    ready = sets["ready"]
-    not_ready = sets["not_ready"]
-    groups = sets["groups"]
-    worker_names = sets["worker_names"]
-    worker_ready = sets["worker_ready"]
-    worker_not_ready = sets["worker_not_ready"]
-    expected_workers = sets["expected_workers"]
-    expected_ready = sets["expected_ready"]
-    expected_not_ready = sets["expected_not_ready"]
-    expected_missing = sets["expected_missing"]
-    total = len(names)
+    tokens = _tokens(q)
+    op = _detect_operation(q)
+    metric = _detect_metric(q)
+    entity = _detect_entity(q)
+    include_hw, exclude_hw = _detect_hardware_filters(q)
     nodes_in_query = _extract_titan_nodes(q)
-    rpi_nodes = set(groups.get("rpi4", [])) | set(groups.get("rpi5", []))
-    non_rpi = set(groups.get("jetson", [])) | set(groups.get("amd64", []))
-    unknown_hw = set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", []))
+    only_workers = "worker" in q or "workers" in q
+    only_ready: bool | None = None
+    if "not ready" in q or "unready" in q or "down" in q or "missing" in q:
+        only_ready = False
+    elif "ready" in q:
+        only_ready = True
 
-    if "hottest" in q or "hot" in q:
-        filter_nodes: list[str] | None = None
-        if "amd64" in q or "x86" in q:
-            filter_nodes = sorted(groups.get("amd64", []))
-        elif "jetson" in q:
-            filter_nodes = sorted(groups.get("jetson", []))
-        elif "raspberry" in q or "rpi" in q:
-            filter_nodes = sorted(rpi_nodes)
-        elif "arm64" in q:
-            filter_nodes = sorted([n for n in names if n not in groups.get("amd64", [])])
-        hottest = _hottest_answer(q, nodes=filter_nodes)
-        if hottest:
-            return hottest
-        return "Unable to determine hottest nodes right now (metrics unavailable)."
+    if entity == "node" and only_ready is not None and op != "count":
+        op = "status"
 
-    if nodes_in_query and ("raspberry" in q or "rpi" in q):
-        parts: list[str] = []
-        for node in nodes_in_query:
-            if node in rpi_nodes:
-                parts.append(f"{node} is a Raspberry Pi node.")
-            elif node in non_rpi:
-                parts.append(f"{node} is not a Raspberry Pi node.")
-            elif node in names:
-                parts.append(f"{node} is in Atlas but hardware is unknown.")
-            else:
-                parts.append(f"{node} is not in the Atlas cluster.")
-        return " ".join(parts)
+    if not op and entity == "node":
+        op = "list" if (include_hw or exclude_hw or nodes_in_query) else "count"
 
-    if nodes_in_query and "jetson" in q:
-        jets = set(groups.get("jetson", []))
-        parts = []
-        for node in nodes_in_query:
-            if node in jets:
-                parts.append(f"{node} is a Jetson node.")
-            elif node in names:
-                parts.append(f"{node} is not a Jetson node.")
-            else:
-                parts.append(f"{node} is not in the Atlas cluster.")
-        return " ".join(parts)
+    if op == "top" and metric is None:
+        metric = "cpu"
 
-    if nodes_in_query and ("is" in q or "part of" in q or "in atlas" in q or "in cluster" in q or "present" in q or "exist" in q):
-        parts: list[str] = []
-        for node in nodes_in_query:
-            if node in names:
-                parts.append(f"Yes. {node} is in the Atlas cluster.")
-            else:
-                parts.append(f"No. {node} is not in the Atlas cluster.")
-        return " ".join(parts)
-
-    if any(term in q for term in ("non-raspberry", "non raspberry", "not raspberry", "non-rpi", "non rpi")):
-        non_rpi_sorted = sorted(non_rpi)
-        if any(word in q for word in ("how many", "count", "number")):
-            return f"Atlas has {len(non_rpi_sorted)} non‑Raspberry Pi nodes."
-        if any(phrase in q for phrase in ("besides jetson", "excluding jetson", "without jetson", "non jetson")):
-            amd = sorted(groups.get("amd64", []))
-            return f"Non‑Raspberry Pi nodes (excluding Jetson): {', '.join(amd)}." if amd else "No non‑Raspberry Pi nodes outside Jetson."
-        return f"Non‑Raspberry Pi nodes: {', '.join(non_rpi_sorted)}." if non_rpi_sorted else "No non‑Raspberry Pi nodes found."
-
-    if "jetson" in q:
-        jets = groups.get("jetson", [])
-        if any(word in q for word in ("how many", "count", "number")):
-            return f"Atlas has {len(jets)} Jetson nodes."
-        return f"Jetson nodes: {', '.join(jets)}." if jets else "No Jetson nodes found."
-
-    if "amd64" in q or "x86" in q:
-        amd = groups.get("amd64", [])
-        if any(word in q for word in ("how many", "count", "number")):
-            return f"Atlas has {len(amd)} amd64 nodes."
-        return f"amd64 nodes: {', '.join(amd)}." if amd else "No amd64 nodes found."
-
-    if "arm64" in q and "node" in q and any(word in q for word in ("how many", "count", "number")):
-        count = sum(1 for node in inventory if node.get("arch") == "arm64")
-        return f"Atlas has {count} arm64 nodes."
-
-    if "rpi4" in q:
-        rpi4 = groups.get("rpi4", [])
-        if any(word in q for word in ("how many", "count", "number")):
-            return f"Atlas has {len(rpi4)} rpi4 nodes."
-        return f"rpi4 nodes: {', '.join(rpi4)}." if rpi4 else "No rpi4 nodes found."
-
-    if "rpi5" in q:
-        rpi5 = groups.get("rpi5", [])
-        if any(word in q for word in ("how many", "count", "number")):
-            return f"Atlas has {len(rpi5)} rpi5 nodes."
-        return f"rpi5 nodes: {', '.join(rpi5)}." if rpi5 else "No rpi5 nodes found."
-
-    if "raspberry" in q or "rpi" in q:
-        rpi = sorted(rpi_nodes)
-        if any(word in q for word in ("how many", "count", "number")):
-            return f"Atlas has {len(rpi)} Raspberry Pi nodes."
-        return f"Raspberry Pi nodes: {', '.join(rpi)}." if rpi else "No Raspberry Pi nodes found."
-
-    if "arm64-unknown" in q or "unknown" in q or "no hardware" in q:
-        unknown = sorted(unknown_hw)
-        return f"Unknown hardware nodes: {', '.join(unknown)}." if unknown else "No unknown hardware labels."
-
-    if ("notready" in q or "not ready" in q or "unready" in q) and ("node" in q or "nodes" in q):
-        return "Not ready nodes: " + (", ".join(not_ready) if not_ready else "none") + "."
-
-    if "worker" in q and ("node" in q or "nodes" in q or "workers" in q):
-        not_ready_query = "not ready" in q or "unready" in q or "down" in q or ("not" in q and "ready" in q)
-        if expected_workers:
-            if "missing" in q:
-                return "Missing worker nodes: " + (", ".join(expected_missing) if expected_missing else "none") + "."
-            if "ready" in q and ("not ready" in q or "vs" in q or "versus" in q):
-                return (
-                    f"Expected workers: {len(expected_ready)} ready, "
-                    f"{len(expected_not_ready)} not ready (expected {len(expected_workers)})."
+    # Metrics-first when a metric or top operation is requested.
+    if metric or op == "top":
+        entry = _select_metric_entry(tokens, metric=metric, op=op)
+        if entry and isinstance(entry.get("exprs"), list) and entry["exprs"]:
+            expr = entry["exprs"][0]
+            if inventory:
+                scoped = _inventory_filter(
+                    inventory,
+                    include_hw=include_hw,
+                    exclude_hw=exclude_hw,
+                    only_workers=only_workers,
+                    only_ready=None,
+                    nodes_in_query=nodes_in_query,
                 )
-            if any(word in q for word in ("how many", "count", "number")) and ("expect" in q or "expected" in q or "should" in q):
-                msg = f"Grafana inventory expects {len(expected_workers)} worker nodes."
-                if expected_missing:
-                    msg += f" Missing: {', '.join(expected_missing)}."
-                return msg
-            if not_ready_query:
-                if expected_not_ready or expected_missing:
-                    detail = []
-                    if expected_not_ready:
-                        detail.append(f"Not ready: {', '.join(expected_not_ready)}")
-                    if expected_missing:
-                        detail.append(f"Missing: {', '.join(expected_missing)}")
-                    return "Worker nodes needing attention. " + " ".join(detail) + "."
-                return "All expected worker nodes are Ready."
-            if any(word in q for word in ("expected", "expect", "should")):
-                msg = f"Grafana inventory expects {len(expected_workers)} worker nodes."
-                if expected_missing:
-                    msg += f" Missing: {', '.join(expected_missing)}."
-                return msg
-            if any(word in q for word in ("how many", "count", "number")):
-                return f"Worker nodes: {len(expected_ready)} ready, {len(expected_not_ready)} not ready (expected {len(expected_workers)})."
-            if "ready" in q:
-                return f"Ready worker nodes ({len(expected_ready)}): {', '.join(expected_ready)}."
-        if not_ready_query:
-            return "Worker nodes not ready: " + (", ".join(worker_not_ready) if worker_not_ready else "none") + "."
-        if any(word in q for word in ("how many", "count", "number")):
-            return f"Worker nodes: {len(worker_ready)} ready, {len(worker_not_ready)} not ready."
-        return "Ready worker nodes ({}): {}.".format(len(worker_ready), ", ".join(worker_ready))
+                if scoped:
+                    node_regex = "|".join([n["name"] for n in scoped])
+                    expr = _apply_node_filter(expr, node_regex)
+            res = vm_query(expr, timeout=20)
+            answer = _format_metric_answer(entry, res)
+            if answer:
+                return answer
+        if metrics_summary:
+            return metrics_summary
 
-    if any(word in q for word in ("how many", "count", "number")) and "node" in q:
-        return f"Atlas has {total} nodes; {len(ready)} ready, {len(not_ready)} not ready."
+    if entity != "node" or not inventory:
+        if any(word in q for word in METRIC_HINT_WORDS) and not metrics_summary:
+            return "I don't have data to answer that right now."
+        return ""
 
-    if "node names" in q or ("nodes" in q and "named" in q) or "naming" in q:
-        return "Atlas node names: " + ", ".join(names) + "."
+    expected_workers = expected_worker_nodes_from_metrics()
+    filtered = _inventory_filter(
+        inventory,
+        include_hw=include_hw,
+        exclude_hw=exclude_hw,
+        only_workers=only_workers,
+        only_ready=only_ready if op in ("status", "count") else None,
+        nodes_in_query=nodes_in_query,
+    )
+    names = [node["name"] for node in filtered]
 
-    if "ready" in q and "node" in q:
-        return f"Ready nodes ({len(ready)}): {', '.join(ready)}."
+    if op == "status":
+        if "missing" in q and expected_workers:
+            missing = sorted(set(expected_workers) - {n["name"] for n in inventory})
+            return "Missing nodes: " + (", ".join(missing) if missing else "none") + "."
+        if only_ready is False:
+            return "Not ready nodes: " + (", ".join(names) if names else "none") + "."
+        if only_ready is True:
+            return f"Ready nodes ({len(names)}): " + (", ".join(names) if names else "none") + "."
+
+    if op == "count":
+        if expected_workers and ("expected" in q or "should" in q):
+            missing = sorted(set(expected_workers) - {n["name"] for n in inventory})
+            msg = f"Grafana inventory expects {len(expected_workers)} worker nodes."
+            if missing:
+                msg += f" Missing: {', '.join(missing)}."
+            return msg
+        if not (include_hw or exclude_hw or nodes_in_query or only_workers):
+            return f"Atlas has {len(names)} nodes."
+        return f"Matching nodes: {len(names)}."
+
+    if op == "list":
+        if nodes_in_query:
+            parts = []
+            existing = {n["name"] for n in inventory}
+            for node in nodes_in_query:
+                parts.append(f"{node}: {'present' if node in existing else 'not present'}")
+            return "Node presence: " + ", ".join(parts) + "."
+        if not names:
+            return "Matching nodes: none."
+        shown = names[:30]
+        suffix = f", … (+{len(names) - 30} more)" if len(names) > 30 else ""
+        return "Matching nodes: " + ", ".join(shown) + suffix + "."
 
     return ""
 
@@ -727,25 +736,6 @@ def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]:
     fallback = _metrics_fallback_summary(panel, summary)
     return context, fallback
 
-def jetson_nodes_from_kb() -> list[str]:
-    for doc in KB.get("runbooks", []):
-        if not isinstance(doc, dict):
-            continue
-        body = str(doc.get("body") or "")
-        for line in body.splitlines():
-            if "jetson" not in line.lower():
-                continue
-            names = _extract_titan_nodes(line)
-            if names:
-                return names
-    return []
-
-def jetson_nodes_summary(cluster_name: str) -> str:
-    names = jetson_nodes_from_kb()
-    if names:
-        return f"{cluster_name} has {len(names)} Jetson nodes: {', '.join(names)}."
-    return ""
-
 def catalog_hints(query: str) -> tuple[str, list[tuple[str, str]]]:
     q = (query or "").strip()
     if not q or not KB.get("catalog"):
@@ -953,22 +943,16 @@ def _parse_metric_lines(summary: str) -> dict[str, str]:
 def _metrics_fallback_summary(panel: str, summary: str) -> str:
     parsed = _parse_metric_lines(summary)
     panel_l = (panel or "").lower()
-    if panel_l.startswith("postgres connections"):
-        used = parsed.get("conn=used")
-        maxv = parsed.get("conn=max")
-        if used and maxv:
-            try:
-                used_i = int(float(used))
-                max_i = int(float(maxv))
-            except ValueError:
-                return f"Postgres connections: {summary}"
-            free = max_i - used_i
-            return f"Postgres connections: {used_i}/{max_i} used ({free} free)."
-    if panel_l.startswith("postgres hottest"):
-        if parsed:
-            label, value = next(iter(parsed.items()))
-            return f"Most Postgres connections: {label} = {value}."
-    return f"{panel}: {summary}"
+    if parsed:
+        items = list(parsed.items())
+        if len(items) == 1:
+            label, value = items[0]
+            return f"{panel}: {label} = {value}."
+        compact = "; ".join(f"{k}={v}" for k, v in items)
+        return f"{panel}: {compact}."
+    if panel_l:
+        return f"{panel}: {summary}"
+    return summary
 
 def _node_ready_status(node: dict) -> bool | None:
     conditions = node.get("status", {}).get("conditions") or []
@@ -1075,93 +1059,6 @@ def vm_cluster_snapshot() -> str:
         parts.append(pr)
     return "\n".join(parts).strip()
 
-def nodes_summary(cluster_name: str) -> str:
-    state = _ariadne_state()
-    if state:
-        nodes = state.get("nodes") if isinstance(state.get("nodes"), dict) else {}
-        total = nodes.get("total")
-        ready = nodes.get("ready")
-        not_ready = nodes.get("not_ready")
-        if isinstance(total, int) and isinstance(ready, int):
-            not_ready = not_ready if isinstance(not_ready, int) else max(total - ready, 0)
-            if not_ready:
-                return f"{cluster_name} cluster has {total} nodes: {ready} Ready, {not_ready} NotReady."
-            return f"{cluster_name} cluster has {total} nodes, all Ready."
-    try:
-        data = k8s_get("/api/v1/nodes?limit=500")
-    except Exception:
-        return ""
-    items = data.get("items") or []
-    if not isinstance(items, list) or not items:
-        return ""
-    total = len(items)
-    ready = 0
-    for node in items:
-        conditions = node.get("status", {}).get("conditions") or []
-        for cond in conditions if isinstance(conditions, list) else []:
-            if cond.get("type") == "Ready":
-                if cond.get("status") == "True":
-                    ready += 1
-                break
-    not_ready = max(total - ready, 0)
-    if not_ready:
-        return f"{cluster_name} cluster has {total} nodes: {ready} Ready, {not_ready} NotReady."
-    return f"{cluster_name} cluster has {total} nodes, all Ready."
-
-def nodes_names_summary(cluster_name: str) -> str:
-    state = _ariadne_state()
-    if state:
-        nodes = state.get("nodes") if isinstance(state.get("nodes"), dict) else {}
-        names = nodes.get("names")
-        if isinstance(names, list) and names:
-            cleaned = sorted({str(n) for n in names if n})
-            if len(cleaned) <= 30:
-                return f"{cluster_name} node names: {', '.join(cleaned)}."
-            shown = ", ".join(cleaned[:30])
-            return f"{cluster_name} node names: {shown}, … (+{len(cleaned) - 30} more)."
-    try:
-        data = k8s_get("/api/v1/nodes?limit=500")
-    except Exception:
-        return ""
-    items = data.get("items") or []
-    if not isinstance(items, list) or not items:
-        return ""
-    names = []
-    for node in items:
-        name = (node.get("metadata") or {}).get("name") or ""
-        if name:
-            names.append(name)
-    names = sorted(set(names))
-    if not names:
-        return ""
-    if len(names) <= 30:
-        return f"{cluster_name} node names: {', '.join(names)}."
-    shown = ", ".join(names[:30])
-    return f"{cluster_name} node names: {shown}, … (+{len(names) - 30} more)."
-
-
-def nodes_arch_summary(cluster_name: str, arch: str) -> str:
-    try:
-        data = k8s_get("/api/v1/nodes?limit=500")
-    except Exception:
-        return ""
-    items = data.get("items") or []
-    if not isinstance(items, list) or not items:
-        return ""
-    normalized = (arch or "").strip().lower()
-    if normalized in ("aarch64", "arm64"):
-        arch_label = "arm64"
-    elif normalized in ("x86_64", "x86-64", "amd64"):
-        arch_label = "amd64"
-    else:
-        arch_label = normalized
-    total = 0
-    for node in items:
-        labels = (node.get("metadata") or {}).get("labels") or {}
-        if labels.get("kubernetes.io/arch") == arch_label:
-            total += 1
-    return f"{cluster_name} cluster has {total} {arch_label} nodes."
-
 def _strip_code_fence(text: str) -> str:
     cleaned = (text or "").strip()
     match = CODE_FENCE_RE.match(cleaned)

From cb8bd3375e70b6f3f55fbb420327f7e4e62e3cbc Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 22:39:01 -0300
Subject: [PATCH 264/416] comms: restart atlasbot for generic planner

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index d5ad62eb..d195e890 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-23
+        checksum/atlasbot-configmap: manual-atlasbot-24
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From d113954f73cd4f4d078f359423261aa0afe796f4 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 22:43:58 -0300
Subject: [PATCH 265/416] atlasbot: add internal endpoint and portal wiring

---
 .../bstein-dev-home/backend-deployment.yaml   |  5 ++
 services/comms/atlasbot-deployment.yaml       |  7 ++-
 services/comms/atlasbot-service.yaml          | 15 +++++
 services/comms/kustomization.yaml             |  1 +
 services/comms/scripts/atlasbot/bot.py        | 58 +++++++++++++++++++
 5 files changed, 85 insertions(+), 1 deletion(-)
 create mode 100644 services/comms/atlasbot-service.yaml

diff --git a/services/bstein-dev-home/backend-deployment.yaml b/services/bstein-dev-home/backend-deployment.yaml
index ecf478cc..26c99e11 100644
--- a/services/bstein-dev-home/backend-deployment.yaml
+++ b/services/bstein-dev-home/backend-deployment.yaml
@@ -28,6 +28,7 @@ spec:
           {{ with secret "kv/data/atlas/shared/chat-ai-keys-runtime" }}
           export CHAT_KEY_MATRIX="{{ .Data.data.matrix }}"
           export CHAT_KEY_HOMEPAGE="{{ .Data.data.homepage }}"
+          export AI_ATLASBOT_TOKEN="{{ .Data.data.homepage }}"
           {{ end }}
           {{ with secret "kv/data/atlas/shared/portal-e2e-client" }}
           export PORTAL_E2E_CLIENT_ID="{{ .Data.data.client_id }}"
@@ -66,6 +67,10 @@ spec:
               value: qwen2.5-coder:7b-instruct-q4_0
             - name: AI_CHAT_TIMEOUT_SEC
               value: "480"
+            - name: AI_ATLASBOT_ENDPOINT
+              value: http://atlasbot.comms.svc.cluster.local:8090/v1/answer
+            - name: AI_ATLASBOT_TIMEOUT_SEC
+              value: "5"
             - name: AI_NODE_NAME
               valueFrom:
                 fieldRef:
diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index d195e890..c0596b67 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-24
+        checksum/atlasbot-configmap: manual-atlasbot-25
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
@@ -87,6 +87,11 @@ spec:
               value: "480"
             - name: ATLASBOT_THINKING_INTERVAL_SEC
               value: "120"
+            - name: ATLASBOT_HTTP_PORT
+              value: "8090"
+          ports:
+            - name: http
+              containerPort: 8090
           resources:
             requests:
               cpu: 100m
diff --git a/services/comms/atlasbot-service.yaml b/services/comms/atlasbot-service.yaml
new file mode 100644
index 00000000..c8b35705
--- /dev/null
+++ b/services/comms/atlasbot-service.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: atlasbot
+  namespace: comms
+  labels:
+    app: atlasbot
+spec:
+  selector:
+    app: atlasbot
+  ports:
+    - name: http
+      port: 8090
+      targetPort: 8090
+  type: ClusterIP
diff --git a/services/comms/kustomization.yaml b/services/comms/kustomization.yaml
index 37f681de..410f2a69 100644
--- a/services/comms/kustomization.yaml
+++ b/services/comms/kustomization.yaml
@@ -14,6 +14,7 @@ resources:
   - guest-register-deployment.yaml
   - guest-register-service.yaml
   - atlasbot-deployment.yaml
+  - atlasbot-service.yaml
   - wellknown.yaml
   - atlasbot-rbac.yaml
   - mas-secrets-ensure-rbac.yaml
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 987df7a1..deb8e62c 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -5,6 +5,7 @@ import re
 import ssl
 import threading
 import time
+from http.server import BaseHTTPRequestHandler, HTTPServer
 from typing import Any
 from urllib import error, parse, request
 
@@ -1089,6 +1090,62 @@ def _normalize_reply(value: Any) -> str:
     return text
 
 
+# Internal HTTP endpoint for cluster answers (website uses this).
+class _AtlasbotHandler(BaseHTTPRequestHandler):
+    server_version = "AtlasbotHTTP/1.0"
+
+    def _write_json(self, status: int, payload: dict[str, Any]):
+        body = json.dumps(payload).encode("utf-8")
+        self.send_response(status)
+        self.send_header("Content-Type", "application/json")
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        self.wfile.write(body)
+
+    def _authorized(self) -> bool:
+        if not ATLASBOT_INTERNAL_TOKEN:
+            return True
+        token = self.headers.get("X-Internal-Token", "")
+        return token == ATLASBOT_INTERNAL_TOKEN
+
+    def do_GET(self):  # noqa: N802
+        if self.path == "/health":
+            self._write_json(200, {"status": "ok"})
+            return
+        self._write_json(404, {"error": "not_found"})
+
+    def do_POST(self):  # noqa: N802
+        if self.path != "/v1/answer":
+            self._write_json(404, {"error": "not_found"})
+            return
+        if not self._authorized():
+            self._write_json(401, {"error": "unauthorized"})
+            return
+        try:
+            length = int(self.headers.get("Content-Length", "0"))
+        except ValueError:
+            length = 0
+        raw = self.rfile.read(length) if length > 0 else b""
+        try:
+            payload = json.loads(raw.decode("utf-8")) if raw else {}
+        except json.JSONDecodeError:
+            self._write_json(400, {"error": "invalid_json"})
+            return
+        prompt = str(payload.get("prompt") or payload.get("question") or "").strip()
+        if not prompt:
+            self._write_json(400, {"error": "missing_prompt"})
+            return
+        inventory = node_inventory_live()
+        answer = structured_answer(prompt, inventory=inventory, metrics_summary="")
+        self._write_json(200, {"answer": answer})
+
+
+def _start_http_server():
+    server = HTTPServer(("0.0.0.0", ATLASBOT_HTTP_PORT), _AtlasbotHandler)
+    thread = threading.Thread(target=server.serve_forever, daemon=True)
+    thread.start()
+
+
 # Conversation state.
 history = collections.defaultdict(list)  # (room_id, sender|None) -> list[str] (short transcript)
 
@@ -1326,6 +1383,7 @@ def login_with_retry():
 
 def main():
     load_kb()
+    _start_http_server()
     token = login_with_retry()
     try:
         room_id = resolve_alias(token, ROOM_ALIAS)

From efe7b9bc5fbd7b7a4dadbf1869323caa862b44e5 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Tue, 27 Jan 2026 01:47:43 +0000
Subject: [PATCH 266/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 90c3b8de..fe604b6c 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-157 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-157 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-158 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 9abfcffd8016540a290bdb7d783d5e75696a49e3 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Tue, 27 Jan 2026 01:47:47 +0000
Subject: [PATCH 267/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index fe604b6c..f50c38b0 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-157 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-158 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-158 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 5393585f3e81423e889deac602b36ae78a2fbb1c Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 22:50:12 -0300
Subject: [PATCH 268/416] monitoring: fix jetson metrics newlines

---
 services/monitoring/scripts/jetson_tegrastats_exporter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py
index c4d3fa2a..c237ec5d 100644
--- a/services/monitoring/scripts/jetson_tegrastats_exporter.py
+++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py
@@ -65,7 +65,7 @@ class Handler(http.server.BaseHTTPRequestHandler):
         for k, v in metrics.items():
             out.append(f"# TYPE jetson_{k} gauge")
             out.append(f"jetson_{k}{label} {v}")
-        body = "\\n".join(out) + "\\n"
+        body = "\n".join(out) + "\n"
         self.send_response(200)
         self.send_header("Content-Type", "text/plain; version=0.0.4")
         self.send_header("Content-Length", str(len(body)))

From e21bc8ef40166c319cdcf3be1d2b0f23154aa7b5 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 22:51:04 -0300
Subject: [PATCH 269/416] atlasbot: prioritize top queries over list

---
 services/comms/scripts/atlasbot/bot.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index deb8e62c..e6c7542b 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -335,7 +335,11 @@ def _has_any(text: str, phrases: tuple[str, ...]) -> bool:
     return any(p in text for p in phrases)
 
 def _detect_operation(q: str) -> str | None:
+    if _has_any(q, OPERATION_HINTS["top"]):
+        return "top"
     for op, phrases in OPERATION_HINTS.items():
+        if op == "top":
+            continue
         if _has_any(q, phrases):
             return op
     return None

From d5478e272ece666a4d5d3fba50190ba53dba4cf5 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 22:51:41 -0300
Subject: [PATCH 270/416] monitoring: restart jetson exporter

---
 services/monitoring/jetson-tegrastats-exporter.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml
index a8354014..8584ebaa 100644
--- a/services/monitoring/jetson-tegrastats-exporter.yaml
+++ b/services/monitoring/jetson-tegrastats-exporter.yaml
@@ -17,6 +17,7 @@ spec:
       annotations:
         prometheus.io/scrape: "true"
         prometheus.io/port: "9100"
+        monitoring.bstein.dev/restart-rev: "1"
     spec:
       serviceAccountName: default
       hostPID: true

From 3340b5bf9d61e8eaeb006d214606909f2180af40 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 22:52:49 -0300
Subject: [PATCH 271/416] comms: restart atlasbot for op priority

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index c0596b67..3ebb8610 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-25
+        checksum/atlasbot-configmap: manual-atlasbot-26
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From 8587da0e372672c654d38e13da13540d7cb515b6 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Mon, 26 Jan 2026 22:54:43 -0300
Subject: [PATCH 272/416] comms: rerun synapse user seed

---
 services/comms/synapse-user-seed-job.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/synapse-user-seed-job.yaml b/services/comms/synapse-user-seed-job.yaml
index 7fef796e..aab88c3b 100644
--- a/services/comms/synapse-user-seed-job.yaml
+++ b/services/comms/synapse-user-seed-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: synapse-user-seed-7
+  name: synapse-user-seed-8
   namespace: comms
 spec:
   backoffLimit: 1

From c6d811e29d9701db44c2212d2eea0b81c7363cbd Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Tue, 27 Jan 2026 02:52:49 +0000
Subject: [PATCH 273/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index f50c38b0..d6208c42 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-158 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-159 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-158 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From a9b6729eb2282f6a3935dea74f2b2f1ef8699c16 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Tue, 27 Jan 2026 02:53:50 +0000
Subject: [PATCH 274/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index d6208c42..a520991b 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-159 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-158 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-159 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 1b6fac86fbe46c5228d8162788fb88583949b4a7 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 01:04:38 -0300
Subject: [PATCH 275/416] vault: bootstrap k8s auth config with root token

---
 services/vault/k8s-auth-config-cronjob.yaml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/services/vault/k8s-auth-config-cronjob.yaml b/services/vault/k8s-auth-config-cronjob.yaml
index 43da16b4..5a2d6829 100644
--- a/services/vault/k8s-auth-config-cronjob.yaml
+++ b/services/vault/k8s-auth-config-cronjob.yaml
@@ -34,6 +34,11 @@ spec:
                   value: http://10.43.57.249:8200
                 - name: VAULT_K8S_ROLE
                   value: vault-admin
+                - name: VAULT_TOKEN
+                  valueFrom:
+                    secretKeyRef:
+                      name: vault-init
+                      key: root_token
                 - name: VAULT_K8S_TOKEN_REVIEWER_JWT_FILE
                   value: /var/run/secrets/vault-token-reviewer/token
                 - name: VAULT_K8S_ROLE_TTL

From e622b1ae09f7fa06593880208ad4c5f57cdc9109 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 01:14:42 -0300
Subject: [PATCH 276/416] comms: rerun ensure jobs and fix vault oidc env

---
 services/comms/comms-secrets-ensure-job.yaml   | 2 +-
 services/comms/mas-local-users-ensure-job.yaml | 2 +-
 services/maintenance/ariadne-deployment.yaml   | 4 ++++
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/services/comms/comms-secrets-ensure-job.yaml b/services/comms/comms-secrets-ensure-job.yaml
index b71dd403..52904cc9 100644
--- a/services/comms/comms-secrets-ensure-job.yaml
+++ b/services/comms/comms-secrets-ensure-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: comms-secrets-ensure-6
+  name: comms-secrets-ensure-7
   namespace: comms
 spec:
   backoffLimit: 1
diff --git a/services/comms/mas-local-users-ensure-job.yaml b/services/comms/mas-local-users-ensure-job.yaml
index c8cf5f04..d385b473 100644
--- a/services/comms/mas-local-users-ensure-job.yaml
+++ b/services/comms/mas-local-users-ensure-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: mas-local-users-ensure-16
+  name: mas-local-users-ensure-17
   namespace: comms
 spec:
   backoffLimit: 1
diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml
index 33b8a12a..6fa638d3 100644
--- a/services/maintenance/ariadne-deployment.yaml
+++ b/services/maintenance/ariadne-deployment.yaml
@@ -89,7 +89,11 @@ spec:
           export VAULT_OIDC_USER_POLICIES="{{ .Data.data.user_policies }}"
           export VAULT_OIDC_REDIRECT_URIS="{{ .Data.data.redirect_uris }}"
           export VAULT_OIDC_BOUND_AUDIENCES="{{ .Data.data.bound_audiences }}"
+          {{- if .Data.data.bound_claims_type }}
           export VAULT_OIDC_BOUND_CLAIMS_TYPE="{{ .Data.data.bound_claims_type }}"
+          {{- else }}
+          export VAULT_OIDC_BOUND_CLAIMS_TYPE="string"
+          {{- end }}
           {{ end }}
     spec:
       serviceAccountName: ariadne

From b2a464b80a9172a5affcca8f27f80956fc85e5a2 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 01:19:43 -0300
Subject: [PATCH 277/416] comms: rerun mas local user ensure

---
 services/comms/mas-local-users-ensure-job.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/mas-local-users-ensure-job.yaml b/services/comms/mas-local-users-ensure-job.yaml
index d385b473..636ee5bb 100644
--- a/services/comms/mas-local-users-ensure-job.yaml
+++ b/services/comms/mas-local-users-ensure-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: mas-local-users-ensure-17
+  name: mas-local-users-ensure-18
   namespace: comms
 spec:
   backoffLimit: 1

From 2d996ffd6ec916d2fcdd806d12248e08e92207da Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 01:22:02 -0300
Subject: [PATCH 278/416] comms: rerun synapse seeder admin ensure

---
 services/comms/synapse-seeder-admin-ensure-job.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/synapse-seeder-admin-ensure-job.yaml b/services/comms/synapse-seeder-admin-ensure-job.yaml
index ce8ccd35..5d2d4225 100644
--- a/services/comms/synapse-seeder-admin-ensure-job.yaml
+++ b/services/comms/synapse-seeder-admin-ensure-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: synapse-seeder-admin-ensure-8
+  name: synapse-seeder-admin-ensure-9
   namespace: comms
 spec:
   backoffLimit: 2

From 3579e906b43e9484389ab52b6fc90c99e7e83ebf Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Tue, 27 Jan 2026 06:51:28 +0000
Subject: [PATCH 279/416] chore(maintenance): automated image update

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 7528f6f3..c8f9f2c8 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,7 +26,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-51 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-54 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From ad6c927370a966b0aedafc3f045c4735c8721a69 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 03:56:00 -0300
Subject: [PATCH 280/416] atlasbot: clarify scoped metrics and format percent
 values

---
 services/comms/scripts/atlasbot/bot.py | 57 ++++++++++++++++++++++++--
 1 file changed, 54 insertions(+), 3 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index e6c7542b..f8b3ccff 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -406,15 +406,56 @@ def _apply_node_filter(expr: str, node_regex: str | None) -> str:
     replacement = f'node_uname_info{{nodename!=\"\",nodename=~\"{node_regex}\"}}'
     return expr.replace(needle, replacement)
 
+def _metric_expr_uses_percent(entry: dict[str, Any]) -> bool:
+    exprs = entry.get("exprs")
+    expr = exprs[0] if isinstance(exprs, list) and exprs else ""
+    return "* 100" in expr or "*100" in expr
+
+
+def _format_metric_value(value: str, *, percent: bool) -> str:
+    try:
+        num = float(value)
+    except (TypeError, ValueError):
+        return value
+    if percent:
+        return f"{num:.1f}%"
+    if abs(num) >= 1:
+        return f"{num:.2f}".rstrip("0").rstrip(".")
+    return f"{num:.4f}".rstrip("0").rstrip(".")
+
+
+def _format_metric_label(metric: dict[str, Any]) -> str:
+    label_parts = []
+    for k in ("namespace", "pod", "container", "node", "instance", "job", "phase"):
+        if metric.get(k):
+            label_parts.append(f"{k}={metric.get(k)}")
+    if not label_parts:
+        for k in sorted(metric.keys()):
+            if k.startswith("__"):
+                continue
+            label_parts.append(f"{k}={metric.get(k)}")
+            if len(label_parts) >= 4:
+                break
+    return ", ".join(label_parts) if label_parts else "series"
+
+
 def _format_metric_answer(entry: dict[str, Any], res: dict | None) -> str:
     series = _vm_value_series(res)
     panel = entry.get("panel_title") or "Metric"
     if not series:
         return ""
-    rendered = vm_render_result(res, limit=5)
-    if not rendered:
+    percent = _metric_expr_uses_percent(entry)
+    lines: list[str] = []
+    for r in series[:5]:
+        if not isinstance(r, dict):
+            continue
+        metric = r.get("metric") or {}
+        value = r.get("value") or []
+        val = value[1] if isinstance(value, list) and len(value) > 1 else ""
+        label = _format_metric_label(metric if isinstance(metric, dict) else {})
+        lines.append(f"{label}: {_format_metric_value(val, percent=percent)}")
+    if not lines:
         return ""
-    lines = [line.lstrip("-").strip() for line in rendered.splitlines() if line.strip().startswith("-")]
     if len(lines) == 1:
         return f"{panel}: {lines[0]}."
     return f"{panel}:\n" + "\n".join(f"- {line}" for line in lines)
@@ -627,6 +668,16 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s
             res = vm_query(expr, timeout=20)
             answer = _format_metric_answer(entry, res)
             if answer:
+                scope_parts: list[str] = []
+                if include_hw:
+                    scope_parts.append(" and ".join(sorted(include_hw)))
+                if exclude_hw:
+                    scope_parts.append(f"excluding {' and '.join(sorted(exclude_hw))}")
+                if only_workers:
+                    scope_parts.append("worker")
+                if scope_parts:
+                    scope = " ".join(scope_parts)
+                    return f"Among {scope} nodes, {answer}"
                 return answer
         if metrics_summary:
             return metrics_summary

From 16a059134ae2b10ed237f929596c33c5af50b6bd Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 03:56:47 -0300
Subject: [PATCH 281/416] comms: restart atlasbot for metrics formatting

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 3ebb8610..83e0b2ed 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-26
+        checksum/atlasbot-configmap: manual-atlasbot-27
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From 00c0375790148e4e2c41e885eedf854578ddef29 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 04:48:44 -0300
Subject: [PATCH 282/416] comms: add synapse admin ensure job

---
 services/comms/kustomization.yaml             |   1 +
 services/comms/synapse-admin-ensure-job.yaml  | 177 ++++++++++++++++++
 services/maintenance/ariadne-deployment.yaml  |   3 +
 .../vault/scripts/vault_k8s_auth_configure.sh |   4 +-
 4 files changed, 183 insertions(+), 2 deletions(-)
 create mode 100644 services/comms/synapse-admin-ensure-job.yaml

diff --git a/services/comms/kustomization.yaml b/services/comms/kustomization.yaml
index 410f2a69..01d7be5c 100644
--- a/services/comms/kustomization.yaml
+++ b/services/comms/kustomization.yaml
@@ -25,6 +25,7 @@ resources:
   - mas-admin-client-secret-ensure-job.yaml
   - mas-db-ensure-job.yaml
   - comms-secrets-ensure-job.yaml
+  - synapse-admin-ensure-job.yaml
   - synapse-signingkey-ensure-job.yaml
   - synapse-seeder-admin-ensure-job.yaml
   - synapse-user-seed-job.yaml
diff --git a/services/comms/synapse-admin-ensure-job.yaml b/services/comms/synapse-admin-ensure-job.yaml
new file mode 100644
index 00000000..be9e0fd1
--- /dev/null
+++ b/services/comms/synapse-admin-ensure-job.yaml
@@ -0,0 +1,177 @@
+# services/comms/synapse-admin-ensure-job.yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: synapse-admin-ensure-1
+  namespace: comms
+spec:
+  backoffLimit: 1
+  ttlSecondsAfterFinished: 3600
+  template:
+    spec:
+      serviceAccountName: comms-secrets-ensure
+      restartPolicy: OnFailure
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: node-role.kubernetes.io/worker
+                    operator: Exists
+          preferredDuringSchedulingIgnoredDuringExecution:
+            - weight: 100
+              preference:
+                matchExpressions:
+                  - key: kubernetes.io/arch
+                    operator: In
+                    values: ["arm64"]
+      containers:
+        - name: ensure
+          image: python:3.11-slim
+          env:
+            - name: VAULT_ADDR
+              value: http://vault.vault.svc.cluster.local:8200
+            - name: VAULT_ROLE
+              value: comms-secrets
+            - name: SYNAPSE_ADMIN_URL
+              value: http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008
+          command:
+            - /bin/sh
+            - -c
+            - |
+              set -euo pipefail
+              python - <<'PY'
+              import base64
+              import hashlib
+              import hmac
+              import json
+              import os
+              import secrets
+              import string
+              import urllib.error
+              import urllib.request
+
+              VAULT_ADDR = os.environ.get("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200").rstrip("/")
+              VAULT_ROLE = os.environ.get("VAULT_ROLE", "comms-secrets")
+              SYNAPSE_ADMIN_URL = os.environ.get(
+                  "SYNAPSE_ADMIN_URL",
+                  "http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008",
+              ).rstrip("/")
+              SA_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token"
+
+              def log(msg: str) -> None:
+                  print(msg, flush=True)
+
+              def request_json(url: str, payload: dict | None = None) -> dict:
+                  data = None
+                  headers = {"Content-Type": "application/json"}
+                  if payload is not None:
+                      data = json.dumps(payload).encode("utf-8")
+                  req = urllib.request.Request(url, data=data, headers=headers, method="POST" if data else "GET")
+                  with urllib.request.urlopen(req, timeout=30) as resp:
+                      return json.loads(resp.read().decode("utf-8"))
+
+              def vault_login() -> str:
+                  with open(SA_TOKEN_PATH, "r", encoding="utf-8") as f:
+                      jwt = f.read().strip()
+                  payload = {"jwt": jwt, "role": VAULT_ROLE}
+                  resp = request_json(f"{VAULT_ADDR}/v1/auth/kubernetes/login", payload)
+                  token = resp.get("auth", {}).get("client_token")
+                  if not token:
+                      raise RuntimeError("vault login failed")
+                  return token
+
+              def vault_get(token: str, path: str) -> dict:
+                  req = urllib.request.Request(
+                      f"{VAULT_ADDR}/v1/kv/data/atlas/{path}",
+                      headers={"X-Vault-Token": token},
+                  )
+                  try:
+                      with urllib.request.urlopen(req, timeout=30) as resp:
+                          payload = json.loads(resp.read().decode("utf-8"))
+                          return payload.get("data", {}).get("data", {})
+                  except urllib.error.HTTPError as exc:
+                      if exc.code == 404:
+                          return {}
+                      raise
+
+              def vault_put(token: str, path: str, data: dict) -> None:
+                  payload = {"data": data}
+                  req = urllib.request.Request(
+                      f"{VAULT_ADDR}/v1/kv/data/atlas/{path}",
+                      data=json.dumps(payload).encode("utf-8"),
+                      headers={"X-Vault-Token": token, "Content-Type": "application/json"},
+                      method="POST",
+                  )
+                  with urllib.request.urlopen(req, timeout=30) as resp:
+                      resp.read()
+
+              def random_password(length: int = 32) -> str:
+                  alphabet = string.ascii_letters + string.digits
+                  return "".join(secrets.choice(alphabet) for _ in range(length))
+
+              def ensure_registration_secret(token: str) -> str:
+                  data = vault_get(token, "comms/synapse-registration")
+                  secret = (data.get("registration_shared_secret") or "").strip()
+                  if not secret:
+                      secret = secrets.token_urlsafe(32)
+                      data["registration_shared_secret"] = secret
+                      vault_put(token, "comms/synapse-registration", data)
+                      log("registration secret created")
+                  return secret
+
+              def ensure_admin_creds(token: str) -> dict:
+                  data = vault_get(token, "comms/synapse-admin")
+                  username = (data.get("username") or "").strip() or "synapse-admin"
+                  password = (data.get("password") or "").strip()
+                  if not password:
+                      password = random_password()
+                  data["username"] = username
+                  data["password"] = password
+                  vault_put(token, "comms/synapse-admin", data)
+                  return data
+
+              def register_admin(secret: str, username: str, password: str) -> str:
+                  nonce_payload = request_json(f"{SYNAPSE_ADMIN_URL}/_synapse/admin/v1/register")
+                  nonce = nonce_payload.get("nonce")
+                  if not nonce:
+                      raise RuntimeError("synapse register nonce missing")
+                  admin_flag = "admin"
+                  user_type = ""
+                  mac_payload = "\x00".join([nonce, username, password, admin_flag, user_type])
+                  mac = hmac.new(secret.encode("utf-8"), mac_payload.encode("utf-8"), hashlib.sha1).hexdigest()
+                  payload = {
+                      "nonce": nonce,
+                      "username": username,
+                      "password": password,
+                      "admin": True,
+                      "mac": mac,
+                  }
+                  req = urllib.request.Request(
+                      f"{SYNAPSE_ADMIN_URL}/_synapse/admin/v1/register",
+                      data=json.dumps(payload).encode("utf-8"),
+                      headers={"Content-Type": "application/json"},
+                      method="POST",
+                  )
+                  try:
+                      with urllib.request.urlopen(req, timeout=30) as resp:
+                          payload = json.loads(resp.read().decode("utf-8"))
+                  except urllib.error.HTTPError as exc:
+                      body = exc.read().decode("utf-8")
+                      raise RuntimeError(f"synapse admin register failed: {exc.code} {body}") from exc
+                  access_token = payload.get("access_token")
+                  if not access_token:
+                      raise RuntimeError("synapse admin token missing")
+                  return access_token
+
+              vault_token = vault_login()
+              reg_secret = ensure_registration_secret(vault_token)
+              admin_data = ensure_admin_creds(vault_token)
+              if admin_data.get("access_token"):
+                  log("synapse admin token already present")
+                  raise SystemExit(0)
+              access_token = register_admin(reg_secret, admin_data["username"], admin_data["password"])
+              admin_data["access_token"] = access_token
+              vault_put(vault_token, "comms/synapse-admin", admin_data)
+              log("synapse admin user ensured")
+              PY
diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml
index 6fa638d3..fce1ded5 100644
--- a/services/maintenance/ariadne-deployment.yaml
+++ b/services/maintenance/ariadne-deployment.yaml
@@ -69,6 +69,9 @@ spec:
           export COMMS_BOT_PASSWORD="{{ index .Data.data "bot-password" }}"
           export COMMS_SEEDER_PASSWORD="{{ index .Data.data "seeder-password" }}"
           {{ end }}
+          {{ with secret "kv/data/atlas/comms/synapse-admin" }}
+          export COMMS_SYNAPSE_ADMIN_TOKEN="{{ .Data.data.access_token }}"
+          {{ end }}
           {{ with secret "kv/data/atlas/comms/synapse-db" }}
           export COMMS_SYNAPSE_DB_PASSWORD="{{ .Data.data.POSTGRES_PASSWORD }}"
           {{ end }}
diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh
index 21132c79..0212180f 100644
--- a/services/vault/scripts/vault_k8s_auth_configure.sh
+++ b/services/vault/scripts/vault_k8s_auth_configure.sh
@@ -231,7 +231,7 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \
 write_policy_and_role "health" "health" "health-vault-sync" \
   "health/*" ""
 write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \
-  "maintenance/ariadne-db portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db vault/vault-oidc-config shared/harbor-pull" ""
+  "maintenance/ariadne-db portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin vault/vault-oidc-config shared/harbor-pull" ""
 write_policy_and_role "finance" "finance" "finance-vault" \
   "finance/* shared/postmark-relay" ""
 write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \
@@ -253,4 +253,4 @@ write_policy_and_role "crypto-secrets" "crypto" "crypto-secrets-ensure" \
 write_policy_and_role "comms-secrets" "comms" \
   "comms-secrets-ensure,mas-db-ensure,mas-admin-client-secret-writer,othrys-synapse-signingkey-job" \
   "" \
-  "comms/turn-shared-secret comms/livekit-api comms/synapse-redis comms/synapse-macaroon comms/atlasbot-credentials-runtime comms/synapse-db comms/mas-db comms/mas-admin-client-runtime comms/mas-secrets-runtime comms/othrys-synapse-signingkey"
+  "comms/turn-shared-secret comms/livekit-api comms/synapse-redis comms/synapse-macaroon comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin comms/synapse-registration comms/mas-db comms/mas-admin-client-runtime comms/mas-secrets-runtime comms/othrys-synapse-signingkey"

From 64e59a9b77a0043d5e76f923959d244411eb67e3 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 04:51:20 -0300
Subject: [PATCH 283/416] atlasbot: add knowledge summaries and better fallback

---
 services/comms/scripts/atlasbot/bot.py | 110 +++++++++++++++++++++++--
 1 file changed, 103 insertions(+), 7 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index f8b3ccff..3a1a0002 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -254,14 +254,14 @@ def load_kb():
     _NAME_INDEX = names
     _METRIC_INDEX = metrics if isinstance(metrics, list) else []
 
-def kb_retrieve(query: str, *, limit: int = 3) -> str:
+def _score_kb_docs(query: str) -> list[dict[str, Any]]:
     q = (query or "").strip()
     if not q or not KB.get("runbooks"):
-        return ""
+        return []
     ql = q.lower()
     q_tokens = _tokens(q)
     if not q_tokens:
-        return ""
+        return []
 
     scored: list[tuple[int, dict]] = []
     for doc in KB.get("runbooks", []):
@@ -281,9 +281,16 @@ def kb_retrieve(query: str, *, limit: int = 3) -> str:
                 score += 4
         if score:
             scored.append((score, doc))
-
     scored.sort(key=lambda x: x[0], reverse=True)
-    picked = [d for _, d in scored[:limit]]
+    return [d for _, d in scored]
+
+
+def kb_retrieve(query: str, *, limit: int = 3) -> str:
+    q = (query or "").strip()
+    if not q:
+        return ""
+    scored = _score_kb_docs(q)
+    picked = scored[:limit]
     if not picked:
         return ""
 
@@ -301,6 +308,22 @@ def kb_retrieve(query: str, *, limit: int = 3) -> str:
         used += len(chunk)
     return "\n".join(parts).strip()
 
+
+def kb_retrieve_titles(query: str, *, limit: int = 4) -> str:
+    scored = _score_kb_docs(query)
+    picked = scored[:limit]
+    if not picked:
+        return ""
+    parts = ["Relevant runbooks:"]
+    for doc in picked:
+        title = doc.get("title") or doc.get("path") or "runbook"
+        path = doc.get("path") or ""
+        if path:
+            parts.append(f"- {title} ({path})")
+        else:
+            parts.append(f"- {title}")
+    return "\n".join(parts)
+
 def _extract_titan_nodes(text: str) -> list[str]:
     cleaned = normalize_query(text)
     names = {n.lower() for n in TITAN_NODE_RE.findall(cleaned) if n}
@@ -439,6 +462,18 @@ def _format_metric_label(metric: dict[str, Any]) -> str:
     return ", ".join(label_parts) if label_parts else "series"
 
 
+def _primary_series_metric(res: dict | None) -> tuple[str | None, str | None]:
+    series = _vm_value_series(res or {})
+    if not series:
+        return (None, None)
+    first = series[0]
+    metric = first.get("metric") if isinstance(first, dict) else {}
+    value = first.get("value") if isinstance(first, dict) else []
+    node = metric.get("node") if isinstance(metric, dict) else None
+    val = value[1] if isinstance(value, list) and len(value) > 1 else None
+    return (node, val)
+
+
 def _format_metric_answer(entry: dict[str, Any], res: dict | None) -> str:
     series = _vm_value_series(res)
     panel = entry.get("panel_title") or "Metric"
@@ -677,7 +712,15 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s
                     scope_parts.append("worker")
                 if scope_parts:
                     scope = " ".join(scope_parts)
-                    return f"Among {scope} nodes, {answer}"
+                    overall_note = ""
+                    base_res = vm_query(entry["exprs"][0], timeout=20)
+                    base_node, base_val = _primary_series_metric(base_res)
+                    scoped_node, scoped_val = _primary_series_metric(res)
+                    if base_node and scoped_node and base_node != scoped_node:
+                        percent = _metric_expr_uses_percent(entry)
+                        base_val_fmt = _format_metric_value(base_val or "", percent=percent)
+                        overall_note = f" Overall hottest node: {base_node} ({base_val_fmt})."
+                    return f"Among {scope} nodes, {answer}{overall_note}"
                 return answer
         if metrics_summary:
             return metrics_summary
@@ -1075,7 +1118,7 @@ def _context_fallback(context: str) -> str:
     trimmed = context.strip()
     if len(trimmed) > MAX_TOOL_CHARS:
         trimmed = trimmed[: MAX_TOOL_CHARS - 3].rstrip() + "..."
-    return "I couldn’t reach the model backend. Here is the data I found:\n" + trimmed
+    return "Here is what I found:\n" + trimmed
 
 def vm_top_restarts(hours: int = 1) -> str:
     q = f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{hours}h])))"
@@ -1192,6 +1235,11 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
             return
         inventory = node_inventory_live()
         answer = structured_answer(prompt, inventory=inventory, metrics_summary="")
+        if not answer and _knowledge_intent(prompt):
+            answer = knowledge_summary(prompt, inventory)
+        if not answer:
+            kb = kb_retrieve_titles(prompt, limit=4)
+            answer = kb or ""
         self._write_json(200, {"answer": answer})
 
 
@@ -1257,6 +1305,48 @@ def build_context(
 
     return "\n\n".join([p for p in parts if p]).strip()
 
+
+def _knowledge_intent(prompt: str) -> bool:
+    q = normalize_query(prompt)
+    return any(
+        phrase in q
+        for phrase in (
+            "what do you know",
+            "tell me about",
+            "overview",
+            "summary",
+            "describe",
+            "explain",
+            "what is",
+        )
+    )
+
+
+def _inventory_summary(inventory: list[dict[str, Any]]) -> str:
+    if not inventory:
+        return ""
+    groups = _group_nodes(inventory)
+    total = len(inventory)
+    ready = [n for n in inventory if n.get("ready") is True]
+    not_ready = [n for n in inventory if n.get("ready") is False]
+    parts = [f"Atlas cluster: {total} nodes ({len(ready)} ready, {len(not_ready)} not ready)."]
+    for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"):
+        nodes = groups.get(key) or []
+        if nodes:
+            parts.append(f"- {key}: {len(nodes)} nodes ({', '.join(nodes)})")
+    return "\n".join(parts)
+
+
+def knowledge_summary(prompt: str, inventory: list[dict[str, Any]]) -> str:
+    parts: list[str] = []
+    inv = _inventory_summary(inventory)
+    if inv:
+        parts.append(inv)
+    kb_titles = kb_retrieve_titles(prompt, limit=4)
+    if kb_titles:
+        parts.append(kb_titles)
+    return "\n".join(parts).strip()
+
 def _ollama_call(hist_key, prompt: str, *, context: str) -> str:
     system = (
         "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. "
@@ -1416,6 +1506,12 @@ def sync_loop(token: str, room_id: str):
                     send_msg(token, rid, structured)
                     continue
 
+                if _knowledge_intent(body):
+                    summary = knowledge_summary(body, inventory)
+                    if summary:
+                        send_msg(token, rid, summary)
+                        continue
+
                 reply = ollama_reply_with_thinking(
                     token,
                     rid,

From 7c14fe7b3c0541d4223b98da47087f08e97b9d4f Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 04:51:33 -0300
Subject: [PATCH 284/416] comms: restart atlasbot for knowledge summaries

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 83e0b2ed..5198f2a9 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-27
+        checksum/atlasbot-configmap: manual-atlasbot-28
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From 89949835d95a6839f850a71eef9e524cea4c7ee3 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 04:53:33 -0300
Subject: [PATCH 285/416] atlasbot: scope overall hottest node to atlas
 inventory

---
 services/comms/scripts/atlasbot/bot.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 3a1a0002..8df1317d 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -713,7 +713,12 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s
                 if scope_parts:
                     scope = " ".join(scope_parts)
                     overall_note = ""
-                    base_res = vm_query(entry["exprs"][0], timeout=20)
+                    base_expr = entry["exprs"][0]
+                    if inventory:
+                        all_nodes = "|".join([n["name"] for n in inventory])
+                        if all_nodes:
+                            base_expr = _apply_node_filter(base_expr, all_nodes)
+                    base_res = vm_query(base_expr, timeout=20)
                     base_node, base_val = _primary_series_metric(base_res)
                     scoped_node, scoped_val = _primary_series_metric(res)
                     if base_node and scoped_node and base_node != scoped_node:

From 48b5b018cad32af9b1439a3d597c58e8a2a617d9 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 04:53:44 -0300
Subject: [PATCH 286/416] comms: restart atlasbot for scoped hottest

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 5198f2a9..e35fa619 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-28
+        checksum/atlasbot-configmap: manual-atlasbot-29
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From 4083c3dcfafea08ce4d6402d3cb62244976a4249 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 04:58:13 -0300
Subject: [PATCH 287/416] comms: ensure synapse admin token

---
 services/comms/synapse-admin-ensure-job.yaml | 141 ++++++++++++-------
 1 file changed, 89 insertions(+), 52 deletions(-)

diff --git a/services/comms/synapse-admin-ensure-job.yaml b/services/comms/synapse-admin-ensure-job.yaml
index be9e0fd1..6ddea830 100644
--- a/services/comms/synapse-admin-ensure-job.yaml
+++ b/services/comms/synapse-admin-ensure-job.yaml
@@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: synapse-admin-ensure-1
+  name: synapse-admin-ensure-2
   namespace: comms
 spec:
   backoffLimit: 1
@@ -40,24 +40,26 @@ spec:
             - -c
             - |
               set -euo pipefail
+              pip install --no-cache-dir psycopg2-binary bcrypt >/dev/null
               python - <<'PY'
-              import base64
-              import hashlib
-              import hmac
               import json
               import os
               import secrets
               import string
+              import time
               import urllib.error
               import urllib.request
 
+              import bcrypt
+              import psycopg2
+
               VAULT_ADDR = os.environ.get("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200").rstrip("/")
               VAULT_ROLE = os.environ.get("VAULT_ROLE", "comms-secrets")
-              SYNAPSE_ADMIN_URL = os.environ.get(
-                  "SYNAPSE_ADMIN_URL",
-                  "http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008",
-              ).rstrip("/")
               SA_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token"
+              PGHOST = "postgres-service.postgres.svc.cluster.local"
+              PGPORT = 5432
+              PGDATABASE = "synapse"
+              PGUSER = "synapse"
 
               def log(msg: str) -> None:
                   print(msg, flush=True)
@@ -110,16 +112,6 @@ spec:
                   alphabet = string.ascii_letters + string.digits
                   return "".join(secrets.choice(alphabet) for _ in range(length))
 
-              def ensure_registration_secret(token: str) -> str:
-                  data = vault_get(token, "comms/synapse-registration")
-                  secret = (data.get("registration_shared_secret") or "").strip()
-                  if not secret:
-                      secret = secrets.token_urlsafe(32)
-                      data["registration_shared_secret"] = secret
-                      vault_put(token, "comms/synapse-registration", data)
-                      log("registration secret created")
-                  return secret
-
               def ensure_admin_creds(token: str) -> dict:
                   data = vault_get(token, "comms/synapse-admin")
                   username = (data.get("username") or "").strip() or "synapse-admin"
@@ -131,47 +123,92 @@ spec:
                   vault_put(token, "comms/synapse-admin", data)
                   return data
 
-              def register_admin(secret: str, username: str, password: str) -> str:
-                  nonce_payload = request_json(f"{SYNAPSE_ADMIN_URL}/_synapse/admin/v1/register")
-                  nonce = nonce_payload.get("nonce")
-                  if not nonce:
-                      raise RuntimeError("synapse register nonce missing")
-                  admin_flag = "admin"
-                  user_type = ""
-                  mac_payload = "\x00".join([nonce, username, password, admin_flag, user_type])
-                  mac = hmac.new(secret.encode("utf-8"), mac_payload.encode("utf-8"), hashlib.sha1).hexdigest()
-                  payload = {
-                      "nonce": nonce,
-                      "username": username,
-                      "password": password,
-                      "admin": True,
-                      "mac": mac,
+              def ensure_user(cur, cols, user_id, password, admin):
+                  now_ms = int(time.time() * 1000)
+                  values = {
+                      "name": user_id,
+                      "password_hash": bcrypt.hashpw(password.encode(), bcrypt.gensalt()).decode(),
+                      "creation_ts": now_ms,
                   }
-                  req = urllib.request.Request(
-                      f"{SYNAPSE_ADMIN_URL}/_synapse/admin/v1/register",
-                      data=json.dumps(payload).encode("utf-8"),
-                      headers={"Content-Type": "application/json"},
-                      method="POST",
+
+                  def add_flag(name, flag):
+                      if name not in cols:
+                          return
+                      if cols[name]["type"] in ("smallint", "integer"):
+                          values[name] = int(flag)
+                      else:
+                          values[name] = bool(flag)
+
+                  add_flag("admin", admin)
+                  add_flag("deactivated", False)
+                  add_flag("shadow_banned", False)
+                  add_flag("is_guest", False)
+
+                  columns = list(values.keys())
+                  placeholders = ", ".join(["%s"] * len(columns))
+                  updates = ", ".join([f"{col}=EXCLUDED.{col}" for col in columns if col != "name"])
+                  query = f"INSERT INTO users ({', '.join(columns)}) VALUES ({placeholders}) ON CONFLICT (name) DO UPDATE SET {updates};"
+                  cur.execute(query, [values[c] for c in columns])
+
+              def get_cols(cur):
+                  cur.execute(
+                      """
+                      SELECT column_name, is_nullable, column_default, data_type
+                      FROM information_schema.columns
+                      WHERE table_schema = 'public' AND table_name = 'users'
+                      """
+                  )
+                  cols = {}
+                  for name, is_nullable, default, data_type in cur.fetchall():
+                      cols[name] = {
+                          "nullable": is_nullable == "YES",
+                          "default": default,
+                          "type": data_type,
+                      }
+                  return cols
+
+              def ensure_access_token(cur, user_id, token_value):
+                  cur.execute("SELECT COALESCE(MAX(id), 0) + 1 FROM access_tokens")
+                  token_id = cur.fetchone()[0]
+                  cur.execute(
+                      """
+                      INSERT INTO access_tokens (id, user_id, token, device_id, valid_until_ms)
+                      VALUES (%s, %s, %s, %s, NULL)
+                      ON CONFLICT (token) DO NOTHING
+                      """,
+                      (token_id, user_id, token_value, "ariadne-admin"),
                   )
-                  try:
-                      with urllib.request.urlopen(req, timeout=30) as resp:
-                          payload = json.loads(resp.read().decode("utf-8"))
-                  except urllib.error.HTTPError as exc:
-                      body = exc.read().decode("utf-8")
-                      raise RuntimeError(f"synapse admin register failed: {exc.code} {body}") from exc
-                  access_token = payload.get("access_token")
-                  if not access_token:
-                      raise RuntimeError("synapse admin token missing")
-                  return access_token
 
               vault_token = vault_login()
-              reg_secret = ensure_registration_secret(vault_token)
               admin_data = ensure_admin_creds(vault_token)
               if admin_data.get("access_token"):
                   log("synapse admin token already present")
                   raise SystemExit(0)
-              access_token = register_admin(reg_secret, admin_data["username"], admin_data["password"])
-              admin_data["access_token"] = access_token
+
+              synapse_db = vault_get(vault_token, "comms/synapse-db")
+              pg_password = synapse_db.get("POSTGRES_PASSWORD")
+              if not pg_password:
+                  raise RuntimeError("synapse db password missing")
+
+              user_id = f"@{admin_data['username']}:live.bstein.dev"
+              conn = psycopg2.connect(
+                  host=PGHOST,
+                  port=PGPORT,
+                  dbname=PGDATABASE,
+                  user=PGUSER,
+                  password=pg_password,
+              )
+              token_value = secrets.token_urlsafe(32)
+              try:
+                  with conn:
+                      with conn.cursor() as cur:
+                          cols = get_cols(cur)
+                          ensure_user(cur, cols, user_id, admin_data["password"], True)
+                          ensure_access_token(cur, user_id, token_value)
+              finally:
+                  conn.close()
+
+              admin_data["access_token"] = token_value
               vault_put(vault_token, "comms/synapse-admin", admin_data)
-              log("synapse admin user ensured")
+              log("synapse admin token stored")
               PY

From 47f049d39260aefbf2b9fff73e651618ac997414 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 05:02:02 -0300
Subject: [PATCH 288/416] comms: retain synapse admin ensure logs

---
 services/comms/synapse-admin-ensure-job.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/services/comms/synapse-admin-ensure-job.yaml b/services/comms/synapse-admin-ensure-job.yaml
index 6ddea830..5ddf60c4 100644
--- a/services/comms/synapse-admin-ensure-job.yaml
+++ b/services/comms/synapse-admin-ensure-job.yaml
@@ -2,15 +2,15 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: synapse-admin-ensure-2
+  name: synapse-admin-ensure-3
   namespace: comms
 spec:
-  backoffLimit: 1
+  backoffLimit: 0
   ttlSecondsAfterFinished: 3600
   template:
     spec:
       serviceAccountName: comms-secrets-ensure
-      restartPolicy: OnFailure
+      restartPolicy: Never
       affinity:
         nodeAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:
@@ -40,7 +40,7 @@ spec:
             - -c
             - |
               set -euo pipefail
-              pip install --no-cache-dir psycopg2-binary bcrypt >/dev/null
+              pip install --no-cache-dir psycopg2-binary bcrypt
               python - <<'PY'
               import json
               import os

From aed70963cc54514c8b3a9739fbaea86515e0a602 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Tue, 27 Jan 2026 08:14:36 +0000
Subject: [PATCH 289/416] chore(maintenance): automated image update

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index c8f9f2c8..1392855b 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,7 +26,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-54 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-56 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From ef7946b4f27523f8ae638a7c26c65b18773cf608 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 05:41:58 -0300
Subject: [PATCH 290/416] atlasbot: use cluster snapshot + model update

---
 services/ai-llm/deployment.yaml         |   4 +-
 services/comms/atlasbot-deployment.yaml |   6 +-
 services/comms/scripts/atlasbot/bot.py  | 368 +++++++++++++++++++++---
 3 files changed, 334 insertions(+), 44 deletions(-)

diff --git a/services/ai-llm/deployment.yaml b/services/ai-llm/deployment.yaml
index 4f34d866..43d14c81 100644
--- a/services/ai-llm/deployment.yaml
+++ b/services/ai-llm/deployment.yaml
@@ -20,7 +20,7 @@ spec:
       labels:
         app: ollama
       annotations:
-        ai.bstein.dev/model: qwen2.5-coder:7b-instruct-q4_0
+        ai.bstein.dev/model: qwen2.5:7b-instruct-q4_0
         ai.bstein.dev/gpu: GPU pool (titan-22/24)
         ai.bstein.dev/restartedAt: "2026-01-26T12:00:00Z"
     spec:
@@ -52,7 +52,7 @@ spec:
             - name: OLLAMA_MODELS
               value: /root/.ollama
             - name: OLLAMA_MODEL
-              value: qwen2.5-coder:7b-instruct-q4_0
+              value: qwen2.5:7b-instruct-q4_0
           command:
             - /bin/sh
             - -c
diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index e35fa619..0ee86f01 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -82,11 +82,13 @@ spec:
             - name: OLLAMA_URL
               value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/
             - name: OLLAMA_MODEL
-              value: qwen2.5-coder:7b-instruct-q4_0
+              value: qwen2.5:7b-instruct-q4_0
             - name: OLLAMA_TIMEOUT_SEC
-              value: "480"
+              value: "600"
             - name: ATLASBOT_THINKING_INTERVAL_SEC
               value: "120"
+            - name: ATLASBOT_SNAPSHOT_TTL_SEC
+              value: "30"
             - name: ATLASBOT_HTTP_PORT
               value: "8090"
           ports:
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 8df1317d..9f6c38dc 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -21,6 +21,7 @@ API_KEY = os.environ.get("CHAT_API_KEY", "")
 OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480"))
 ATLASBOT_HTTP_PORT = int(os.environ.get("ATLASBOT_HTTP_PORT", "8090"))
 ATLASBOT_INTERNAL_TOKEN = os.environ.get("ATLASBOT_INTERNAL_TOKEN") or os.environ.get("CHAT_API_HOMEPAGE", "")
+SNAPSHOT_TTL_SEC = int(os.environ.get("ATLASBOT_SNAPSHOT_TTL_SEC", "30"))
 
 KB_DIR = os.environ.get("KB_DIR", "")
 VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428")
@@ -523,7 +524,7 @@ def _hardware_match(node: dict[str, Any], filters: set[str]) -> bool:
     hw = node.get("hardware") or ""
     arch = node.get("arch") or ""
     for f in filters:
-        if f == "rpi" and hw in ("rpi4", "rpi5"):
+        if f == "rpi" and hw in ("rpi4", "rpi5", "rpi"):
             return True
         if f == "arm64" and arch == "arm64":
             return True
@@ -546,7 +547,7 @@ def _hardware_class(labels: dict[str, Any]) -> str:
     if str(labels.get("jetson") or "").lower() == "true":
         return "jetson"
     hardware = (labels.get("hardware") or "").strip().lower()
-    if hardware in ("rpi4", "rpi5"):
+    if hardware in ("rpi4", "rpi5", "rpi"):
         return hardware
     arch = labels.get("kubernetes.io/arch") or labels.get("beta.kubernetes.io/arch") or ""
     if arch == "amd64":
@@ -580,6 +581,14 @@ def node_inventory_live() -> list[dict[str, Any]]:
         )
     return sorted(inventory, key=lambda item: item["name"])
 
+
+def node_inventory() -> list[dict[str, Any]]:
+    snapshot = _snapshot_state()
+    inventory = _snapshot_inventory(snapshot)
+    if inventory:
+        return inventory
+    return node_inventory_live()
+
 def _group_nodes(inventory: list[dict[str, Any]]) -> dict[str, list[str]]:
     grouped: dict[str, list[str]] = collections.defaultdict(list)
     for node in inventory:
@@ -591,7 +600,7 @@ def node_inventory_context(query: str, inventory: list[dict[str, Any]] | None =
     if not any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster")):
         return ""
     if inventory is None:
-        inventory = node_inventory_live()
+        inventory = node_inventory()
     if not inventory:
         return ""
     groups = _group_nodes(inventory)
@@ -626,7 +635,7 @@ def node_inventory_context(query: str, inventory: list[dict[str, Any]] | None =
 def node_inventory_for_prompt(prompt: str) -> list[dict[str, Any]]:
     q = normalize_query(prompt)
     if any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster", "worker")):
-        return node_inventory_live()
+        return node_inventory()
     return []
 
 def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]:
@@ -656,11 +665,177 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]:
         "expected_missing": sorted(expected_missing),
     }
 
-def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_summary: str) -> str:
+
+def _workload_tokens(entry: dict[str, Any]) -> set[str]:
+    tokens: set[str] = set()
+    for key in ("workload", "namespace"):
+        value = entry.get(key)
+        if isinstance(value, str) and value:
+            tokens.update(_tokens(value))
+    return tokens
+
+
+def _select_workload(prompt: str, workloads: list[dict[str, Any]]) -> dict[str, Any] | None:
+    q_tokens = set(_tokens(prompt))
+    if not q_tokens:
+        return None
+    scored: list[tuple[int, dict[str, Any]]] = []
+    for entry in workloads:
+        if not isinstance(entry, dict):
+            continue
+        tokens = _workload_tokens(entry)
+        score = len(tokens & q_tokens)
+        if score:
+            scored.append((score, entry))
+    if not scored:
+        return None
+    scored.sort(key=lambda item: item[0], reverse=True)
+    return scored[0][1]
+
+
+def _format_confidence(answer: str, confidence: str) -> str:
+    if not answer:
+        return ""
+    return f"{answer}\nConfidence: {confidence}."
+
+
+def workload_answer(prompt: str, workloads: list[dict[str, Any]]) -> str:
+    q = normalize_query(prompt)
+    if not any(word in q for word in ("where", "which", "node", "run", "running", "host", "located")):
+        return ""
+    entry = _select_workload(prompt, workloads)
+    if not entry:
+        return ""
+    workload = entry.get("workload") or ""
+    namespace = entry.get("namespace") or ""
+    nodes = entry.get("nodes") if isinstance(entry.get("nodes"), dict) else {}
+    primary = entry.get("primary_node") or ""
+    if not workload or not nodes:
+        return ""
+    parts = []
+    if primary:
+        parts.append(f"{primary} (primary)")
+    for node, count in sorted(nodes.items(), key=lambda item: (-item[1], item[0])):
+        if node == primary:
+            continue
+        parts.append(f"{node} ({count} pod{'s' if count != 1 else ''})")
+    node_text = ", ".join(parts) if parts else primary
+    answer = f"{workload} runs in {namespace}. Nodes: {node_text}."
+    return _format_confidence(answer, "medium")
+
+
+def _snapshot_metrics(snapshot: dict[str, Any] | None) -> dict[str, Any]:
+    if not snapshot:
+        return {}
+    metrics = snapshot.get("metrics")
+    return metrics if isinstance(metrics, dict) else {}
+
+
+def _node_usage_top(
+    usage: list[dict[str, Any]],
+    *,
+    allowed_nodes: set[str] | None,
+) -> tuple[str, float] | None:
+    best_node = ""
+    best_val = None
+    for item in usage if isinstance(usage, list) else []:
+        if not isinstance(item, dict):
+            continue
+        node = item.get("node") or ""
+        if allowed_nodes and node not in allowed_nodes:
+            continue
+        value = item.get("value")
+        try:
+            numeric = float(value)
+        except (TypeError, ValueError):
+            continue
+        if best_val is None or numeric > best_val:
+            best_val = numeric
+            best_node = node
+    if best_node and best_val is not None:
+        return best_node, best_val
+    return None
+
+
+def snapshot_metric_answer(
+    prompt: str,
+    *,
+    snapshot: dict[str, Any] | None,
+    inventory: list[dict[str, Any]],
+) -> str:
+    if not snapshot:
+        return ""
+    metrics = _snapshot_metrics(snapshot)
+    if not metrics:
+        return ""
+    q = normalize_query(prompt)
+    metric = _detect_metric(q)
+    op = _detect_operation(q)
+    include_hw, exclude_hw = _detect_hardware_filters(q)
+    nodes_in_query = _extract_titan_nodes(q)
+    only_workers = "worker" in q or "workers" in q
+
+    filtered = _inventory_filter(
+        inventory,
+        include_hw=include_hw,
+        exclude_hw=exclude_hw,
+        only_workers=only_workers,
+        only_ready=None,
+        nodes_in_query=nodes_in_query,
+    )
+    allowed_nodes = {node["name"] for node in filtered} if filtered else None
+
+    if metric in {"cpu", "ram", "net", "io"} and op in {"top", "status", None}:
+        usage = metrics.get("node_usage", {}).get(metric, [])
+        top = _node_usage_top(usage, allowed_nodes=allowed_nodes)
+        if top:
+            node, val = top
+            percent = metric in {"cpu", "ram"}
+            value = _format_metric_value(str(val), percent=percent)
+            scope = ""
+            if include_hw:
+                scope = f" among {' and '.join(sorted(include_hw))}"
+            answer = f"Hottest node{scope}: {node} ({value})."
+            return _format_confidence(answer, "high")
+
+    if metric == "connections" or "postgres" in q:
+        postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {}
+        used = postgres.get("used")
+        max_conn = postgres.get("max")
+        hottest = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {}
+        parts: list[str] = []
+        if used is not None and max_conn is not None:
+            parts.append(f"Postgres connections: {used:.0f} used / {max_conn:.0f} max.")
+        if hottest.get("label"):
+            hot_val = hottest.get("value")
+            hot_val_str = _format_metric_value(str(hot_val), percent=False) if hot_val is not None else ""
+            parts.append(f"Hottest DB: {hottest.get('label')} ({hot_val_str}).")
+        if parts:
+            return _format_confidence(" ".join(parts), "high")
+
+    return ""
+
+def structured_answer(
+    prompt: str,
+    *,
+    inventory: list[dict[str, Any]],
+    metrics_summary: str,
+    snapshot: dict[str, Any] | None = None,
+    workloads: list[dict[str, Any]] | None = None,
+) -> str:
     q = normalize_query(prompt)
     if not q:
         return ""
 
+    if workloads:
+        workload_resp = workload_answer(prompt, workloads)
+        if workload_resp:
+            return workload_resp
+
+    snap_resp = snapshot_metric_answer(prompt, snapshot=snapshot, inventory=inventory)
+    if snap_resp:
+        return snap_resp
+
     tokens = _tokens(q)
     op = _detect_operation(q)
     metric = _detect_metric(q)
@@ -749,11 +924,20 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s
     if op == "status":
         if "missing" in q and expected_workers:
             missing = sorted(set(expected_workers) - {n["name"] for n in inventory})
-            return "Missing nodes: " + (", ".join(missing) if missing else "none") + "."
+            return _format_confidence(
+                "Missing nodes: " + (", ".join(missing) if missing else "none") + ".",
+                "high",
+            )
         if only_ready is False:
-            return "Not ready nodes: " + (", ".join(names) if names else "none") + "."
+            return _format_confidence(
+                "Not ready nodes: " + (", ".join(names) if names else "none") + ".",
+                "high",
+            )
         if only_ready is True:
-            return f"Ready nodes ({len(names)}): " + (", ".join(names) if names else "none") + "."
+            return _format_confidence(
+                f"Ready nodes ({len(names)}): " + (", ".join(names) if names else "none") + ".",
+                "high",
+            )
 
     if op == "count":
         if expected_workers and ("expected" in q or "should" in q):
@@ -761,10 +945,10 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s
             msg = f"Grafana inventory expects {len(expected_workers)} worker nodes."
             if missing:
                 msg += f" Missing: {', '.join(missing)}."
-            return msg
+            return _format_confidence(msg, "high")
         if not (include_hw or exclude_hw or nodes_in_query or only_workers):
-            return f"Atlas has {len(names)} nodes."
-        return f"Matching nodes: {len(names)}."
+            return _format_confidence(f"Atlas has {len(names)} nodes.", "high")
+        return _format_confidence(f"Matching nodes: {len(names)}.", "high")
 
     if op == "list":
         if nodes_in_query:
@@ -772,12 +956,12 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s
             existing = {n["name"] for n in inventory}
             for node in nodes_in_query:
                 parts.append(f"{node}: {'present' if node in existing else 'not present'}")
-            return "Node presence: " + ", ".join(parts) + "."
+            return _format_confidence("Node presence: " + ", ".join(parts) + ".", "high")
         if not names:
-            return "Matching nodes: none."
+            return _format_confidence("Matching nodes: none.", "high")
         shown = names[:30]
         suffix = f", … (+{len(names) - 30} more)" if len(names) > 30 else ""
-        return "Matching nodes: " + ", ".join(shown) + suffix + "."
+        return _format_confidence("Matching nodes: " + ", ".join(shown) + suffix + ".", "high")
 
     return ""
 
@@ -922,6 +1106,58 @@ def _ariadne_state(timeout: int = 5) -> dict | None:
     except Exception:
         return None
 
+
+_SNAPSHOT_CACHE: dict[str, Any] = {"payload": None, "ts": 0.0}
+
+
+def _snapshot_state() -> dict[str, Any] | None:
+    now = time.monotonic()
+    cached = _SNAPSHOT_CACHE.get("payload")
+    ts = _SNAPSHOT_CACHE.get("ts") or 0.0
+    if cached and now - ts < max(5, SNAPSHOT_TTL_SEC):
+        return cached
+    payload = _ariadne_state(timeout=10)
+    if isinstance(payload, dict) and payload:
+        _SNAPSHOT_CACHE["payload"] = payload
+        _SNAPSHOT_CACHE["ts"] = now
+        return payload
+    return cached if isinstance(cached, dict) else None
+
+
+def _snapshot_inventory(snapshot: dict[str, Any] | None) -> list[dict[str, Any]]:
+    if not snapshot:
+        return []
+    items = snapshot.get("nodes_detail")
+    if not isinstance(items, list):
+        return []
+    inventory: list[dict[str, Any]] = []
+    for node in items:
+        if not isinstance(node, dict):
+            continue
+        labels = node.get("labels") if isinstance(node.get("labels"), dict) else {}
+        name = node.get("name") or ""
+        if not name:
+            continue
+        hardware = node.get("hardware") or _hardware_class(labels)
+        inventory.append(
+            {
+                "name": name,
+                "arch": node.get("arch") or labels.get("kubernetes.io/arch") or labels.get("beta.kubernetes.io/arch") or "",
+                "hardware": hardware,
+                "roles": node.get("roles") or [],
+                "is_worker": node.get("is_worker") is True,
+                "ready": node.get("ready") is True,
+            }
+        )
+    return sorted(inventory, key=lambda item: item["name"])
+
+
+def _snapshot_workloads(snapshot: dict[str, Any] | None) -> list[dict[str, Any]]:
+    if not snapshot:
+        return []
+    workloads = snapshot.get("workloads")
+    return workloads if isinstance(workloads, list) else []
+
 def k8s_pods(namespace: str) -> list[dict]:
     data = k8s_get(f"/api/v1/namespaces/{parse.quote(namespace)}/pods?limit=500")
     items = data.get("items") or []
@@ -1079,25 +1315,11 @@ def _node_is_worker(node: dict) -> bool:
         return True
     return True
 
-def worker_nodes_status() -> tuple[list[str], list[str]]:
-    try:
-        data = k8s_get("/api/v1/nodes?limit=500")
-    except Exception:
-        return ([], [])
-    items = data.get("items") or []
-    ready_nodes: list[str] = []
-    not_ready_nodes: list[str] = []
-    for node in items if isinstance(items, list) else []:
-        if not _node_is_worker(node):
-            continue
-        name = (node.get("metadata") or {}).get("name") or ""
-        if not name:
-            continue
-        ready = _node_ready_status(node)
-        if ready is True:
-            ready_nodes.append(name)
-        elif ready is False:
-            not_ready_nodes.append(name)
+def worker_nodes_status(inventory: list[dict[str, Any]] | None = None) -> tuple[list[str], list[str]]:
+    if inventory is None:
+        inventory = node_inventory()
+    ready_nodes = [n["name"] for n in inventory if n.get("is_worker") and n.get("ready") is True]
+    not_ready_nodes = [n["name"] for n in inventory if n.get("is_worker") and n.get("ready") is False]
     return (sorted(ready_nodes), sorted(not_ready_nodes))
 
 def expected_worker_nodes_from_metrics() -> list[str]:
@@ -1238,13 +1460,29 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
         if not prompt:
             self._write_json(400, {"error": "missing_prompt"})
             return
-        inventory = node_inventory_live()
-        answer = structured_answer(prompt, inventory=inventory, metrics_summary="")
+        snapshot = _snapshot_state()
+        inventory = _snapshot_inventory(snapshot) or node_inventory_live()
+        workloads = _snapshot_workloads(snapshot)
+        answer = structured_answer(
+            prompt,
+            inventory=inventory,
+            metrics_summary="",
+            snapshot=snapshot,
+            workloads=workloads,
+        )
         if not answer and _knowledge_intent(prompt):
             answer = knowledge_summary(prompt, inventory)
         if not answer:
             kb = kb_retrieve_titles(prompt, limit=4)
-            answer = kb or ""
+            context = build_context(
+                prompt,
+                allow_tools=False,
+                targets=[],
+                inventory=inventory,
+                snapshot=snapshot,
+            )
+            fallback = kb or "I don't have enough data to answer that."
+            answer = ollama_reply(("http", "internal"), prompt, context=context, fallback=fallback)
         self._write_json(200, {"answer": answer})
 
 
@@ -1266,6 +1504,7 @@ def build_context(
     allow_tools: bool,
     targets: list[tuple[str, str]],
     inventory: list[dict[str, Any]] | None = None,
+    snapshot: dict[str, Any] | None = None,
 ) -> str:
     parts: list[str] = []
 
@@ -1281,6 +1520,10 @@ def build_context(
     if node_ctx:
         parts.append(node_ctx)
 
+    snapshot_ctx = snapshot_context(prompt, snapshot)
+    if snapshot_ctx:
+        parts.append(snapshot_ctx)
+
     if allow_tools:
         # Scope pod summaries to relevant namespaces/workloads when possible.
         prefixes_by_ns: dict[str, set[str]] = collections.defaultdict(set)
@@ -1311,6 +1554,33 @@ def build_context(
     return "\n\n".join([p for p in parts if p]).strip()
 
 
+def snapshot_context(prompt: str, snapshot: dict[str, Any] | None) -> str:
+    if not snapshot:
+        return ""
+    metrics = _snapshot_metrics(snapshot)
+    workloads = _snapshot_workloads(snapshot)
+    q = normalize_query(prompt)
+    parts: list[str] = []
+    nodes = snapshot.get("nodes") if isinstance(snapshot.get("nodes"), dict) else {}
+    if nodes.get("total") is not None:
+        parts.append(
+            f"Snapshot: nodes_total={nodes.get('total')}, ready={nodes.get('ready')}, not_ready={nodes.get('not_ready')}."
+        )
+    if any(word in q for word in ("postgres", "connections", "db")):
+        postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {}
+        if postgres:
+            parts.append(f"Snapshot: postgres_connections={postgres}.")
+    if any(word in q for word in ("hottest", "cpu", "ram", "memory", "net", "network", "io", "disk")):
+        hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {}
+        if hottest:
+            parts.append(f"Snapshot: hottest_nodes={hottest}.")
+    if workloads and any(word in q for word in ("run", "running", "host", "node", "where", "which")):
+        match = _select_workload(prompt, workloads)
+        if match:
+            parts.append(f"Snapshot: workload={match}.")
+    return "\n".join(parts).strip()
+
+
 def _knowledge_intent(prompt: str) -> bool:
     q = normalize_query(prompt)
     return any(
@@ -1350,7 +1620,8 @@ def knowledge_summary(prompt: str, inventory: list[dict[str, Any]]) -> str:
     kb_titles = kb_retrieve_titles(prompt, limit=4)
     if kb_titles:
         parts.append(kb_titles)
-    return "\n".join(parts).strip()
+    summary = "\n".join(parts).strip()
+    return _format_confidence(summary, "medium") if summary else ""
 
 def _ollama_call(hist_key, prompt: str, *, context: str) -> str:
     system = (
@@ -1360,7 +1631,8 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str:
         "Never include or request secret values. "
         "Do not suggest commands unless explicitly asked. "
         "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. "
-        "If the answer is not grounded in the provided context or tool data, say you do not know."
+        "If the answer is not grounded in the provided context or tool data, say you do not know. "
+        "End every response with a line: 'Confidence: high|medium|low'."
     )
     transcript_parts = [system]
     if context:
@@ -1491,8 +1763,18 @@ def sync_loop(token: str, room_id: str):
                             if isinstance(w, dict) and w.get("name"):
                                 targets.append((ns, str(w["name"])))
 
+                snapshot = _snapshot_state()
                 inventory = node_inventory_for_prompt(body)
-                context = build_context(body, allow_tools=allow_tools, targets=targets, inventory=inventory)
+                if not inventory:
+                    inventory = _snapshot_inventory(snapshot)
+                workloads = _snapshot_workloads(snapshot)
+                context = build_context(
+                    body,
+                    allow_tools=allow_tools,
+                    targets=targets,
+                    inventory=inventory,
+                    snapshot=snapshot,
+                )
                 if allow_tools and promql:
                     res = vm_query(promql, timeout=20)
                     rendered = vm_render_result(res, limit=15) or "(no results)"
@@ -1506,7 +1788,13 @@ def sync_loop(token: str, room_id: str):
                 if not fallback and context:
                     fallback = _context_fallback(context)
 
-                structured = structured_answer(body, inventory=inventory, metrics_summary=metrics_fallback or "")
+                structured = structured_answer(
+                    body,
+                    inventory=inventory,
+                    metrics_summary=metrics_fallback or "",
+                    snapshot=snapshot,
+                    workloads=workloads,
+                )
                 if structured:
                     send_msg(token, rid, structured)
                     continue

From 13f9fd425821a5c2416ec2118f531d12eab3408a Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Tue, 27 Jan 2026 08:50:29 +0000
Subject: [PATCH 291/416] chore(maintenance): automated image update

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 1392855b..0f8cd2a0 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,7 +26,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-56 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-57 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From b73d4d6533ac36e49e97a7090c505517d524d52b Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Tue, 27 Jan 2026 09:00:40 +0000
Subject: [PATCH 292/416] chore(maintenance): automated image update

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index 0f8cd2a0..e4580aae 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,7 +26,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-57 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-58 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From 91d4da9397cdaf283c3a26776ec95fe3b8eff65d Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 06:28:03 -0300
Subject: [PATCH 293/416] atlasbot: shift to facts context and upgrade model

---
 services/ai-llm/deployment.yaml         |   4 +-
 services/comms/atlasbot-deployment.yaml |   4 +-
 services/comms/scripts/atlasbot/bot.py  | 203 +++++++++++++++++-------
 3 files changed, 151 insertions(+), 60 deletions(-)

diff --git a/services/ai-llm/deployment.yaml b/services/ai-llm/deployment.yaml
index 43d14c81..bf012c0b 100644
--- a/services/ai-llm/deployment.yaml
+++ b/services/ai-llm/deployment.yaml
@@ -20,7 +20,7 @@ spec:
       labels:
         app: ollama
       annotations:
-        ai.bstein.dev/model: qwen2.5:7b-instruct-q4_0
+        ai.bstein.dev/model: qwen2.5:14b-instruct-q4_0
         ai.bstein.dev/gpu: GPU pool (titan-22/24)
         ai.bstein.dev/restartedAt: "2026-01-26T12:00:00Z"
     spec:
@@ -52,7 +52,7 @@ spec:
             - name: OLLAMA_MODELS
               value: /root/.ollama
             - name: OLLAMA_MODEL
-              value: qwen2.5:7b-instruct-q4_0
+              value: qwen2.5:14b-instruct-q4_0
           command:
             - /bin/sh
             - -c
diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 0ee86f01..f4883c41 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-29
+        checksum/atlasbot-configmap: manual-atlasbot-30
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
@@ -82,7 +82,7 @@ spec:
             - name: OLLAMA_URL
               value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/
             - name: OLLAMA_MODEL
-              value: qwen2.5:7b-instruct-q4_0
+              value: qwen2.5:14b-instruct-q4_0
             - name: OLLAMA_TIMEOUT_SEC
               value: "600"
             - name: ATLASBOT_THINKING_INTERVAL_SEC
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 9f6c38dc..a91744dd 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -33,7 +33,10 @@ SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev")
 
 MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500"))
 MAX_TOOL_CHARS = int(os.environ.get("ATLASBOT_MAX_TOOL_CHARS", "2500"))
+MAX_FACTS_CHARS = int(os.environ.get("ATLASBOT_MAX_FACTS_CHARS", "8000"))
 THINKING_INTERVAL_SEC = int(os.environ.get("ATLASBOT_THINKING_INTERVAL_SEC", "120"))
+OLLAMA_RETRIES = int(os.environ.get("ATLASBOT_OLLAMA_RETRIES", "2"))
+OLLAMA_SERIALIZE = os.environ.get("ATLASBOT_OLLAMA_SERIALIZE", "true").lower() != "false"
 
 TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9_.-]{1,}", re.IGNORECASE)
 HOST_RE = re.compile(r"(?i)([a-z0-9-]+(?:\\.[a-z0-9-]+)+)")
@@ -113,6 +116,8 @@ METRIC_HINTS = {
     "connections": ("connections", "conn", "postgres", "database", "db"),
 }
 
+_OLLAMA_LOCK = threading.Lock()
+
 HARDWARE_HINTS = {
     "amd64": ("amd64", "x86", "x86_64", "x86-64"),
     "jetson": ("jetson",),
@@ -638,6 +643,105 @@ def node_inventory_for_prompt(prompt: str) -> list[dict[str, Any]]:
         return node_inventory()
     return []
 
+def _nodes_by_arch(inventory: list[dict[str, Any]]) -> dict[str, list[str]]:
+    grouped: dict[str, list[str]] = collections.defaultdict(list)
+    for node in inventory:
+        grouped[(node.get("arch") or "unknown")].append(node["name"])
+    return {k: sorted(v) for k, v in grouped.items()}
+
+def _node_usage_table(metrics: dict[str, Any]) -> list[dict[str, Any]]:
+    usage = metrics.get("node_usage") if isinstance(metrics.get("node_usage"), dict) else {}
+    per_node: dict[str, dict[str, Any]] = {}
+    for metric_name, entries in usage.items() if isinstance(usage, dict) else []:
+        if not isinstance(entries, list):
+            continue
+        for entry in entries:
+            if not isinstance(entry, dict):
+                continue
+            node = entry.get("node")
+            if not isinstance(node, str) or not node:
+                continue
+            per_node.setdefault(node, {})[metric_name] = entry.get("value")
+    return [{"node": node, **vals} for node, vals in sorted(per_node.items())]
+
+def _workloads_for_facts(workloads: list[dict[str, Any]], limit: int = 25) -> list[dict[str, Any]]:
+    cleaned: list[dict[str, Any]] = []
+    for entry in workloads:
+        if not isinstance(entry, dict):
+            continue
+        cleaned.append(
+            {
+                "namespace": entry.get("namespace"),
+                "workload": entry.get("workload"),
+                "pods_total": entry.get("pods_total"),
+                "pods_running": entry.get("pods_running"),
+                "primary_node": entry.get("primary_node"),
+                "nodes": entry.get("nodes"),
+            }
+        )
+    cleaned.sort(
+        key=lambda item: (
+            -(item.get("pods_total") or 0),
+            str(item.get("namespace") or ""),
+            str(item.get("workload") or ""),
+        )
+    )
+    return cleaned[:limit]
+
+def facts_context(
+    prompt: str,
+    *,
+    inventory: list[dict[str, Any]] | None,
+    snapshot: dict[str, Any] | None,
+    workloads: list[dict[str, Any]] | None,
+) -> str:
+    inv = inventory or []
+    metrics = _snapshot_metrics(snapshot)
+    nodes = snapshot.get("nodes") if isinstance(snapshot, dict) else {}
+    summary = snapshot.get("nodes_summary") if isinstance(snapshot, dict) else {}
+    expected_workers = expected_worker_nodes_from_metrics()
+    ready_workers, not_ready_workers = worker_nodes_status(inv) if inv else ([], [])
+
+    facts: dict[str, Any] = {
+        "generated_at": snapshot.get("generated_at") if isinstance(snapshot, dict) else None,
+        "nodes": {
+            "total": summary.get("total") if isinstance(summary, dict) and summary.get("total") is not None else nodes.get("total"),
+            "ready": summary.get("ready") if isinstance(summary, dict) and summary.get("ready") is not None else nodes.get("ready"),
+            "not_ready": summary.get("not_ready") if isinstance(summary, dict) and summary.get("not_ready") is not None else nodes.get("not_ready"),
+            "not_ready_names": summary.get("not_ready_names") if isinstance(summary, dict) else nodes.get("not_ready_names"),
+            "by_hardware": _group_nodes(inv) if inv else {},
+            "by_arch": _nodes_by_arch(inv) if inv else {},
+            "workers_ready": ready_workers,
+            "workers_not_ready": not_ready_workers,
+            "expected_workers": expected_workers,
+        },
+        "metrics": {
+            "hottest_nodes": metrics.get("hottest_nodes") if isinstance(metrics, dict) else {},
+            "postgres_connections": metrics.get("postgres_connections") if isinstance(metrics, dict) else {},
+            "node_usage": _node_usage_table(metrics) if isinstance(metrics, dict) else [],
+        },
+        "workloads": _workloads_for_facts(workloads or []),
+    }
+
+    rendered = json.dumps(facts, ensure_ascii=False)
+    if len(rendered) <= MAX_FACTS_CHARS:
+        return "Facts (live snapshot):\n" + rendered
+
+    trimmed = dict(facts)
+    trimmed.pop("workloads", None)
+    rendered = json.dumps(trimmed, ensure_ascii=False)
+    if len(rendered) <= MAX_FACTS_CHARS:
+        return "Facts (live snapshot):\n" + rendered
+
+    trimmed_metrics = dict(trimmed.get("metrics") or {})
+    trimmed_metrics.pop("node_usage", None)
+    trimmed["metrics"] = trimmed_metrics
+    rendered = json.dumps(trimmed, ensure_ascii=False)
+    if len(rendered) <= MAX_FACTS_CHARS:
+        return "Facts (live snapshot):\n" + rendered
+
+    return "Facts (live snapshot):\n" + rendered[:MAX_FACTS_CHARS]
+
 def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]:
     names = [node["name"] for node in inventory]
     ready = [node["name"] for node in inventory if node.get("ready") is True]
@@ -1463,26 +1567,19 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
         snapshot = _snapshot_state()
         inventory = _snapshot_inventory(snapshot) or node_inventory_live()
         workloads = _snapshot_workloads(snapshot)
-        answer = structured_answer(
+        context = build_context(
             prompt,
+            allow_tools=False,
+            targets=[],
             inventory=inventory,
-            metrics_summary="",
             snapshot=snapshot,
             workloads=workloads,
         )
-        if not answer and _knowledge_intent(prompt):
-            answer = knowledge_summary(prompt, inventory)
-        if not answer:
-            kb = kb_retrieve_titles(prompt, limit=4)
-            context = build_context(
-                prompt,
-                allow_tools=False,
-                targets=[],
-                inventory=inventory,
-                snapshot=snapshot,
-            )
-            fallback = kb or "I don't have enough data to answer that."
-            answer = ollama_reply(("http", "internal"), prompt, context=context, fallback=fallback)
+        metrics_context, metrics_fallback = metrics_query_context(prompt, allow_tools=True)
+        if metrics_context:
+            context = (context + "\n\n" + metrics_context).strip() if context else metrics_context
+        fallback = metrics_fallback or _context_fallback(context) or "I don't have enough data to answer that."
+        answer = ollama_reply(("http", "internal"), prompt, context=context, fallback=fallback)
         self._write_json(200, {"answer": answer})
 
 
@@ -1505,10 +1602,13 @@ def build_context(
     targets: list[tuple[str, str]],
     inventory: list[dict[str, Any]] | None = None,
     snapshot: dict[str, Any] | None = None,
+    workloads: list[dict[str, Any]] | None = None,
 ) -> str:
     parts: list[str] = []
 
     kb = kb_retrieve(prompt)
+    if not kb and _knowledge_intent(prompt):
+        kb = kb_retrieve_titles(prompt, limit=4)
     if kb:
         parts.append(kb)
 
@@ -1516,13 +1616,9 @@ def build_context(
     if endpoints:
         parts.append(endpoints)
 
-    node_ctx = node_inventory_context(prompt, inventory)
-    if node_ctx:
-        parts.append(node_ctx)
-
-    snapshot_ctx = snapshot_context(prompt, snapshot)
-    if snapshot_ctx:
-        parts.append(snapshot_ctx)
+    facts = facts_context(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads)
+    if facts:
+        parts.append(facts)
 
     if allow_tools:
         # Scope pod summaries to relevant namespaces/workloads when possible.
@@ -1627,7 +1723,9 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str:
     system = (
         "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. "
         "Be helpful, direct, and concise. "
-        "Prefer answering with exact repo paths and Kubernetes resource names. "
+        "Use the provided context and facts as your source of truth. "
+        "If you infer or synthesize, say 'Based on the snapshot' and keep it brief. "
+        "Prefer exact repo paths and Kubernetes resource names when relevant. "
         "Never include or request secret values. "
         "Do not suggest commands unless explicitly asked. "
         "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. "
@@ -1646,21 +1744,32 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str:
     if API_KEY:
         headers["x-api-key"] = API_KEY
     r = request.Request(OLLAMA_URL, data=json.dumps(payload).encode(), headers=headers)
-    with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp:
-        data = json.loads(resp.read().decode())
-        raw_reply = data.get("message") or data.get("response") or data.get("reply") or data
-        reply = _normalize_reply(raw_reply) or "I'm here to help."
-    history[hist_key].append(f"Atlas: {reply}")
-    return reply
+    lock = _OLLAMA_LOCK if OLLAMA_SERIALIZE else None
+    if lock:
+        lock.acquire()
+    try:
+        with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp:
+            data = json.loads(resp.read().decode())
+            raw_reply = data.get("message") or data.get("response") or data.get("reply") or data
+            reply = _normalize_reply(raw_reply) or "I'm here to help."
+        history[hist_key].append(f"Atlas: {reply}")
+        return reply
+    finally:
+        if lock:
+            lock.release()
 
 def ollama_reply(hist_key, prompt: str, *, context: str, fallback: str = "") -> str:
-    try:
-        return _ollama_call(hist_key, prompt, context=context)
-    except Exception:
-        if fallback:
-            history[hist_key].append(f"Atlas: {fallback}")
-            return fallback
-        return "Model backend is busy. Try again in a moment."
+    last_error = None
+    for attempt in range(max(1, OLLAMA_RETRIES + 1)):
+        try:
+            return _ollama_call(hist_key, prompt, context=context)
+        except Exception as exc:  # noqa: BLE001
+            last_error = exc
+            time.sleep(min(4, 2 ** attempt))
+    if fallback:
+        history[hist_key].append(f"Atlas: {fallback}")
+        return fallback
+    return "I don't have enough data to answer that."
 
 def ollama_reply_with_thinking(token: str, room: str, hist_key, prompt: str, *, context: str, fallback: str) -> str:
     result: dict[str, str] = {"reply": ""}
@@ -1774,6 +1883,7 @@ def sync_loop(token: str, room_id: str):
                     targets=targets,
                     inventory=inventory,
                     snapshot=snapshot,
+                    workloads=workloads,
                 )
                 if allow_tools and promql:
                     res = vm_query(promql, timeout=20)
@@ -1784,26 +1894,7 @@ def sync_loop(token: str, room_id: str):
                 if metrics_context:
                     context = (context + "\n\n" + metrics_context).strip() if context else metrics_context
 
-                fallback = metrics_fallback or ""
-                if not fallback and context:
-                    fallback = _context_fallback(context)
-
-                structured = structured_answer(
-                    body,
-                    inventory=inventory,
-                    metrics_summary=metrics_fallback or "",
-                    snapshot=snapshot,
-                    workloads=workloads,
-                )
-                if structured:
-                    send_msg(token, rid, structured)
-                    continue
-
-                if _knowledge_intent(body):
-                    summary = knowledge_summary(body, inventory)
-                    if summary:
-                        send_msg(token, rid, summary)
-                        continue
+                fallback = metrics_fallback or _context_fallback(context) or "I don't have enough data to answer that."
 
                 reply = ollama_reply_with_thinking(
                     token,

From 70b313ce1e8e57ef1b25e35aef4421bd142b76e6 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 06:34:37 -0300
Subject: [PATCH 294/416] atlasbot: enrich facts summary for LLM

---
 services/comms/atlasbot-deployment.yaml |  2 +-
 services/comms/scripts/atlasbot/bot.py  | 46 +++++++++++++++++++++----
 2 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index f4883c41..377a076e 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-30
+        checksum/atlasbot-configmap: manual-atlasbot-31
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index a91744dd..3f055292 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -723,24 +723,55 @@ def facts_context(
         "workloads": _workloads_for_facts(workloads or []),
     }
 
+    summary_lines: list[str] = []
+    nodes_info = facts.get("nodes") if isinstance(facts.get("nodes"), dict) else {}
+    if nodes_info.get("total") is not None:
+        summary_lines.append(
+            f"nodes_total={nodes_info.get('total')}, ready={nodes_info.get('ready')}, not_ready={nodes_info.get('not_ready')}"
+        )
+    hottest = facts.get("metrics", {}).get("hottest_nodes") if isinstance(facts.get("metrics"), dict) else {}
+    if isinstance(hottest, dict) and hottest:
+        for key in ("cpu", "ram", "net", "io"):
+            entry = hottest.get(key) if isinstance(hottest.get(key), dict) else {}
+            node = entry.get("node")
+            value = entry.get("value")
+            if node and value is not None:
+                summary_lines.append(f"hottest_{key}={node} ({value})")
+    postgres = facts.get("metrics", {}).get("postgres_connections") if isinstance(facts.get("metrics"), dict) else {}
+    if isinstance(postgres, dict) and postgres:
+        used = postgres.get("used")
+        max_conn = postgres.get("max")
+        if used is not None and max_conn is not None:
+            summary_lines.append(f"postgres_used={used}, postgres_max={max_conn}")
+        hottest_db = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {}
+        if hottest_db.get("label"):
+            summary_lines.append(f"postgres_hottest_db={hottest_db.get('label')} ({hottest_db.get('value')})")
+
     rendered = json.dumps(facts, ensure_ascii=False)
-    if len(rendered) <= MAX_FACTS_CHARS:
-        return "Facts (live snapshot):\n" + rendered
+    rendered_parts = []
+    if summary_lines:
+        rendered_parts.append("Facts summary:\n" + "\n".join(f"- {line}" for line in summary_lines))
+    rendered_parts.append("Facts (live snapshot JSON):\n" + rendered)
+    combined = "\n\n".join(rendered_parts)
+    if len(combined) <= MAX_FACTS_CHARS:
+        return combined
 
     trimmed = dict(facts)
     trimmed.pop("workloads", None)
     rendered = json.dumps(trimmed, ensure_ascii=False)
-    if len(rendered) <= MAX_FACTS_CHARS:
-        return "Facts (live snapshot):\n" + rendered
+    combined = "\n\n".join(rendered_parts[:-1] + ["Facts (live snapshot JSON):\n" + rendered])
+    if len(combined) <= MAX_FACTS_CHARS:
+        return combined
 
     trimmed_metrics = dict(trimmed.get("metrics") or {})
     trimmed_metrics.pop("node_usage", None)
     trimmed["metrics"] = trimmed_metrics
     rendered = json.dumps(trimmed, ensure_ascii=False)
-    if len(rendered) <= MAX_FACTS_CHARS:
-        return "Facts (live snapshot):\n" + rendered
+    combined = "\n\n".join(rendered_parts[:-1] + ["Facts (live snapshot JSON):\n" + rendered])
+    if len(combined) <= MAX_FACTS_CHARS:
+        return combined
 
-    return "Facts (live snapshot):\n" + rendered[:MAX_FACTS_CHARS]
+    return combined[:MAX_FACTS_CHARS]
 
 def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]:
     names = [node["name"] for node in inventory]
@@ -1724,6 +1755,7 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str:
         "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. "
         "Be helpful, direct, and concise. "
         "Use the provided context and facts as your source of truth. "
+        "Treat 'hottest' as highest utilization (CPU/RAM/NET/IO) rather than temperature. "
         "If you infer or synthesize, say 'Based on the snapshot' and keep it brief. "
         "Prefer exact repo paths and Kubernetes resource names when relevant. "
         "Never include or request secret values. "

From a8ea436fcff2c3087c5f4a7776597a1db602b5e2 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 06:45:18 -0300
Subject: [PATCH 295/416] atlasbot: shrink facts context to avoid truncation

---
 services/comms/atlasbot-deployment.yaml |   2 +-
 services/comms/scripts/atlasbot/bot.py  | 148 ++++++++++++++----------
 2 files changed, 89 insertions(+), 61 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 377a076e..7cb2d7da 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-31
+        checksum/atlasbot-configmap: manual-atlasbot-32
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 3f055292..9e8e0ddd 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -688,6 +688,20 @@ def _workloads_for_facts(workloads: list[dict[str, Any]], limit: int = 25) -> li
     )
     return cleaned[:limit]
 
+def _workloads_for_prompt(prompt: str, workloads: list[dict[str, Any]], limit: int = 12) -> list[dict[str, Any]]:
+    tokens = set(_tokens(prompt))
+    if tokens:
+        matched: list[dict[str, Any]] = []
+        for entry in workloads:
+            if not isinstance(entry, dict):
+                continue
+            entry_tokens = _workload_tokens(entry)
+            if entry_tokens & tokens:
+                matched.append(entry)
+        if matched:
+            return _workloads_for_facts(matched, limit=limit)
+    return _workloads_for_facts(workloads, limit=limit)
+
 def facts_context(
     prompt: str,
     *,
@@ -701,77 +715,91 @@ def facts_context(
     summary = snapshot.get("nodes_summary") if isinstance(snapshot, dict) else {}
     expected_workers = expected_worker_nodes_from_metrics()
     ready_workers, not_ready_workers = worker_nodes_status(inv) if inv else ([], [])
+    total = summary.get("total") if isinstance(summary, dict) and summary.get("total") is not None else nodes.get("total")
+    ready = summary.get("ready") if isinstance(summary, dict) and summary.get("ready") is not None else nodes.get("ready")
+    not_ready = summary.get("not_ready") if isinstance(summary, dict) and summary.get("not_ready") is not None else nodes.get("not_ready")
+    not_ready_names = summary.get("not_ready_names") if isinstance(summary, dict) else nodes.get("not_ready_names")
+    by_hardware = _group_nodes(inv) if inv else {}
+    by_arch = _nodes_by_arch(inv) if inv else {}
 
-    facts: dict[str, Any] = {
-        "generated_at": snapshot.get("generated_at") if isinstance(snapshot, dict) else None,
-        "nodes": {
-            "total": summary.get("total") if isinstance(summary, dict) and summary.get("total") is not None else nodes.get("total"),
-            "ready": summary.get("ready") if isinstance(summary, dict) and summary.get("ready") is not None else nodes.get("ready"),
-            "not_ready": summary.get("not_ready") if isinstance(summary, dict) and summary.get("not_ready") is not None else nodes.get("not_ready"),
-            "not_ready_names": summary.get("not_ready_names") if isinstance(summary, dict) else nodes.get("not_ready_names"),
-            "by_hardware": _group_nodes(inv) if inv else {},
-            "by_arch": _nodes_by_arch(inv) if inv else {},
-            "workers_ready": ready_workers,
-            "workers_not_ready": not_ready_workers,
-            "expected_workers": expected_workers,
-        },
-        "metrics": {
-            "hottest_nodes": metrics.get("hottest_nodes") if isinstance(metrics, dict) else {},
-            "postgres_connections": metrics.get("postgres_connections") if isinstance(metrics, dict) else {},
-            "node_usage": _node_usage_table(metrics) if isinstance(metrics, dict) else [],
-        },
-        "workloads": _workloads_for_facts(workloads or []),
-    }
-
-    summary_lines: list[str] = []
-    nodes_info = facts.get("nodes") if isinstance(facts.get("nodes"), dict) else {}
-    if nodes_info.get("total") is not None:
-        summary_lines.append(
-            f"nodes_total={nodes_info.get('total')}, ready={nodes_info.get('ready')}, not_ready={nodes_info.get('not_ready')}"
+    lines: list[str] = ["Facts (live snapshot):"]
+    if total is not None:
+        lines.append(f"- nodes_total={total}, ready={ready}, not_ready={not_ready}")
+    if not_ready_names:
+        lines.append(f"- nodes_not_ready: {', '.join(not_ready_names)}")
+    for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"):
+        nodes_list = by_hardware.get(key) or []
+        if nodes_list:
+            lines.append(f"- {key}: {', '.join(nodes_list)}")
+    for key, nodes_list in sorted(by_arch.items()):
+        if nodes_list:
+            lines.append(f"- arch {key}: {', '.join(nodes_list)}")
+    if ready_workers or not_ready_workers:
+        lines.append(f"- workers_ready: {', '.join(ready_workers) if ready_workers else 'none'}")
+        if not_ready_workers:
+            lines.append(f"- workers_not_ready: {', '.join(not_ready_workers)}")
+    if expected_workers:
+        missing = sorted(
+            set(expected_workers)
+            - {n.get("name") for n in inv if isinstance(n, dict) and n.get("name")}
         )
-    hottest = facts.get("metrics", {}).get("hottest_nodes") if isinstance(facts.get("metrics"), dict) else {}
-    if isinstance(hottest, dict) and hottest:
-        for key in ("cpu", "ram", "net", "io"):
-            entry = hottest.get(key) if isinstance(hottest.get(key), dict) else {}
-            node = entry.get("node")
-            value = entry.get("value")
-            if node and value is not None:
-                summary_lines.append(f"hottest_{key}={node} ({value})")
-    postgres = facts.get("metrics", {}).get("postgres_connections") if isinstance(facts.get("metrics"), dict) else {}
+        lines.append(f"- expected_workers: {', '.join(expected_workers)}")
+        if missing:
+            lines.append(f"- expected_workers_missing: {', '.join(missing)}")
+
+    hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {}
+    for key in ("cpu", "ram", "net", "io"):
+        entry = hottest.get(key) if isinstance(hottest.get(key), dict) else {}
+        node = entry.get("node")
+        value = entry.get("value")
+        if node and value is not None:
+            lines.append(f"- hottest_{key}: {node} ({value})")
+
+    postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {}
     if isinstance(postgres, dict) and postgres:
         used = postgres.get("used")
         max_conn = postgres.get("max")
         if used is not None and max_conn is not None:
-            summary_lines.append(f"postgres_used={used}, postgres_max={max_conn}")
+            lines.append(f"- postgres_connections: {used} used / {max_conn} max")
         hottest_db = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {}
         if hottest_db.get("label"):
-            summary_lines.append(f"postgres_hottest_db={hottest_db.get('label')} ({hottest_db.get('value')})")
+            lines.append(
+                f"- postgres_hottest_db: {hottest_db.get('label')} ({hottest_db.get('value')})"
+            )
 
-    rendered = json.dumps(facts, ensure_ascii=False)
-    rendered_parts = []
-    if summary_lines:
-        rendered_parts.append("Facts summary:\n" + "\n".join(f"- {line}" for line in summary_lines))
-    rendered_parts.append("Facts (live snapshot JSON):\n" + rendered)
-    combined = "\n\n".join(rendered_parts)
-    if len(combined) <= MAX_FACTS_CHARS:
-        return combined
+    usage_table = _node_usage_table(metrics)
+    if usage_table:
+        lines.append("- node_usage (cpu/ram/net/io):")
+        for entry in usage_table:
+            node = entry.get("node")
+            if not node:
+                continue
+            cpu = entry.get("cpu")
+            ram = entry.get("ram")
+            net = entry.get("net")
+            io_val = entry.get("io")
+            lines.append(f"  - {node}: cpu={cpu}, ram={ram}, net={net}, io={io_val}")
 
-    trimmed = dict(facts)
-    trimmed.pop("workloads", None)
-    rendered = json.dumps(trimmed, ensure_ascii=False)
-    combined = "\n\n".join(rendered_parts[:-1] + ["Facts (live snapshot JSON):\n" + rendered])
-    if len(combined) <= MAX_FACTS_CHARS:
-        return combined
+    workload_entries = _workloads_for_prompt(prompt, workloads or [])
+    if workload_entries:
+        lines.append("- workloads:")
+        for entry in workload_entries:
+            if not isinstance(entry, dict):
+                continue
+            ns = entry.get("namespace") or ""
+            wl = entry.get("workload") or ""
+            primary = entry.get("primary_node") or ""
+            pods_total = entry.get("pods_total")
+            label = f"{ns}/{wl}" if ns and wl else (wl or ns)
+            if not label:
+                continue
+            if primary:
+                lines.append(f"  - {label}: primary_node={primary}, pods_total={pods_total}")
+            else:
+                lines.append(f"  - {label}: pods_total={pods_total}")
 
-    trimmed_metrics = dict(trimmed.get("metrics") or {})
-    trimmed_metrics.pop("node_usage", None)
-    trimmed["metrics"] = trimmed_metrics
-    rendered = json.dumps(trimmed, ensure_ascii=False)
-    combined = "\n\n".join(rendered_parts[:-1] + ["Facts (live snapshot JSON):\n" + rendered])
-    if len(combined) <= MAX_FACTS_CHARS:
-        return combined
-
-    return combined[:MAX_FACTS_CHARS]
+    rendered = "\n".join(lines)
+    return rendered[:MAX_FACTS_CHARS]
 
 def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]:
     names = [node["name"] for node in inventory]

From 8f05dc9b0261a679c617ab2c3d99490d376dd85b Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 11:03:55 -0300
Subject: [PATCH 296/416] atlasbot: strengthen facts context and replies

---
 services/comms/scripts/atlasbot/bot.py | 91 +++++++++++++++++++-------
 1 file changed, 68 insertions(+), 23 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 9e8e0ddd..e0056f8a 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -34,6 +34,7 @@ SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev")
 MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500"))
 MAX_TOOL_CHARS = int(os.environ.get("ATLASBOT_MAX_TOOL_CHARS", "2500"))
 MAX_FACTS_CHARS = int(os.environ.get("ATLASBOT_MAX_FACTS_CHARS", "8000"))
+MAX_CONTEXT_CHARS = int(os.environ.get("ATLASBOT_MAX_CONTEXT_CHARS", "12000"))
 THINKING_INTERVAL_SEC = int(os.environ.get("ATLASBOT_THINKING_INTERVAL_SEC", "120"))
 OLLAMA_RETRIES = int(os.environ.get("ATLASBOT_OLLAMA_RETRIES", "2"))
 OLLAMA_SERIALIZE = os.environ.get("ATLASBOT_OLLAMA_SERIALIZE", "true").lower() != "false"
@@ -100,6 +101,7 @@ CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL)
 TITAN_NODE_RE = re.compile(r"\btitan-[0-9a-z]{2}\b", re.IGNORECASE)
 TITAN_RANGE_RE = re.compile(r"\btitan-([0-9a-z]{2})/([0-9a-z]{2})\b", re.IGNORECASE)
 _DASH_CHARS = "\u2010\u2011\u2012\u2013\u2014\u2015\u2212\uFE63\uFF0D"
+CONFIDENCE_RE = re.compile(r"confidence\s*:\s*(high|medium|low)\b", re.IGNORECASE)
 
 OPERATION_HINTS = {
     "count": ("how many", "count", "number", "total"),
@@ -139,6 +141,20 @@ def _tokens(text: str) -> list[str]:
     return [t for t in toks if t not in STOPWORDS and len(t) >= 2]
 
 
+def _ensure_confidence(text: str) -> str:
+    if not text:
+        return ""
+    lines = text.strip().splitlines()
+    for idx, line in enumerate(lines):
+        match = CONFIDENCE_RE.search(line)
+        if match:
+            level = match.group(1).lower()
+            lines[idx] = f"Confidence: {level}"
+            return "\n".join(lines)
+    lines.append("Confidence: medium")
+    return "\n".join(lines)
+
+
 # Mention detection (Matrix rich mentions + plain @atlas).
 MENTION_TOKENS = [m.strip() for m in BOT_MENTIONS.split(",") if m.strip()]
 MENTION_LOCALPARTS = [m.lstrip("@").split(":", 1)[0] for m in MENTION_TOKENS]
@@ -710,6 +726,7 @@ def facts_context(
     workloads: list[dict[str, Any]] | None,
 ) -> str:
     inv = inventory or []
+    nodes_in_query = _extract_titan_nodes(prompt)
     metrics = _snapshot_metrics(snapshot)
     nodes = snapshot.get("nodes") if isinstance(snapshot, dict) else {}
     summary = snapshot.get("nodes_summary") if isinstance(snapshot, dict) else {}
@@ -721,6 +738,12 @@ def facts_context(
     not_ready_names = summary.get("not_ready_names") if isinstance(summary, dict) else nodes.get("not_ready_names")
     by_hardware = _group_nodes(inv) if inv else {}
     by_arch = _nodes_by_arch(inv) if inv else {}
+    control_plane_nodes = [
+        node["name"]
+        for node in inv
+        if any(role in ("control-plane", "master") for role in (node.get("roles") or []))
+    ]
+    worker_nodes = [node["name"] for node in inv if node.get("is_worker") is True]
 
     lines: list[str] = ["Facts (live snapshot):"]
     if total is not None:
@@ -731,9 +754,16 @@ def facts_context(
         nodes_list = by_hardware.get(key) or []
         if nodes_list:
             lines.append(f"- {key}: {', '.join(nodes_list)}")
+    non_rpi = sorted(set(by_hardware.get("jetson", [])) | set(by_hardware.get("amd64", [])))
+    if non_rpi:
+        lines.append(f"- non_raspberry_pi: {', '.join(non_rpi)}")
     for key, nodes_list in sorted(by_arch.items()):
         if nodes_list:
             lines.append(f"- arch {key}: {', '.join(nodes_list)}")
+    if control_plane_nodes:
+        lines.append(f"- control_plane_nodes: {', '.join(control_plane_nodes)}")
+    if worker_nodes:
+        lines.append(f"- worker_nodes: {', '.join(worker_nodes)}")
     if ready_workers or not_ready_workers:
         lines.append(f"- workers_ready: {', '.join(ready_workers) if ready_workers else 'none'}")
         if not_ready_workers:
@@ -753,7 +783,8 @@ def facts_context(
         node = entry.get("node")
         value = entry.get("value")
         if node and value is not None:
-            lines.append(f"- hottest_{key}: {node} ({value})")
+            value_fmt = _format_metric_value(str(value), percent=key in ("cpu", "ram"))
+            lines.append(f"- hottest_{key}: {node} ({value_fmt})")
 
     postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {}
     if isinstance(postgres, dict) and postgres:
@@ -774,12 +805,25 @@ def facts_context(
             node = entry.get("node")
             if not node:
                 continue
-            cpu = entry.get("cpu")
-            ram = entry.get("ram")
-            net = entry.get("net")
-            io_val = entry.get("io")
+            cpu = _format_metric_value(str(entry.get("cpu")), percent=True) if entry.get("cpu") is not None else ""
+            ram = _format_metric_value(str(entry.get("ram")), percent=True) if entry.get("ram") is not None else ""
+            net = _format_metric_value(str(entry.get("net")), percent=False) if entry.get("net") is not None else ""
+            io_val = _format_metric_value(str(entry.get("io")), percent=False) if entry.get("io") is not None else ""
             lines.append(f"  - {node}: cpu={cpu}, ram={ram}, net={net}, io={io_val}")
 
+    if nodes_in_query:
+        lines.append("- node_details:")
+        for name in nodes_in_query:
+            detail = next((n for n in inv if n.get("name") == name), None)
+            if not detail:
+                lines.append(f"  - {name}: not found in snapshot")
+                continue
+            roles = ",".join(detail.get("roles") or []) or "none"
+            lines.append(
+                f"  - {name}: hardware={detail.get('hardware')}, arch={detail.get('arch')}, "
+                f"ready={detail.get('ready')}, roles={roles}"
+            )
+
     workload_entries = _workloads_for_prompt(prompt, workloads or [])
     if workload_entries:
         lines.append("- workloads:")
@@ -1181,11 +1225,10 @@ def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]:
         if rendered:
             rendered_parts.append(rendered)
     if not rendered_parts:
-        return "", f"{panel}: matched dashboard panel but VictoriaMetrics did not return data."
+        return "", ""
     summary = "\n".join(rendered_parts)
     context = f"Metrics (from {dashboard} / {panel}):\n{summary}"
-    fallback = _metrics_fallback_summary(panel, summary)
-    return context, fallback
+    return context, ""
 
 def catalog_hints(query: str) -> tuple[str, list[tuple[str, str]]]:
     q = (query or "").strip()
@@ -1574,8 +1617,8 @@ def _normalize_reply(value: Any) -> str:
         try:
             return _normalize_reply(json.loads(text))
         except Exception:
-            return text
-    return text
+            return _ensure_confidence(text)
+    return _ensure_confidence(text)
 
 
 # Internal HTTP endpoint for cluster answers (website uses this).
@@ -1634,10 +1677,10 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
             snapshot=snapshot,
             workloads=workloads,
         )
-        metrics_context, metrics_fallback = metrics_query_context(prompt, allow_tools=True)
+        metrics_context, _metrics_fallback = metrics_query_context(prompt, allow_tools=True)
         if metrics_context:
             context = (context + "\n\n" + metrics_context).strip() if context else metrics_context
-        fallback = metrics_fallback or _context_fallback(context) or "I don't have enough data to answer that."
+        fallback = "I don't have enough data to answer that."
         answer = ollama_reply(("http", "internal"), prompt, context=context, fallback=fallback)
         self._write_json(200, {"answer": answer})
 
@@ -1665,19 +1708,19 @@ def build_context(
 ) -> str:
     parts: list[str] = []
 
-    kb = kb_retrieve(prompt)
-    if not kb and _knowledge_intent(prompt):
-        kb = kb_retrieve_titles(prompt, limit=4)
-    if kb:
-        parts.append(kb)
+    facts = facts_context(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads)
+    if facts:
+        parts.append(facts)
 
     endpoints, edges = catalog_hints(prompt)
     if endpoints:
         parts.append(endpoints)
 
-    facts = facts_context(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads)
-    if facts:
-        parts.append(facts)
+    kb = kb_retrieve(prompt)
+    if not kb and _knowledge_intent(prompt):
+        kb = kb_retrieve_titles(prompt, limit=4)
+    if kb:
+        parts.append(kb)
 
     if allow_tools:
         # Scope pod summaries to relevant namespaces/workloads when possible.
@@ -1789,12 +1832,14 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str:
         "Never include or request secret values. "
         "Do not suggest commands unless explicitly asked. "
         "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. "
+        "Translate metrics into natural language instead of echoing raw label/value pairs. "
+        "Do not answer by only listing runbooks; summarize the cluster first and mention docs only if useful. "
         "If the answer is not grounded in the provided context or tool data, say you do not know. "
         "End every response with a line: 'Confidence: high|medium|low'."
     )
     transcript_parts = [system]
     if context:
-        transcript_parts.append("Context (grounded):\n" + context[:MAX_KB_CHARS])
+        transcript_parts.append("Context (grounded):\n" + context[:MAX_CONTEXT_CHARS])
     transcript_parts.extend(history[hist_key][-24:])
     transcript_parts.append(f"User: {prompt}")
     transcript = "\n".join(transcript_parts)
@@ -1950,11 +1995,11 @@ def sync_loop(token: str, room_id: str):
                     rendered = vm_render_result(res, limit=15) or "(no results)"
                     extra = "VictoriaMetrics (PromQL result):\n" + rendered
                     context = (context + "\n\n" + extra).strip() if context else extra
-                metrics_context, metrics_fallback = metrics_query_context(body, allow_tools=allow_metrics)
+                metrics_context, _metrics_fallback = metrics_query_context(body, allow_tools=allow_metrics)
                 if metrics_context:
                     context = (context + "\n\n" + metrics_context).strip() if context else metrics_context
 
-                fallback = metrics_fallback or _context_fallback(context) or "I don't have enough data to answer that."
+                fallback = "I don't have enough data to answer that."
 
                 reply = ollama_reply_with_thinking(
                     token,

From 677230ebeb5fc1d1c6f58c755cccb60f40a48803 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 11:05:30 -0300
Subject: [PATCH 297/416] comms: bump atlasbot configmap checksum

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 7cb2d7da..93b5108f 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-32
+        checksum/atlasbot-configmap: manual-atlasbot-33
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From 3efbe161ac98c62bd8a5d7c6c382e47044c8d64d Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 12:20:50 -0300
Subject: [PATCH 298/416] comms: point atlasbot to ollama and raise gateway
 memory

---
 services/bstein-dev-home/chat-ai-gateway-deployment.yaml | 4 ++--
 services/comms/atlasbot-deployment.yaml                  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml
index 7209da62..e5724067 100644
--- a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml
+++ b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml
@@ -67,10 +67,10 @@ spec:
           resources:
             requests:
               cpu: 20m
-              memory: 64Mi
+              memory: 128Mi
             limits:
               cpu: 200m
-              memory: 256Mi
+              memory: 512Mi
           volumeMounts:
             - name: code
               mountPath: /app/gateway.py
diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 93b5108f..d41f97cf 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -80,7 +80,7 @@ spec:
             - name: BOT_MENTIONS
               value: atlasbot,aatlasbot
             - name: OLLAMA_URL
-              value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/
+              value: http://ollama.ai.svc.cluster.local:11434/
             - name: OLLAMA_MODEL
               value: qwen2.5:14b-instruct-q4_0
             - name: OLLAMA_TIMEOUT_SEC

From 9e1b2997ce94a2461018a24f731136c4f27e190c Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 12:23:05 -0300
Subject: [PATCH 299/416] comms: restore atlasbot gateway URL

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index d41f97cf..93b5108f 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -80,7 +80,7 @@ spec:
             - name: BOT_MENTIONS
               value: atlasbot,aatlasbot
             - name: OLLAMA_URL
-              value: http://ollama.ai.svc.cluster.local:11434/
+              value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/
             - name: OLLAMA_MODEL
               value: qwen2.5:14b-instruct-q4_0
             - name: OLLAMA_TIMEOUT_SEC

From 612f71c5c4e75fac897e67cfd17c9e26533e510d Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 12:33:56 -0300
Subject: [PATCH 300/416] atlasbot: call ollama chat directly

---
 services/comms/atlasbot-deployment.yaml |  4 +-
 services/comms/scripts/atlasbot/bot.py  | 55 +++++++++++++++++++++----
 2 files changed, 49 insertions(+), 10 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 93b5108f..7ec373fd 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-33
+        checksum/atlasbot-configmap: manual-atlasbot-34
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
@@ -80,7 +80,7 @@ spec:
             - name: BOT_MENTIONS
               value: atlasbot,aatlasbot
             - name: OLLAMA_URL
-              value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/
+              value: http://ollama.ai.svc.cluster.local:11434
             - name: OLLAMA_MODEL
               value: qwen2.5:14b-instruct-q4_0
             - name: OLLAMA_TIMEOUT_SEC
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index e0056f8a..6644afb7 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -155,6 +155,37 @@ def _ensure_confidence(text: str) -> str:
     return "\n".join(lines)
 
 
+def _ollama_endpoint() -> str:
+    url = (OLLAMA_URL or "").strip()
+    if not url:
+        return ""
+    if url.endswith("/api/chat"):
+        return url
+    return url.rstrip("/") + "/api/chat"
+
+
+def _history_to_messages(lines: list[str]) -> list[dict[str, str]]:
+    messages: list[dict[str, str]] = []
+    for line in lines:
+        raw = (line or "").strip()
+        if not raw:
+            continue
+        role = "user"
+        content = raw
+        lowered = raw.lower()
+        if lowered.startswith("atlas:"):
+            role = "assistant"
+            content = raw.split(":", 1)[1].strip()
+        elif lowered.startswith("user:"):
+            role = "user"
+            content = raw.split(":", 1)[1].strip()
+        elif ":" in raw:
+            content = raw.split(":", 1)[1].strip()
+        if content:
+            messages.append({"role": role, "content": content})
+    return messages
+
+
 # Mention detection (Matrix rich mentions + plain @atlas).
 MENTION_TOKENS = [m.strip() for m in BOT_MENTIONS.split(",") if m.strip()]
 MENTION_LOCALPARTS = [m.lstrip("@").split(":", 1)[0] for m in MENTION_TOKENS]
@@ -1837,25 +1868,33 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str:
         "If the answer is not grounded in the provided context or tool data, say you do not know. "
         "End every response with a line: 'Confidence: high|medium|low'."
     )
-    transcript_parts = [system]
+    endpoint = _ollama_endpoint()
+    if not endpoint:
+        raise RuntimeError("ollama endpoint missing")
+    system_content = system
     if context:
-        transcript_parts.append("Context (grounded):\n" + context[:MAX_CONTEXT_CHARS])
-    transcript_parts.extend(history[hist_key][-24:])
-    transcript_parts.append(f"User: {prompt}")
-    transcript = "\n".join(transcript_parts)
+        system_content += "\n\nContext (grounded):\n" + context[:MAX_CONTEXT_CHARS]
 
-    payload = {"model": MODEL, "message": transcript}
+    messages: list[dict[str, str]] = [{"role": "system", "content": system_content}]
+    messages.extend(_history_to_messages(history[hist_key][-24:]))
+    messages.append({"role": "user", "content": prompt})
+
+    payload = {"model": MODEL, "messages": messages, "stream": False}
     headers = {"Content-Type": "application/json"}
     if API_KEY:
         headers["x-api-key"] = API_KEY
-    r = request.Request(OLLAMA_URL, data=json.dumps(payload).encode(), headers=headers)
+    r = request.Request(endpoint, data=json.dumps(payload).encode(), headers=headers)
     lock = _OLLAMA_LOCK if OLLAMA_SERIALIZE else None
     if lock:
         lock.acquire()
     try:
         with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp:
             data = json.loads(resp.read().decode())
-            raw_reply = data.get("message") or data.get("response") or data.get("reply") or data
+            msg = data.get("message") if isinstance(data, dict) else None
+            if isinstance(msg, dict):
+                raw_reply = msg.get("content")
+            else:
+                raw_reply = data.get("response") or data.get("reply") or data
             reply = _normalize_reply(raw_reply) or "I'm here to help."
         history[hist_key].append(f"Atlas: {reply}")
         return reply

From b7aa47d15cef6801ca8988643413dce833a91d58 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 12:47:28 -0300
Subject: [PATCH 301/416] atlasbot: preserve response text with confidence

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 services/comms/scripts/atlasbot/bot.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 7ec373fd..b3e617d5 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-34
+        checksum/atlasbot-configmap: manual-atlasbot-35
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 6644afb7..c790f5c5 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -149,7 +149,7 @@ def _ensure_confidence(text: str) -> str:
         match = CONFIDENCE_RE.search(line)
         if match:
             level = match.group(1).lower()
-            lines[idx] = f"Confidence: {level}"
+            lines[idx] = CONFIDENCE_RE.sub(f"Confidence: {level}", line)
             return "\n".join(lines)
     lines.append("Confidence: medium")
     return "\n".join(lines)

From 2edbef8774026530555e9779f7a4f4bc819632d8 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 12:53:17 -0300
Subject: [PATCH 302/416] atlasbot: enrich snapshot facts and pod metrics

---
 services/comms/atlasbot-deployment.yaml |  2 +-
 services/comms/scripts/atlasbot/bot.py  | 56 ++++++++++++++++++++++---
 2 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index b3e617d5..fd2f3992 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-35
+        checksum/atlasbot-configmap: manual-atlasbot-36
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index c790f5c5..03306204 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -95,6 +95,8 @@ METRIC_HINT_WORDS = {
     "pending",
     "unreachable",
     "latency",
+    "pod",
+    "pods",
 }
 
 CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL)
@@ -116,6 +118,7 @@ METRIC_HINTS = {
     "net": ("net", "network", "bandwidth", "throughput"),
     "io": ("io", "disk", "storage"),
     "connections": ("connections", "conn", "postgres", "database", "db"),
+    "pods": ("pods", "pod"),
 }
 
 _OLLAMA_LOCK = threading.Lock()
@@ -488,13 +491,15 @@ def _metric_expr_uses_percent(entry: dict[str, Any]) -> bool:
     return "* 100" in expr or "*100" in expr
 
 
-def _format_metric_value(value: str, *, percent: bool) -> str:
+def _format_metric_value(value: str, *, percent: bool, rate: bool = False) -> str:
     try:
         num = float(value)
     except (TypeError, ValueError):
         return value
     if percent:
         return f"{num:.1f}%"
+    if rate:
+        return _humanize_rate(value, unit="rate")
     if abs(num) >= 1:
         return f"{num:.2f}".rstrip("0").rstrip(".")
     return f"{num:.4f}".rstrip("0").rstrip(".")
@@ -779,6 +784,11 @@ def facts_context(
     lines: list[str] = ["Facts (live snapshot):"]
     if total is not None:
         lines.append(f"- nodes_total={total}, ready={ready}, not_ready={not_ready}")
+    if isinstance(summary, dict):
+        by_arch_counts = summary.get("by_arch")
+        if isinstance(by_arch_counts, dict) and by_arch_counts:
+            parts = [f"{arch}={count}" for arch, count in sorted(by_arch_counts.items())]
+            lines.append(f"- nodes_by_arch: {', '.join(parts)}")
     if not_ready_names:
         lines.append(f"- nodes_not_ready: {', '.join(not_ready_names)}")
     for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"):
@@ -799,7 +809,7 @@ def facts_context(
         lines.append(f"- workers_ready: {', '.join(ready_workers) if ready_workers else 'none'}")
         if not_ready_workers:
             lines.append(f"- workers_not_ready: {', '.join(not_ready_workers)}")
-    if expected_workers:
+    if expected_workers and any(word in normalize_query(prompt) for word in ("missing", "expected", "should", "not ready", "unready")):
         missing = sorted(
             set(expected_workers)
             - {n.get("name") for n in inv if isinstance(n, dict) and n.get("name")}
@@ -814,7 +824,11 @@ def facts_context(
         node = entry.get("node")
         value = entry.get("value")
         if node and value is not None:
-            value_fmt = _format_metric_value(str(value), percent=key in ("cpu", "ram"))
+            value_fmt = _format_metric_value(
+                str(value),
+                percent=key in ("cpu", "ram"),
+                rate=key in ("net", "io"),
+            )
             lines.append(f"- hottest_{key}: {node} ({value_fmt})")
 
     postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {}
@@ -829,6 +843,11 @@ def facts_context(
                 f"- postgres_hottest_db: {hottest_db.get('label')} ({hottest_db.get('value')})"
             )
 
+    for key in ("pods_running", "pods_pending", "pods_failed", "pods_succeeded"):
+        value = metrics.get(key)
+        if value is not None:
+            lines.append(f"- {key}: {value}")
+
     usage_table = _node_usage_table(metrics)
     if usage_table:
         lines.append("- node_usage (cpu/ram/net/io):")
@@ -838,8 +857,16 @@ def facts_context(
                 continue
             cpu = _format_metric_value(str(entry.get("cpu")), percent=True) if entry.get("cpu") is not None else ""
             ram = _format_metric_value(str(entry.get("ram")), percent=True) if entry.get("ram") is not None else ""
-            net = _format_metric_value(str(entry.get("net")), percent=False) if entry.get("net") is not None else ""
-            io_val = _format_metric_value(str(entry.get("io")), percent=False) if entry.get("io") is not None else ""
+            net = (
+                _format_metric_value(str(entry.get("net")), percent=False, rate=True)
+                if entry.get("net") is not None
+                else ""
+            )
+            io_val = (
+                _format_metric_value(str(entry.get("io")), percent=False, rate=True)
+                if entry.get("io") is not None
+                else ""
+            )
             lines.append(f"  - {node}: cpu={cpu}, ram={ram}, net={net}, io={io_val}")
 
     if nodes_in_query:
@@ -1029,7 +1056,7 @@ def snapshot_metric_answer(
         if top:
             node, val = top
             percent = metric in {"cpu", "ram"}
-            value = _format_metric_value(str(val), percent=percent)
+            value = _format_metric_value(str(val), percent=percent, rate=metric in {"net", "io"})
             scope = ""
             if include_hw:
                 scope = f" among {' and '.join(sorted(include_hw))}"
@@ -1051,6 +1078,23 @@ def snapshot_metric_answer(
         if parts:
             return _format_confidence(" ".join(parts), "high")
 
+    if metric == "pods":
+        running = metrics.get("pods_running")
+        pending = metrics.get("pods_pending")
+        failed = metrics.get("pods_failed")
+        succeeded = metrics.get("pods_succeeded")
+        parts = []
+        if running is not None:
+            parts.append(f"running {running:.0f}")
+        if pending is not None:
+            parts.append(f"pending {pending:.0f}")
+        if failed is not None:
+            parts.append(f"failed {failed:.0f}")
+        if succeeded is not None:
+            parts.append(f"succeeded {succeeded:.0f}")
+        if parts:
+            return _format_confidence(f"Pods: {', '.join(parts)}.", "high")
+
     return ""
 
 def structured_answer(

From fb6d3b515ce3c80d063fc34bd5ab412ec6ea0e80 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 12:59:11 -0300
Subject: [PATCH 303/416] atlasbot: use structured answers before LLM

---
 services/comms/atlasbot-deployment.yaml |  2 +-
 services/comms/scripts/atlasbot/bot.py  | 24 ++++++++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index fd2f3992..7fdbf649 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-36
+        checksum/atlasbot-configmap: manual-atlasbot-37
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 03306204..ff528ea0 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -1744,6 +1744,17 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
         snapshot = _snapshot_state()
         inventory = _snapshot_inventory(snapshot) or node_inventory_live()
         workloads = _snapshot_workloads(snapshot)
+        metrics_summary = snapshot_context(prompt, snapshot)
+        structured = structured_answer(
+            prompt,
+            inventory=inventory,
+            metrics_summary=metrics_summary,
+            snapshot=snapshot,
+            workloads=workloads,
+        )
+        if structured:
+            self._write_json(200, {"answer": structured})
+            return
         context = build_context(
             prompt,
             allow_tools=False,
@@ -2065,6 +2076,19 @@ def sync_loop(token: str, room_id: str):
                 if not inventory:
                     inventory = _snapshot_inventory(snapshot)
                 workloads = _snapshot_workloads(snapshot)
+                metrics_summary = snapshot_context(body, snapshot)
+                structured = structured_answer(
+                    body,
+                    inventory=inventory,
+                    metrics_summary=metrics_summary,
+                    snapshot=snapshot,
+                    workloads=workloads,
+                )
+                if structured:
+                    history[hist_key].append(f"Atlas: {structured}")
+                    history[hist_key] = history[hist_key][-80:]
+                    send_msg(token, rid, structured)
+                    continue
                 context = build_context(
                     body,
                     allow_tools=allow_tools,

From 373eb64c0d59f7a76410e70f9e0dbf2fa50ebeec Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 13:02:23 -0300
Subject: [PATCH 304/416] atlasbot: refine role and hardware filters

---
 services/comms/atlasbot-deployment.yaml |  2 +-
 services/comms/scripts/atlasbot/bot.py  | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 7fdbf649..ce53f8cb 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-37
+        checksum/atlasbot-configmap: manual-atlasbot-38
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index ff528ea0..a7741cda 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -432,7 +432,10 @@ def _detect_metric(q: str) -> str | None:
 def _detect_hardware_filters(q: str) -> tuple[set[str], set[str]]:
     include: set[str] = set()
     exclude: set[str] = set()
+    rpi_specific = "rpi4" in q or "rpi5" in q
     for hardware, phrases in HARDWARE_HINTS.items():
+        if hardware == "rpi" and rpi_specific:
+            continue
         for phrase in phrases:
             if f"non {phrase}" in q or f"non-{phrase}" in q or f"not {phrase}" in q:
                 exclude.add(hardware)
@@ -440,6 +443,17 @@ def _detect_hardware_filters(q: str) -> tuple[set[str], set[str]]:
                 include.add(hardware)
     return include, exclude
 
+
+def _detect_role_filters(q: str) -> set[str]:
+    roles: set[str] = set()
+    if "control-plane" in q or "control plane" in q:
+        roles.add("control-plane")
+    if "master" in q:
+        roles.add("master")
+    if "accelerator" in q:
+        roles.add("accelerator")
+    return roles
+
 def _detect_entity(q: str) -> str | None:
     if "node" in q or "nodes" in q or "worker" in q or TITAN_NODE_RE.search(q):
         return "node"
@@ -1125,6 +1139,7 @@ def structured_answer(
     include_hw, exclude_hw = _detect_hardware_filters(q)
     nodes_in_query = _extract_titan_nodes(q)
     only_workers = "worker" in q or "workers" in q
+    role_filters = _detect_role_filters(q)
     only_ready: bool | None = None
     if "not ready" in q or "unready" in q or "down" in q or "missing" in q:
         only_ready = False
@@ -1201,6 +1216,12 @@ def structured_answer(
         only_ready=only_ready if op in ("status", "count") else None,
         nodes_in_query=nodes_in_query,
     )
+    if role_filters:
+        filtered = [
+            node
+            for node in filtered
+            if role_filters.intersection(set(node.get("roles") or []))
+        ]
     names = [node["name"] for node in filtered]
 
     if op == "status":

From 2c26ec4a6fd520b2977243622e00ab2ccf75d1a9 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 13:13:20 -0300
Subject: [PATCH 305/416] atlasbot: fix metric detection and role counts

---
 services/comms/scripts/atlasbot/bot.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index a7741cda..739019c2 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -424,9 +424,14 @@ def _detect_operation(q: str) -> str | None:
     return None
 
 def _detect_metric(q: str) -> str | None:
+    tokens = set(_tokens(q))
     for metric, phrases in METRIC_HINTS.items():
-        if _has_any(q, phrases):
-            return metric
+        for phrase in phrases:
+            if " " in phrase:
+                if phrase in q:
+                    return metric
+            elif phrase in tokens:
+                return metric
     return None
 
 def _detect_hardware_filters(q: str) -> tuple[set[str], set[str]]:
@@ -1249,7 +1254,7 @@ def structured_answer(
             if missing:
                 msg += f" Missing: {', '.join(missing)}."
             return _format_confidence(msg, "high")
-        if not (include_hw or exclude_hw or nodes_in_query or only_workers):
+        if not (include_hw or exclude_hw or nodes_in_query or only_workers or role_filters):
             return _format_confidence(f"Atlas has {len(names)} nodes.", "high")
         return _format_confidence(f"Matching nodes: {len(names)}.", "high")
 

From a8f12ac94348817fb2f8476e0607d38cf8494917 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 13:15:13 -0300
Subject: [PATCH 306/416] comms: roll atlasbot after script update

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index ce53f8cb..4e793476 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-38
+        checksum/atlasbot-configmap: manual-atlasbot-39
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From f8a4febea9a816a5f75324480ab928fd8512c197 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 13:17:33 -0300
Subject: [PATCH 307/416] atlasbot: refine ready/pod counts

---
 services/comms/scripts/atlasbot/bot.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 739019c2..f7cfd824 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -1102,6 +1102,15 @@ def snapshot_metric_answer(
         pending = metrics.get("pods_pending")
         failed = metrics.get("pods_failed")
         succeeded = metrics.get("pods_succeeded")
+        if "pending" in q and pending is not None:
+            return _format_confidence(f"Pending pods: {pending:.0f}.", "high")
+        if "failed" in q and failed is not None:
+            return _format_confidence(f"Failed pods: {failed:.0f}.", "high")
+        if "succeeded" in q or "completed" in q:
+            if succeeded is not None:
+                return _format_confidence(f"Succeeded pods: {succeeded:.0f}.", "high")
+        if "running" in q and running is not None:
+            return _format_confidence(f"Running pods: {running:.0f}.", "high")
         parts = []
         if running is not None:
             parts.append(f"running {running:.0f}")
@@ -1254,6 +1263,10 @@ def structured_answer(
             if missing:
                 msg += f" Missing: {', '.join(missing)}."
             return _format_confidence(msg, "high")
+        if only_ready is True:
+            return _format_confidence(f"Ready nodes: {len(names)}.", "high")
+        if only_ready is False:
+            return _format_confidence(f"Not ready nodes: {len(names)}.", "high")
         if not (include_hw or exclude_hw or nodes_in_query or only_workers or role_filters):
             return _format_confidence(f"Atlas has {len(names)} nodes.", "high")
         return _format_confidence(f"Matching nodes: {len(names)}.", "high")

From 6351bfcdedd9183e6cb60f2615f2c83dd7cc6ee4 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 13:18:01 -0300
Subject: [PATCH 308/416] comms: roll atlasbot after answer tweaks

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 4e793476..9af766dc 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-39
+        checksum/atlasbot-configmap: manual-atlasbot-40
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From 369c0d27c51b9d315c9b16a07d52d060c60bdad7 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 14:09:23 -0300
Subject: [PATCH 309/416] portal: allow longer atlasbot responses

---
 services/bstein-dev-home/backend-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/backend-deployment.yaml b/services/bstein-dev-home/backend-deployment.yaml
index 26c99e11..ba7d6f80 100644
--- a/services/bstein-dev-home/backend-deployment.yaml
+++ b/services/bstein-dev-home/backend-deployment.yaml
@@ -70,7 +70,7 @@ spec:
             - name: AI_ATLASBOT_ENDPOINT
               value: http://atlasbot.comms.svc.cluster.local:8090/v1/answer
             - name: AI_ATLASBOT_TIMEOUT_SEC
-              value: "5"
+              value: "30"
             - name: AI_NODE_NAME
               valueFrom:
                 fieldRef:

From 9e4a5b7e6b6b11f1303d4cfee8a1ad00480693f8 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Tue, 27 Jan 2026 17:12:03 +0000
Subject: [PATCH 310/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index a520991b..563b920e 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-159 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-159 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-160 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From f749e5f1f8ae2430071314eae2d78791e2d67cdd Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Tue, 27 Jan 2026 17:12:07 +0000
Subject: [PATCH 311/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 563b920e..66d41e30 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-159 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-160 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-160 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From 41890e06ab702bb70f94141704d0ec21dea2fcad Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 14:38:05 -0300
Subject: [PATCH 312/416] atlasbot: ignore mentions and gate cluster context

---
 services/comms/scripts/atlasbot/bot.py | 193 +++++++++++++++++++------
 1 file changed, 146 insertions(+), 47 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index f7cfd824..26fe7efc 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -121,6 +121,49 @@ METRIC_HINTS = {
     "pods": ("pods", "pod"),
 }
 
+CLUSTER_HINT_WORDS = {
+    "atlas",
+    "titan",
+    "cluster",
+    "k8s",
+    "kubernetes",
+    "node",
+    "nodes",
+    "pod",
+    "pods",
+    "namespace",
+    "service",
+    "deployment",
+    "daemonset",
+    "statefulset",
+    "grafana",
+    "victoria",
+    "prometheus",
+    "ariadne",
+    "mailu",
+    "nextcloud",
+    "vaultwarden",
+    "firefly",
+    "wger",
+    "jellyfin",
+    "planka",
+    "budget",
+    "element",
+    "synapse",
+    "mas",
+    "comms",
+    "longhorn",
+    "harbor",
+    "jenkins",
+    "gitea",
+    "flux",
+    "keycloak",
+    "postgres",
+    "database",
+    "db",
+    "atlasbot",
+}
+
 _OLLAMA_LOCK = threading.Lock()
 
 HARDWARE_HINTS = {
@@ -231,6 +274,18 @@ def is_mentioned(content: dict, body: str) -> bool:
         return False
     return any(isinstance(uid, str) and uid.lower() in MENTION_USER_IDS for uid in user_ids)
 
+def _strip_bot_mention(text: str) -> str:
+    if not text:
+        return ""
+    if not MENTION_LOCALPARTS:
+        return text.strip()
+    names = [re.escape(name) for name in MENTION_LOCALPARTS if name]
+    if not names:
+        return text.strip()
+    pattern = r"^(?:\s*@?(?:" + "|".join(names) + r")(?::)?\s+)+"
+    cleaned = re.sub(pattern, "", text, flags=re.IGNORECASE).strip()
+    return cleaned or text.strip()
+
 
 # Matrix HTTP helper.
 def req(method: str, path: str, token: str | None = None, body=None, timeout=60, base: str | None = None):
@@ -1780,33 +1835,38 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
         if not prompt:
             self._write_json(400, {"error": "missing_prompt"})
             return
+        cleaned = _strip_bot_mention(prompt)
         snapshot = _snapshot_state()
         inventory = _snapshot_inventory(snapshot) or node_inventory_live()
         workloads = _snapshot_workloads(snapshot)
-        metrics_summary = snapshot_context(prompt, snapshot)
-        structured = structured_answer(
-            prompt,
-            inventory=inventory,
-            metrics_summary=metrics_summary,
-            snapshot=snapshot,
-            workloads=workloads,
-        )
-        if structured:
-            self._write_json(200, {"answer": structured})
-            return
-        context = build_context(
-            prompt,
-            allow_tools=False,
-            targets=[],
-            inventory=inventory,
-            snapshot=snapshot,
-            workloads=workloads,
-        )
-        metrics_context, _metrics_fallback = metrics_query_context(prompt, allow_tools=True)
-        if metrics_context:
-            context = (context + "\n\n" + metrics_context).strip() if context else metrics_context
+        cluster_query = _is_cluster_query(cleaned, inventory=inventory, workloads=workloads)
+        metrics_summary = snapshot_context(cleaned, snapshot) if cluster_query else ""
+        if cluster_query:
+            structured = structured_answer(
+                cleaned,
+                inventory=inventory,
+                metrics_summary=metrics_summary,
+                snapshot=snapshot,
+                workloads=workloads,
+            )
+            if structured:
+                self._write_json(200, {"answer": structured})
+                return
+        context = ""
+        if cluster_query:
+            context = build_context(
+                cleaned,
+                allow_tools=False,
+                targets=[],
+                inventory=inventory,
+                snapshot=snapshot,
+                workloads=workloads,
+            )
+            metrics_context, _metrics_fallback = metrics_query_context(cleaned, allow_tools=True)
+            if metrics_context:
+                context = (context + "\n\n" + metrics_context).strip() if context else metrics_context
         fallback = "I don't have enough data to answer that."
-        answer = ollama_reply(("http", "internal"), prompt, context=context, fallback=fallback)
+        answer = ollama_reply(("http", "internal"), cleaned, context=context, fallback=fallback)
         self._write_json(200, {"answer": answer})
 
 
@@ -1920,6 +1980,37 @@ def _knowledge_intent(prompt: str) -> bool:
     )
 
 
+def _is_cluster_query(
+    prompt: str,
+    *,
+    inventory: list[dict[str, Any]] | None,
+    workloads: list[dict[str, Any]] | None,
+) -> bool:
+    q = normalize_query(prompt)
+    if not q:
+        return False
+    if TITAN_NODE_RE.search(q):
+        return True
+    if any(word in q for word in CLUSTER_HINT_WORDS):
+        return True
+    for host_match in HOST_RE.finditer(q):
+        host = host_match.group(1).lower()
+        if host.endswith("bstein.dev"):
+            return True
+    tokens = set(_tokens(q))
+    if workloads:
+        for entry in workloads:
+            if not isinstance(entry, dict):
+                continue
+            if tokens & _workload_tokens(entry):
+                return True
+    if inventory:
+        names = {node.get("name") for node in inventory if isinstance(node, dict)}
+        if tokens & {n for n in names if n}:
+            return True
+    return False
+
+
 def _inventory_summary(inventory: list[dict[str, Any]]) -> str:
     if not inventory:
         return ""
@@ -1958,7 +2049,8 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str:
         "Do not suggest commands unless explicitly asked. "
         "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. "
         "Translate metrics into natural language instead of echoing raw label/value pairs. "
-        "Do not answer by only listing runbooks; summarize the cluster first and mention docs only if useful. "
+        "Do not answer by only listing runbooks; if the question is about Atlas/Othrys, summarize the cluster first and mention docs only if useful. "
+        "If the question is not about Atlas/Othrys and no cluster context is provided, answer using general knowledge and say when you are unsure. "
         "If the answer is not grounded in the provided context or tool data, say you do not know. "
         "End every response with a line: 'Confidence: high|medium|low'."
     )
@@ -2087,7 +2179,8 @@ def sync_loop(token: str, room_id: str):
                 if not (is_dm or mentioned):
                     continue
 
-                lower_body = body.lower()
+                cleaned_body = _strip_bot_mention(body)
+                lower_body = cleaned_body.lower()
 
                 # Only do live cluster introspection in DMs; metrics can be answered when mentioned.
                 allow_tools = is_dm
@@ -2101,7 +2194,7 @@ def sync_loop(token: str, room_id: str):
 
                 # Attempt to scope tools to the most likely workloads when hostnames are mentioned.
                 targets: list[tuple[str, str]] = []
-                for m in HOST_RE.finditer(body.lower()):
+                for m in HOST_RE.finditer(lower_body):
                     host = m.group(1).lower()
                     for ep in _HOST_INDEX.get(host, []):
                         backend = ep.get("backend") or {}
@@ -2111,39 +2204,45 @@ def sync_loop(token: str, room_id: str):
                                 targets.append((ns, str(w["name"])))
 
                 snapshot = _snapshot_state()
-                inventory = node_inventory_for_prompt(body)
+                inventory = node_inventory_for_prompt(cleaned_body)
                 if not inventory:
                     inventory = _snapshot_inventory(snapshot)
                 workloads = _snapshot_workloads(snapshot)
-                metrics_summary = snapshot_context(body, snapshot)
-                structured = structured_answer(
-                    body,
-                    inventory=inventory,
-                    metrics_summary=metrics_summary,
-                    snapshot=snapshot,
-                    workloads=workloads,
-                )
+                cluster_query = _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads)
+                metrics_summary = snapshot_context(cleaned_body, snapshot) if cluster_query else ""
+                structured = ""
+                if cluster_query:
+                    structured = structured_answer(
+                        cleaned_body,
+                        inventory=inventory,
+                        metrics_summary=metrics_summary,
+                        snapshot=snapshot,
+                        workloads=workloads,
+                    )
                 if structured:
                     history[hist_key].append(f"Atlas: {structured}")
                     history[hist_key] = history[hist_key][-80:]
                     send_msg(token, rid, structured)
                     continue
-                context = build_context(
-                    body,
-                    allow_tools=allow_tools,
-                    targets=targets,
-                    inventory=inventory,
-                    snapshot=snapshot,
-                    workloads=workloads,
-                )
+                context = ""
+                if cluster_query:
+                    context = build_context(
+                        cleaned_body,
+                        allow_tools=allow_tools,
+                        targets=targets,
+                        inventory=inventory,
+                        snapshot=snapshot,
+                        workloads=workloads,
+                    )
                 if allow_tools and promql:
                     res = vm_query(promql, timeout=20)
                     rendered = vm_render_result(res, limit=15) or "(no results)"
                     extra = "VictoriaMetrics (PromQL result):\n" + rendered
                     context = (context + "\n\n" + extra).strip() if context else extra
-                metrics_context, _metrics_fallback = metrics_query_context(body, allow_tools=allow_metrics)
-                if metrics_context:
-                    context = (context + "\n\n" + metrics_context).strip() if context else metrics_context
+                if cluster_query:
+                    metrics_context, _metrics_fallback = metrics_query_context(cleaned_body, allow_tools=allow_metrics)
+                    if metrics_context:
+                        context = (context + "\n\n" + metrics_context).strip() if context else metrics_context
 
                 fallback = "I don't have enough data to answer that."
 
@@ -2151,7 +2250,7 @@ def sync_loop(token: str, room_id: str):
                     token,
                     rid,
                     hist_key,
-                    body,
+                    cleaned_body,
                     context=context,
                     fallback=fallback,
                 )

From 21ff16cd7b08ec810c83f1615bb810e270bdf317 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 14:38:15 -0300
Subject: [PATCH 313/416] comms: roll atlasbot for mention stripping

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 9af766dc..aa91fdf6 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-40
+        checksum/atlasbot-configmap: manual-atlasbot-41
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From 681a37a9aec86ea2169c657462c35402d6a1519c Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 14:54:09 -0300
Subject: [PATCH 314/416] atlasbot: simplify cluster gating and context

---
 services/comms/atlasbot-deployment.yaml |   2 +-
 services/comms/scripts/atlasbot/bot.py  | 197 ++++++++++++++++--------
 2 files changed, 133 insertions(+), 66 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index aa91fdf6..a2b0a3c2 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-41
+        checksum/atlasbot-configmap: manual-atlasbot-42
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 26fe7efc..64097dab 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -65,6 +65,16 @@ STOPWORDS = {
     "help",
     "atlas",
     "othrys",
+    "system",
+    "systems",
+    "service",
+    "services",
+    "app",
+    "apps",
+    "platform",
+    "software",
+    "tool",
+    "tools",
 }
 
 METRIC_HINT_WORDS = {
@@ -129,6 +139,8 @@ CLUSTER_HINT_WORDS = {
     "kubernetes",
     "node",
     "nodes",
+    "worker",
+    "workers",
     "pod",
     "pods",
     "namespace",
@@ -162,6 +174,11 @@ CLUSTER_HINT_WORDS = {
     "database",
     "db",
     "atlasbot",
+    "jetson",
+    "rpi",
+    "raspberry",
+    "amd64",
+    "arm64",
 }
 
 _OLLAMA_LOCK = threading.Lock()
@@ -1840,18 +1857,6 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
         inventory = _snapshot_inventory(snapshot) or node_inventory_live()
         workloads = _snapshot_workloads(snapshot)
         cluster_query = _is_cluster_query(cleaned, inventory=inventory, workloads=workloads)
-        metrics_summary = snapshot_context(cleaned, snapshot) if cluster_query else ""
-        if cluster_query:
-            structured = structured_answer(
-                cleaned,
-                inventory=inventory,
-                metrics_summary=metrics_summary,
-                snapshot=snapshot,
-                workloads=workloads,
-            )
-            if structured:
-                self._write_json(200, {"answer": structured})
-                return
         context = ""
         if cluster_query:
             context = build_context(
@@ -1862,11 +1867,14 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
                 snapshot=snapshot,
                 workloads=workloads,
             )
-            metrics_context, _metrics_fallback = metrics_query_context(cleaned, allow_tools=True)
-            if metrics_context:
-                context = (context + "\n\n" + metrics_context).strip() if context else metrics_context
         fallback = "I don't have enough data to answer that."
-        answer = ollama_reply(("http", "internal"), cleaned, context=context, fallback=fallback)
+        answer = ollama_reply(
+            ("http", "internal"),
+            cleaned,
+            context=context,
+            fallback=fallback,
+            use_history=False,
+        )
         self._write_json(200, {"answer": answer})
 
 
@@ -1897,6 +1905,15 @@ def build_context(
     if facts:
         parts.append(facts)
 
+    snapshot_json = snapshot_compact_context(
+        prompt,
+        snapshot,
+        inventory=inventory,
+        workloads=workloads,
+    )
+    if snapshot_json:
+        parts.append(snapshot_json)
+
     endpoints, edges = catalog_hints(prompt)
     if endpoints:
         parts.append(endpoints)
@@ -1925,15 +1942,6 @@ def build_context(
         if flux_bad:
             parts.append("Flux (not ready):\n" + flux_bad)
 
-        p_l = (prompt or "").lower()
-        if any(w in p_l for w in METRIC_HINT_WORDS):
-            restarts = vm_top_restarts(1)
-            if restarts:
-                parts.append("VictoriaMetrics (top restarts 1h):\n" + restarts)
-            snap = vm_cluster_snapshot()
-            if snap:
-                parts.append("VictoriaMetrics (cluster snapshot):\n" + snap)
-
     return "\n\n".join([p for p in parts if p]).strip()
 
 
@@ -1963,6 +1971,68 @@ def snapshot_context(prompt: str, snapshot: dict[str, Any] | None) -> str:
             parts.append(f"Snapshot: workload={match}.")
     return "\n".join(parts).strip()
 
+def _compact_nodes_detail(snapshot: dict[str, Any]) -> list[dict[str, Any]]:
+    details = snapshot.get("nodes_detail") if isinstance(snapshot.get("nodes_detail"), list) else []
+    output: list[dict[str, Any]] = []
+    for node in details:
+        if not isinstance(node, dict):
+            continue
+        name = node.get("name")
+        if not name:
+            continue
+        output.append(
+            {
+                "name": name,
+                "ready": node.get("ready"),
+                "hardware": node.get("hardware"),
+                "arch": node.get("arch"),
+                "roles": node.get("roles"),
+                "is_worker": node.get("is_worker"),
+                "os": node.get("os"),
+                "kernel": node.get("kernel"),
+                "kubelet": node.get("kubelet"),
+                "container_runtime": node.get("container_runtime"),
+            }
+        )
+    return output
+
+def _compact_metrics(snapshot: dict[str, Any]) -> dict[str, Any]:
+    metrics = snapshot.get("metrics") if isinstance(snapshot.get("metrics"), dict) else {}
+    return {
+        "pods_running": metrics.get("pods_running"),
+        "pods_pending": metrics.get("pods_pending"),
+        "pods_failed": metrics.get("pods_failed"),
+        "pods_succeeded": metrics.get("pods_succeeded"),
+        "postgres_connections": metrics.get("postgres_connections"),
+        "hottest_nodes": metrics.get("hottest_nodes"),
+        "node_usage": metrics.get("node_usage"),
+        "top_restarts_1h": metrics.get("top_restarts_1h"),
+    }
+
+def snapshot_compact_context(
+    prompt: str,
+    snapshot: dict[str, Any] | None,
+    *,
+    inventory: list[dict[str, Any]] | None,
+    workloads: list[dict[str, Any]] | None,
+) -> str:
+    if not snapshot:
+        return ""
+    compact = {
+        "collected_at": snapshot.get("collected_at"),
+        "nodes_summary": snapshot.get("nodes_summary"),
+        "expected_workers": expected_worker_nodes_from_metrics(),
+        "nodes_detail": _compact_nodes_detail(snapshot),
+        "workloads": _workloads_for_prompt(prompt, workloads or [], limit=40) if workloads else [],
+        "metrics": _compact_metrics(snapshot),
+        "flux": snapshot.get("flux"),
+        "errors": snapshot.get("errors"),
+    }
+    text = json.dumps(compact, ensure_ascii=False)
+    if len(text) > MAX_FACTS_CHARS:
+        text = text[: MAX_FACTS_CHARS - 3].rstrip() + "..."
+    return "Cluster snapshot (JSON):\n" + text
+
 
 def _knowledge_intent(prompt: str) -> bool:
     q = normalize_query(prompt)
@@ -1998,16 +2068,8 @@ def _is_cluster_query(
         if host.endswith("bstein.dev"):
             return True
     tokens = set(_tokens(q))
-    if workloads:
-        for entry in workloads:
-            if not isinstance(entry, dict):
-                continue
-            if tokens & _workload_tokens(entry):
-                return True
-    if inventory:
-        names = {node.get("name") for node in inventory if isinstance(node, dict)}
-        if tokens & {n for n in names if n}:
-            return True
+    if _NAME_INDEX and tokens & _NAME_INDEX:
+        return True
     return False
 
 
@@ -2037,7 +2099,7 @@ def knowledge_summary(prompt: str, inventory: list[dict[str, Any]]) -> str:
     summary = "\n".join(parts).strip()
     return _format_confidence(summary, "medium") if summary else ""
 
-def _ollama_call(hist_key, prompt: str, *, context: str) -> str:
+def _ollama_call(hist_key, prompt: str, *, context: str, use_history: bool = True) -> str:
     system = (
         "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. "
         "Be helpful, direct, and concise. "
@@ -2062,7 +2124,8 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str:
         system_content += "\n\nContext (grounded):\n" + context[:MAX_CONTEXT_CHARS]
 
     messages: list[dict[str, str]] = [{"role": "system", "content": system_content}]
-    messages.extend(_history_to_messages(history[hist_key][-24:]))
+    if use_history:
+        messages.extend(_history_to_messages(history[hist_key][-24:]))
     messages.append({"role": "user", "content": prompt})
 
     payload = {"model": MODEL, "messages": messages, "stream": False}
@@ -2082,31 +2145,55 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str:
             else:
                 raw_reply = data.get("response") or data.get("reply") or data
             reply = _normalize_reply(raw_reply) or "I'm here to help."
-        history[hist_key].append(f"Atlas: {reply}")
+        if use_history:
+            history[hist_key].append(f"Atlas: {reply}")
         return reply
     finally:
         if lock:
             lock.release()
 
-def ollama_reply(hist_key, prompt: str, *, context: str, fallback: str = "") -> str:
+def ollama_reply(
+    hist_key,
+    prompt: str,
+    *,
+    context: str,
+    fallback: str = "",
+    use_history: bool = True,
+) -> str:
     last_error = None
     for attempt in range(max(1, OLLAMA_RETRIES + 1)):
         try:
-            return _ollama_call(hist_key, prompt, context=context)
+            return _ollama_call(hist_key, prompt, context=context, use_history=use_history)
         except Exception as exc:  # noqa: BLE001
             last_error = exc
             time.sleep(min(4, 2 ** attempt))
     if fallback:
-        history[hist_key].append(f"Atlas: {fallback}")
+        if use_history:
+            history[hist_key].append(f"Atlas: {fallback}")
         return fallback
     return "I don't have enough data to answer that."
 
-def ollama_reply_with_thinking(token: str, room: str, hist_key, prompt: str, *, context: str, fallback: str) -> str:
+def ollama_reply_with_thinking(
+    token: str,
+    room: str,
+    hist_key,
+    prompt: str,
+    *,
+    context: str,
+    fallback: str,
+    use_history: bool = True,
+) -> str:
     result: dict[str, str] = {"reply": ""}
     done = threading.Event()
 
     def worker():
-        result["reply"] = ollama_reply(hist_key, prompt, context=context, fallback=fallback)
+        result["reply"] = ollama_reply(
+            hist_key,
+            prompt,
+            context=context,
+            fallback=fallback,
+            use_history=use_history,
+        )
         done.set()
 
     thread = threading.Thread(target=worker, daemon=True)
@@ -2182,9 +2269,8 @@ def sync_loop(token: str, room_id: str):
                 cleaned_body = _strip_bot_mention(body)
                 lower_body = cleaned_body.lower()
 
-                # Only do live cluster introspection in DMs; metrics can be answered when mentioned.
+                # Only do live cluster introspection in DMs.
                 allow_tools = is_dm
-                allow_metrics = is_dm or mentioned
 
                 promql = ""
                 if allow_tools:
@@ -2209,21 +2295,6 @@ def sync_loop(token: str, room_id: str):
                     inventory = _snapshot_inventory(snapshot)
                 workloads = _snapshot_workloads(snapshot)
                 cluster_query = _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads)
-                metrics_summary = snapshot_context(cleaned_body, snapshot) if cluster_query else ""
-                structured = ""
-                if cluster_query:
-                    structured = structured_answer(
-                        cleaned_body,
-                        inventory=inventory,
-                        metrics_summary=metrics_summary,
-                        snapshot=snapshot,
-                        workloads=workloads,
-                    )
-                if structured:
-                    history[hist_key].append(f"Atlas: {structured}")
-                    history[hist_key] = history[hist_key][-80:]
-                    send_msg(token, rid, structured)
-                    continue
                 context = ""
                 if cluster_query:
                     context = build_context(
@@ -2239,11 +2310,6 @@ def sync_loop(token: str, room_id: str):
                     rendered = vm_render_result(res, limit=15) or "(no results)"
                     extra = "VictoriaMetrics (PromQL result):\n" + rendered
                     context = (context + "\n\n" + extra).strip() if context else extra
-                if cluster_query:
-                    metrics_context, _metrics_fallback = metrics_query_context(cleaned_body, allow_tools=allow_metrics)
-                    if metrics_context:
-                        context = (context + "\n\n" + metrics_context).strip() if context else metrics_context
-
                 fallback = "I don't have enough data to answer that."
 
                 reply = ollama_reply_with_thinking(
@@ -2253,6 +2319,7 @@ def sync_loop(token: str, room_id: str):
                     cleaned_body,
                     context=context,
                     fallback=fallback,
+                    use_history=cluster_query,
                 )
                 send_msg(token, rid, reply)
 

From 60195033f6741984c332a43a0378adb7ceafcc5b Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Tue, 27 Jan 2026 17:58:07 +0000
Subject: [PATCH 315/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 66d41e30..04d7e825 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-160 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-161 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-160 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From c758f28be0faeaab1368ff6b247a9c587ef73590 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Tue, 27 Jan 2026 17:58:11 +0000
Subject: [PATCH 316/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 04d7e825..bb9e5f09 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-161 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-160 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-161 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 62b7ea7dcbf207a5dc20bf7649ac662fcb7987cc Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 15:00:36 -0300
Subject: [PATCH 317/416] atlasbot: tighten cluster intent and snapshot framing

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 services/comms/scripts/atlasbot/bot.py  | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index a2b0a3c2..d24cba2b 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-42
+        checksum/atlasbot-configmap: manual-atlasbot-43
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 64097dab..bee72e91 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -2104,6 +2104,7 @@ def _ollama_call(hist_key, prompt: str, *, context: str, use_history: bool = Tru
         "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. "
         "Be helpful, direct, and concise. "
         "Use the provided context and facts as your source of truth. "
+        "If the context includes a cluster snapshot, treat the question as about the Atlas/Othrys cluster even if the prompt is ambiguous. "
         "Treat 'hottest' as highest utilization (CPU/RAM/NET/IO) rather than temperature. "
         "If you infer or synthesize, say 'Based on the snapshot' and keep it brief. "
         "Prefer exact repo paths and Kubernetes resource names when relevant. "

From ef578456d0c518ce81e2b46315e2fa030c58824a Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 15:04:10 -0300
Subject: [PATCH 318/416] atlasbot: force cluster intent in prompts

---
 services/comms/atlasbot-deployment.yaml |  2 +-
 services/comms/scripts/atlasbot/bot.py  | 10 ++++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index d24cba2b..f4e7f7d1 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-43
+        checksum/atlasbot-configmap: manual-atlasbot-44
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index bee72e91..4316fe03 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -1868,9 +1868,12 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
                 workloads=workloads,
             )
         fallback = "I don't have enough data to answer that."
+        llm_prompt = cleaned
+        if cluster_query:
+            llm_prompt = f"Atlas cluster question: {cleaned}"
         answer = ollama_reply(
             ("http", "internal"),
-            cleaned,
+            llm_prompt,
             context=context,
             fallback=fallback,
             use_history=False,
@@ -2313,11 +2316,14 @@ def sync_loop(token: str, room_id: str):
                     context = (context + "\n\n" + extra).strip() if context else extra
                 fallback = "I don't have enough data to answer that."
 
+                llm_prompt = cleaned_body
+                if cluster_query:
+                    llm_prompt = f"Atlas cluster question: {cleaned_body}"
                 reply = ollama_reply_with_thinking(
                     token,
                     rid,
                     hist_key,
-                    cleaned_body,
+                    llm_prompt,
                     context=context,
                     fallback=fallback,
                     use_history=cluster_query,

From 0fca01d9a1b14f2c78e889697ff53166a74169ea Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 15:07:28 -0300
Subject: [PATCH 319/416] atlasbot: strengthen cluster disambiguation

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 services/comms/scripts/atlasbot/bot.py  | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index f4e7f7d1..de50c37d 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-44
+        checksum/atlasbot-configmap: manual-atlasbot-45
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 4316fe03..62304fa0 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -1870,7 +1870,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
         fallback = "I don't have enough data to answer that."
         llm_prompt = cleaned
         if cluster_query:
-            llm_prompt = f"Atlas cluster question: {cleaned}"
+            llm_prompt = f"Atlas cluster question (use the cluster snapshot context): {cleaned}"
         answer = ollama_reply(
             ("http", "internal"),
             llm_prompt,
@@ -2108,6 +2108,7 @@ def _ollama_call(hist_key, prompt: str, *, context: str, use_history: bool = Tru
         "Be helpful, direct, and concise. "
         "Use the provided context and facts as your source of truth. "
         "If the context includes a cluster snapshot, treat the question as about the Atlas/Othrys cluster even if the prompt is ambiguous. "
+        "When a cluster snapshot is provided, never answer about unrelated meanings of 'Atlas' (maps, mythology, Apache Atlas, etc). "
         "Treat 'hottest' as highest utilization (CPU/RAM/NET/IO) rather than temperature. "
         "If you infer or synthesize, say 'Based on the snapshot' and keep it brief. "
         "Prefer exact repo paths and Kubernetes resource names when relevant. "
@@ -2318,7 +2319,7 @@ def sync_loop(token: str, room_id: str):
 
                 llm_prompt = cleaned_body
                 if cluster_query:
-                    llm_prompt = f"Atlas cluster question: {cleaned_body}"
+                    llm_prompt = f\"Atlas cluster question (use the cluster snapshot context): {cleaned_body}\"
                 reply = ollama_reply_with_thinking(
                     token,
                     rid,

From 41d185fad3a69cf101ee9e53ac5fcc236b526382 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 15:10:03 -0300
Subject: [PATCH 320/416] atlasbot: fix prompt formatting

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 services/comms/scripts/atlasbot/bot.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index de50c37d..d4d66684 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-45
+        checksum/atlasbot-configmap: manual-atlasbot-46
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 62304fa0..429fa31d 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -2319,7 +2319,7 @@ def sync_loop(token: str, room_id: str):
 
                 llm_prompt = cleaned_body
                 if cluster_query:
-                    llm_prompt = f\"Atlas cluster question (use the cluster snapshot context): {cleaned_body}\"
+                    llm_prompt = f"Atlas cluster question (use the cluster snapshot context): {cleaned_body}"
                 reply = ollama_reply_with_thinking(
                     token,
                     rid,

From 0e26d249c68b8882db74ada48d606d60798070d0 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 15:12:47 -0300
Subject: [PATCH 321/416] atlasbot: send snapshot as explicit context

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 services/comms/scripts/atlasbot/bot.py  | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index d4d66684..47d09920 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-46
+        checksum/atlasbot-configmap: manual-atlasbot-47
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 429fa31d..351bb400 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -2124,11 +2124,9 @@ def _ollama_call(hist_key, prompt: str, *, context: str, use_history: bool = Tru
     endpoint = _ollama_endpoint()
     if not endpoint:
         raise RuntimeError("ollama endpoint missing")
-    system_content = system
+    messages: list[dict[str, str]] = [{"role": "system", "content": system}]
     if context:
-        system_content += "\n\nContext (grounded):\n" + context[:MAX_CONTEXT_CHARS]
-
-    messages: list[dict[str, str]] = [{"role": "system", "content": system_content}]
+        messages.append({"role": "user", "content": "Context (grounded):\n" + context[:MAX_CONTEXT_CHARS]})
     if use_history:
         messages.extend(_history_to_messages(history[hist_key][-24:]))
     messages.append({"role": "user", "content": prompt})

From a8005bd13e75a16ca1595d47e14e8b58c88ddfe7 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 15:30:43 -0300
Subject: [PATCH 322/416] atlasbot: answer cluster queries without llm

---
 services/comms/atlasbot-deployment.yaml |   2 +-
 services/comms/scripts/atlasbot/bot.py  | 284 ++++++++++++++++++++++--
 2 files changed, 263 insertions(+), 23 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 47d09920..69b30e4d 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-47
+        checksum/atlasbot-configmap: manual-atlasbot-48
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 351bb400..f0bf008b 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -532,7 +532,7 @@ def _detect_role_filters(q: str) -> set[str]:
     return roles
 
 def _detect_entity(q: str) -> str | None:
-    if "node" in q or "nodes" in q or "worker" in q or TITAN_NODE_RE.search(q):
+    if "node" in q or "nodes" in q or "worker" in q or "hardware" in q or "architecture" in q or TITAN_NODE_RE.search(q):
         return "node"
     if "pod" in q or "pods" in q:
         return "pod"
@@ -1152,6 +1152,15 @@ def snapshot_metric_answer(
             if include_hw:
                 scope = f" among {' and '.join(sorted(include_hw))}"
             answer = f"Hottest node{scope}: {node} ({value})."
+            if allowed_nodes and len(allowed_nodes) != len(inventory):
+                overall = _node_usage_top(usage, allowed_nodes=None)
+                if overall and overall[0] != node:
+                    overall_val = _format_metric_value(
+                        str(overall[1]),
+                        percent=percent,
+                        rate=metric in {"net", "io"},
+                    )
+                    answer += f" Overall hottest: {overall[0]} ({overall_val})."
             return _format_confidence(answer, "high")
 
     if metric == "connections" or "postgres" in q:
@@ -1358,6 +1367,219 @@ def structured_answer(
 
     return ""
 
+
+def _nodes_summary_line(inventory: list[dict[str, Any]], snapshot: dict[str, Any] | None) -> str:
+    summary = snapshot.get("nodes_summary") if isinstance(snapshot, dict) else {}
+    nodes = snapshot.get("nodes") if isinstance(snapshot, dict) else {}
+    total = summary.get("total") if isinstance(summary, dict) and summary.get("total") is not None else nodes.get("total")
+    ready = summary.get("ready") if isinstance(summary, dict) and summary.get("ready") is not None else nodes.get("ready")
+    not_ready = summary.get("not_ready") if isinstance(summary, dict) and summary.get("not_ready") is not None else nodes.get("not_ready")
+    if total is None:
+        total = len(inventory)
+        ready = len([n for n in inventory if n.get("ready") is True])
+        not_ready = len([n for n in inventory if n.get("ready") is False])
+    if total is None:
+        return ""
+    return f"Atlas cluster has {total} nodes ({ready} ready, {not_ready} not ready)."
+
+
+def _hardware_mix_line(inventory: list[dict[str, Any]]) -> str:
+    if not inventory:
+        return ""
+    groups = _group_nodes(inventory)
+    parts: list[str] = []
+    for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"):
+        nodes = groups.get(key) or []
+        if nodes:
+            parts.append(f"{key}={len(nodes)}")
+    if not parts:
+        return ""
+    return "Hardware mix: " + ", ".join(parts) + "."
+
+
+def _os_mix_line(snapshot: dict[str, Any] | None) -> str:
+    if not snapshot:
+        return ""
+    details = snapshot.get("nodes_detail") if isinstance(snapshot.get("nodes_detail"), list) else []
+    counts: dict[str, int] = collections.Counter()
+    for node in details:
+        if not isinstance(node, dict):
+            continue
+        os_name = (node.get("os") or "").strip()
+        if os_name:
+            counts[os_name] += 1
+    if not counts:
+        return ""
+    parts = [f"{os_name}={count}" for os_name, count in sorted(counts.items(), key=lambda item: (-item[1], item[0]))]
+    return "OS mix: " + ", ".join(parts[:5]) + "."
+
+
+def _pods_summary_line(metrics: dict[str, Any]) -> str:
+    if not metrics:
+        return ""
+    running = metrics.get("pods_running")
+    pending = metrics.get("pods_pending")
+    failed = metrics.get("pods_failed")
+    succeeded = metrics.get("pods_succeeded")
+    parts: list[str] = []
+    if running is not None:
+        parts.append(f"{running:.0f} running")
+    if pending is not None:
+        parts.append(f"{pending:.0f} pending")
+    if failed is not None:
+        parts.append(f"{failed:.0f} failed")
+    if succeeded is not None:
+        parts.append(f"{succeeded:.0f} succeeded")
+    if not parts:
+        return ""
+    return "Pods: " + ", ".join(parts) + "."
+
+
+def _postgres_summary_line(metrics: dict[str, Any]) -> str:
+    if not metrics:
+        return ""
+    postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {}
+    if not postgres:
+        return ""
+    used = postgres.get("used")
+    max_conn = postgres.get("max")
+    hottest = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {}
+    parts: list[str] = []
+    if used is not None and max_conn is not None:
+        parts.append(f"{used:.0f}/{max_conn:.0f} connections")
+    if hottest.get("label"):
+        hot_val = hottest.get("value")
+        hot_val_str = _format_metric_value(str(hot_val), percent=False) if hot_val is not None else ""
+        parts.append(f"hottest {hottest.get('label')} ({hot_val_str})")
+    if not parts:
+        return ""
+    return "Postgres: " + ", ".join(parts) + "."
+
+
+def _hottest_summary_line(metrics: dict[str, Any]) -> str:
+    if not metrics:
+        return ""
+    hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {}
+    if not hottest:
+        return ""
+    parts: list[str] = []
+    for key in ("cpu", "ram", "net", "io"):
+        entry = hottest.get(key) if isinstance(hottest.get(key), dict) else {}
+        node = entry.get("node")
+        value = entry.get("value")
+        if node and value is not None:
+            value_fmt = _format_metric_value(
+                str(value),
+                percent=key in ("cpu", "ram"),
+                rate=key in ("net", "io"),
+            )
+            parts.append(f"{key.upper()} {node} ({value_fmt})")
+    if not parts:
+        return ""
+    return "Hottest nodes: " + "; ".join(parts) + "."
+
+
+def cluster_overview_answer(
+    prompt: str,
+    *,
+    inventory: list[dict[str, Any]],
+    snapshot: dict[str, Any] | None,
+) -> str:
+    if not inventory and not snapshot:
+        return ""
+    q = normalize_query(prompt)
+    metrics = _snapshot_metrics(snapshot)
+    lines: list[str] = []
+
+    nodes_line = _nodes_summary_line(inventory, snapshot)
+    if nodes_line:
+        lines.append(nodes_line)
+
+    if any(word in q for word in ("hardware", "architecture", "nodes", "node", "cluster", "atlas", "titan", "lab")):
+        hw_line = _hardware_mix_line(inventory)
+        if hw_line:
+            lines.append(hw_line)
+        os_line = _os_mix_line(snapshot)
+        if os_line:
+            lines.append(os_line)
+
+    if any(
+        word in q
+        for word in (
+            "interesting",
+            "status",
+            "health",
+            "overview",
+            "summary",
+            "tell me",
+            "what do you know",
+            "about",
+            "pods",
+            "postgres",
+            "connections",
+            "hottest",
+            "cpu",
+            "ram",
+            "memory",
+            "net",
+            "network",
+            "io",
+            "disk",
+            "busy",
+            "load",
+            "usage",
+            "utilization",
+        )
+    ):
+        pods_line = _pods_summary_line(metrics)
+        if pods_line:
+            lines.append(pods_line)
+        hottest_line = _hottest_summary_line(metrics)
+        if hottest_line:
+            lines.append(hottest_line)
+        postgres_line = _postgres_summary_line(metrics)
+        if postgres_line:
+            lines.append(postgres_line)
+
+    if not lines:
+        return ""
+    return "Based on the snapshot, " + "\n".join(lines)
+
+
+def cluster_answer(
+    prompt: str,
+    *,
+    inventory: list[dict[str, Any]],
+    snapshot: dict[str, Any] | None,
+    workloads: list[dict[str, Any]] | None,
+) -> str:
+    metrics_summary = snapshot_context(prompt, snapshot)
+    structured = structured_answer(
+        prompt,
+        inventory=inventory,
+        metrics_summary=metrics_summary,
+        snapshot=snapshot,
+        workloads=workloads,
+    )
+    if structured:
+        return structured
+
+    overview = cluster_overview_answer(prompt, inventory=inventory, snapshot=snapshot)
+    if overview:
+        kb_titles = kb_retrieve_titles(prompt, limit=4) if _knowledge_intent(prompt) else ""
+        if kb_titles:
+            overview = overview + "\n" + kb_titles
+        return _format_confidence(overview, "medium")
+
+    kb_titles = kb_retrieve_titles(prompt, limit=4)
+    if kb_titles:
+        return _format_confidence(kb_titles, "low")
+
+    if metrics_summary:
+        return _format_confidence(metrics_summary, "low")
+
+    return ""
+
 def _metric_tokens(entry: dict[str, Any]) -> str:
     parts: list[str] = []
     for key in ("panel_title", "dashboard", "description"):
@@ -1868,16 +2090,24 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
                 workloads=workloads,
             )
         fallback = "I don't have enough data to answer that."
-        llm_prompt = cleaned
         if cluster_query:
-            llm_prompt = f"Atlas cluster question (use the cluster snapshot context): {cleaned}"
-        answer = ollama_reply(
-            ("http", "internal"),
-            llm_prompt,
-            context=context,
-            fallback=fallback,
-            use_history=False,
-        )
+            answer = cluster_answer(
+                cleaned,
+                inventory=inventory,
+                snapshot=snapshot,
+                workloads=workloads,
+            )
+            if not answer:
+                answer = fallback
+        else:
+            llm_prompt = cleaned
+            answer = ollama_reply(
+                ("http", "internal"),
+                llm_prompt,
+                context=context,
+                fallback=fallback,
+                use_history=False,
+            )
         self._write_json(200, {"answer": answer})
 
 
@@ -2044,6 +2274,7 @@ def _knowledge_intent(prompt: str) -> bool:
         for phrase in (
             "what do you know",
             "tell me about",
+            "interesting",
             "overview",
             "summary",
             "describe",
@@ -2312,21 +2543,30 @@ def sync_loop(token: str, room_id: str):
                     res = vm_query(promql, timeout=20)
                     rendered = vm_render_result(res, limit=15) or "(no results)"
                     extra = "VictoriaMetrics (PromQL result):\n" + rendered
-                    context = (context + "\n\n" + extra).strip() if context else extra
+                    send_msg(token, rid, extra)
+                    continue
                 fallback = "I don't have enough data to answer that."
 
-                llm_prompt = cleaned_body
                 if cluster_query:
-                    llm_prompt = f"Atlas cluster question (use the cluster snapshot context): {cleaned_body}"
-                reply = ollama_reply_with_thinking(
-                    token,
-                    rid,
-                    hist_key,
-                    llm_prompt,
-                    context=context,
-                    fallback=fallback,
-                    use_history=cluster_query,
-                )
+                    reply = cluster_answer(
+                        cleaned_body,
+                        inventory=inventory,
+                        snapshot=snapshot,
+                        workloads=workloads,
+                    )
+                    if not reply:
+                        reply = fallback
+                else:
+                    llm_prompt = cleaned_body
+                    reply = ollama_reply_with_thinking(
+                        token,
+                        rid,
+                        hist_key,
+                        llm_prompt,
+                        context=context,
+                        fallback=fallback,
+                        use_history=False,
+                    )
                 send_msg(token, rid, reply)
 
 def login_with_retry():

From 6a8731582a4aa7db30f229fc75a6b65054207de9 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 15:36:08 -0300
Subject: [PATCH 323/416] atlasbot: return structured cluster summaries

---
 services/comms/atlasbot-deployment.yaml |  2 +-
 services/comms/scripts/atlasbot/bot.py  | 18 ++++++++++++++----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 69b30e4d..06856266 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-48
+        checksum/atlasbot-configmap: manual-atlasbot-49
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index f0bf008b..e936b955 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -1268,7 +1268,17 @@ def structured_answer(
                     node_regex = "|".join([n["name"] for n in scoped])
                     expr = _apply_node_filter(expr, node_regex)
             res = vm_query(expr, timeout=20)
-            answer = _format_metric_answer(entry, res)
+            answer = ""
+            if op == "top" or "hottest" in (entry.get("panel_title") or "").lower():
+                node, val = _primary_series_metric(res)
+                if node and val is not None:
+                    percent = _metric_expr_uses_percent(entry)
+                    value_fmt = _format_metric_value(val or "", percent=percent)
+                    metric_label = (metric or "").upper()
+                    label = f"{metric_label} node" if metric_label else "node"
+                    answer = f"Hottest {label}: {node} ({value_fmt})."
+            if not answer:
+                answer = _format_metric_answer(entry, res)
             if answer:
                 scope_parts: list[str] = []
                 if include_hw:
@@ -1292,8 +1302,8 @@ def structured_answer(
                         percent = _metric_expr_uses_percent(entry)
                         base_val_fmt = _format_metric_value(base_val or "", percent=percent)
                         overall_note = f" Overall hottest node: {base_node} ({base_val_fmt})."
-                    return f"Among {scope} nodes, {answer}{overall_note}"
-                return answer
+                    return _format_confidence(f"Among {scope} nodes, {answer}{overall_note}", "high")
+                return _format_confidence(answer, "high")
         if metrics_summary:
             return metrics_summary
 
@@ -1408,7 +1418,7 @@ def _os_mix_line(snapshot: dict[str, Any] | None) -> str:
         os_name = (node.get("os") or "").strip()
         if os_name:
             counts[os_name] += 1
-    if not counts:
+    if not counts or (len(counts) == 1 and "linux" in counts):
         return ""
     parts = [f"{os_name}={count}" for os_name, count in sorted(counts.items(), key=lambda item: (-item[1], item[0]))]
     return "OS mix: " + ", ".join(parts[:5]) + "."

From 8d467bc12f0ad30c38f2d213a786fe682b6cbad0 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 15:42:31 -0300
Subject: [PATCH 324/416] atlasbot: improve workload matching and fallbacks

---
 services/comms/atlasbot-deployment.yaml |  2 +-
 services/comms/scripts/atlasbot/bot.py  | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 06856266..bccf752b 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-49
+        checksum/atlasbot-configmap: manual-atlasbot-50
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index e936b955..34e27cf9 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -1031,6 +1031,12 @@ def _workload_tokens(entry: dict[str, Any]) -> set[str]:
     return tokens
 
 
+def _workload_query_target(prompt: str) -> str:
+    tokens = set(_tokens(prompt))
+    matches = sorted(tokens & _NAME_INDEX) if _NAME_INDEX else []
+    return matches[0] if matches else ""
+
+
 def _select_workload(prompt: str, workloads: list[dict[str, Any]]) -> dict[str, Any] | None:
     q_tokens = set(_tokens(prompt))
     if not q_tokens:
@@ -1041,6 +1047,12 @@ def _select_workload(prompt: str, workloads: list[dict[str, Any]]) -> dict[str,
             continue
         tokens = _workload_tokens(entry)
         score = len(tokens & q_tokens)
+        name = (entry.get("workload") or "").lower()
+        namespace = (entry.get("namespace") or "").lower()
+        if name and name in q_tokens:
+            score += 5
+        if namespace and namespace in q_tokens:
+            score += 3
         if score:
             scored.append((score, entry))
     if not scored:
@@ -1574,6 +1586,14 @@ def cluster_answer(
     if structured:
         return structured
 
+    q = normalize_query(prompt)
+    workload_target = _workload_query_target(prompt)
+    if workload_target and any(word in q for word in ("where", "run", "running", "host", "node")):
+        return _format_confidence(
+            f"I don't have workload placement data for {workload_target} in the current snapshot.",
+            "low",
+        )
+
     overview = cluster_overview_answer(prompt, inventory=inventory, snapshot=snapshot)
     if overview:
         kb_titles = kb_retrieve_titles(prompt, limit=4) if _knowledge_intent(prompt) else ""

From d8657c551f3a17b65fcd0c1dc56f5199d8695ee7 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 15:45:18 -0300
Subject: [PATCH 325/416] atlasbot: avoid namespace-only workload matches

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 services/comms/scripts/atlasbot/bot.py  | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index bccf752b..301a4746 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-50
+        checksum/atlasbot-configmap: manual-atlasbot-51
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 34e27cf9..d36844bc 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -1071,11 +1071,17 @@ def workload_answer(prompt: str, workloads: list[dict[str, Any]]) -> str:
     q = normalize_query(prompt)
     if not any(word in q for word in ("where", "which", "node", "run", "running", "host", "located")):
         return ""
+    target = _workload_query_target(prompt)
     entry = _select_workload(prompt, workloads)
     if not entry:
         return ""
     workload = entry.get("workload") or ""
     namespace = entry.get("namespace") or ""
+    if target:
+        workload_l = str(workload).lower()
+        namespace_l = str(namespace).lower()
+        if workload_l != target and namespace_l == target and "namespace" not in q and "workload" not in q:
+            return ""
     nodes = entry.get("nodes") if isinstance(entry.get("nodes"), dict) else {}
     primary = entry.get("primary_node") or ""
     if not workload or not nodes:

From f241189fab40ac3632d981d4b50524c011ad23d3 Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Tue, 27 Jan 2026 18:57:30 +0000
Subject: [PATCH 326/416] chore(maintenance): automated image update

---
 services/maintenance/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index e4580aae..a1ca5831 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -26,7 +26,7 @@ resources:
   - image-sweeper-cronjob.yaml
 images:
   - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-58 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-59 # {"$imagepolicy": "maintenance:ariadne:tag"}
 configMapGenerator:
   - name: disable-k3s-traefik-script
     namespace: maintenance

From 0fbbbf39e9e221051bc9ac1ecf654a5ee50b79e7 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 16:19:30 -0300
Subject: [PATCH 327/416] monitoring: fix jetson gpu metrics

---
 scripts/dashboards_render_atlas.py            |  9 ++++++-
 services/monitoring/dashboards/atlas-gpu.json |  2 +-
 .../monitoring/grafana-dashboard-gpu.yaml     |  2 +-
 .../jetson-tegrastats-exporter.yaml           |  2 +-
 .../scripts/jetson_tegrastats_exporter.py     | 25 +++++++++++++------
 5 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index 675fec52..6ad43218 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -221,6 +221,13 @@ def jetson_gpu_util_by_node():
     return 'max by (node) (jetson_gr3d_freq_percent{node!=""})'
 
 
+def jetson_gpu_util_by_hostname():
+    return (
+        'label_replace(max by (node) (jetson_gr3d_freq_percent{node!=""}), '
+        '"Hostname", "$1", "node", "(.*)")'
+    )
+
+
 def jetson_gpu_requests(scope_var):
     return (
         "sum by (namespace,node) ("
@@ -2688,7 +2695,7 @@ def build_gpu_dashboard():
         timeseries_panel(
             3,
             "GPU Util by Node",
-            'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})',
+            f'(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{{pod!=""}})) or ({jetson_gpu_util_by_hostname()})',
             {"h": 8, "w": 12, "x": 0, "y": 8},
             unit="percent",
             legend="{{Hostname}}",
diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json
index 6b76a5c2..36ab9e5f 100644
--- a/services/monitoring/dashboards/atlas-gpu.json
+++ b/services/monitoring/dashboards/atlas-gpu.json
@@ -126,7 +126,7 @@
       },
       "targets": [
         {
-          "expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})",
+          "expr": "(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})) or (label_replace(max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\"))",
           "refId": "A",
           "legendFormat": "{{Hostname}}"
         }
diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml
index 46b25cd0..bb395dbf 100644
--- a/services/monitoring/grafana-dashboard-gpu.yaml
+++ b/services/monitoring/grafana-dashboard-gpu.yaml
@@ -135,7 +135,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})",
+              "expr": "(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})) or (label_replace(max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\"))",
               "refId": "A",
               "legendFormat": "{{Hostname}}"
             }
diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml
index 8584ebaa..00743943 100644
--- a/services/monitoring/jetson-tegrastats-exporter.yaml
+++ b/services/monitoring/jetson-tegrastats-exporter.yaml
@@ -17,7 +17,7 @@ spec:
       annotations:
         prometheus.io/scrape: "true"
         prometheus.io/port: "9100"
-        monitoring.bstein.dev/restart-rev: "1"
+        monitoring.bstein.dev/restart-rev: "2"
     spec:
       serviceAccountName: default
       hostPID: true
diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py
index c237ec5d..3858d969 100644
--- a/services/monitoring/scripts/jetson_tegrastats_exporter.py
+++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py
@@ -4,7 +4,7 @@ import re
 import socketserver
 import subprocess
 import threading
-from time import time
+from time import sleep, time
 
 PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100"))
 NODE_NAME = os.environ.get("NODE_NAME") or os.uname().nodename
@@ -20,6 +20,7 @@ METRICS = {
 LOCK = threading.Lock()
 
 def parse_line(line: str):
+    line = line.strip()
     updates = {}
     m = re.search(r"GR3D_FREQ\\s+(\\d+)%", line)
     if m:
@@ -34,7 +35,7 @@ def parse_line(line: str):
     if m:
         updates["ram_used_mb"] = float(m.group(1))
         updates["ram_total_mb"] = float(m.group(2))
-    m = re.search(r"POM_5V_IN\\s+(\\d+)/(\\d+)", line)
+    m = re.search(r"(?:POM_5V_IN|VDD_IN)\\s+(\\d+)/(\\d+)", line)
     if m:
         updates["power_5v_in_mw"] = float(m.group(1))
     with LOCK:
@@ -42,15 +43,23 @@ def parse_line(line: str):
         METRICS["last_scrape_ts"] = time()
 
 def run_tegrastats():
-    proc = subprocess.Popen(
-        ["/host/usr/bin/tegrastats", "--interval", "1000"],
-        stdout=subprocess.PIPE,
+    logfile = "/tmp/tegrastats.log"
+    subprocess.Popen(
+        ["/host/usr/bin/tegrastats", "--interval", "1000", "--logfile", logfile],
+        stdout=subprocess.DEVNULL,
         stderr=subprocess.STDOUT,
         text=True,
-        bufsize=1,
     )
-    for line in proc.stdout:
-        parse_line(line)
+    while not os.path.exists(logfile):
+        sleep(0.1)
+    with open(logfile, "r", encoding="utf-8", errors="ignore") as handle:
+        handle.seek(0, os.SEEK_END)
+        while True:
+            line = handle.readline()
+            if not line:
+                sleep(0.2)
+                continue
+            parse_line(line)
 
 class Handler(http.server.BaseHTTPRequestHandler):
     def do_GET(self):

From eb809524b50e822232e1f067ac327cfd1122f168 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 16:23:23 -0300
Subject: [PATCH 328/416] monitoring: refresh jetson stats on scrape

---
 .../jetson-tegrastats-exporter.yaml           |  2 +-
 .../scripts/jetson_tegrastats_exporter.py     | 37 +++++++++++--------
 2 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml
index 00743943..a6612c66 100644
--- a/services/monitoring/jetson-tegrastats-exporter.yaml
+++ b/services/monitoring/jetson-tegrastats-exporter.yaml
@@ -17,7 +17,7 @@ spec:
       annotations:
         prometheus.io/scrape: "true"
         prometheus.io/port: "9100"
-        monitoring.bstein.dev/restart-rev: "2"
+        monitoring.bstein.dev/restart-rev: "3"
     spec:
       serviceAccountName: default
       hostPID: true
diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py
index 3858d969..4cbf6ca3 100644
--- a/services/monitoring/scripts/jetson_tegrastats_exporter.py
+++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py
@@ -4,10 +4,11 @@ import re
 import socketserver
 import subprocess
 import threading
-from time import sleep, time
+from time import time
 
 PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100"))
 NODE_NAME = os.environ.get("NODE_NAME") or os.uname().nodename
+LOGFILE = "/tmp/tegrastats.log"
 METRICS = {
     "gr3d_freq_percent": 0.0,
     "gpu_temp_c": 0.0,
@@ -42,24 +43,28 @@ def parse_line(line: str):
         METRICS.update(updates)
         METRICS["last_scrape_ts"] = time()
 
-def run_tegrastats():
-    logfile = "/tmp/tegrastats.log"
+def start_tegrastats():
     subprocess.Popen(
-        ["/host/usr/bin/tegrastats", "--interval", "1000", "--logfile", logfile],
+        ["/host/usr/bin/tegrastats", "--interval", "1000", "--logfile", LOGFILE],
         stdout=subprocess.DEVNULL,
         stderr=subprocess.STDOUT,
         text=True,
     )
-    while not os.path.exists(logfile):
-        sleep(0.1)
-    with open(logfile, "r", encoding="utf-8", errors="ignore") as handle:
-        handle.seek(0, os.SEEK_END)
-        while True:
-            line = handle.readline()
-            if not line:
-                sleep(0.2)
-                continue
-            parse_line(line)
+
+
+def refresh_from_log():
+    if not os.path.exists(LOGFILE):
+        return
+    try:
+        with open(LOGFILE, "rb") as handle:
+            handle.seek(0, os.SEEK_END)
+            size = handle.tell()
+            handle.seek(max(size - 4096, 0), os.SEEK_SET)
+            tail = handle.read().decode("utf-8", errors="ignore").splitlines()
+            if tail:
+                parse_line(tail[-1])
+    except OSError:
+        return
 
 class Handler(http.server.BaseHTTPRequestHandler):
     def do_GET(self):
@@ -67,6 +72,7 @@ class Handler(http.server.BaseHTTPRequestHandler):
             self.send_response(404)
             self.end_headers()
             return
+        refresh_from_log()
         with LOCK:
             metrics = METRICS.copy()
         out = []
@@ -85,7 +91,6 @@ class Handler(http.server.BaseHTTPRequestHandler):
         return
 
 if __name__ == "__main__":
-    t = threading.Thread(target=run_tegrastats, daemon=True)
-    t.start()
+    start_tegrastats()
     with socketserver.TCPServer(("", PORT), Handler) as httpd:
         httpd.serve_forever()

From 3b2029056162faea54d6ba9a33f2ba72204a5b28 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 16:27:45 -0300
Subject: [PATCH 329/416] monitoring: read jetson stats on demand

---
 .../jetson-tegrastats-exporter.yaml           |  2 +-
 .../scripts/jetson_tegrastats_exporter.py     | 27 +++++++++----------
 2 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml
index a6612c66..d80d83eb 100644
--- a/services/monitoring/jetson-tegrastats-exporter.yaml
+++ b/services/monitoring/jetson-tegrastats-exporter.yaml
@@ -17,7 +17,7 @@ spec:
       annotations:
         prometheus.io/scrape: "true"
         prometheus.io/port: "9100"
-        monitoring.bstein.dev/restart-rev: "3"
+        monitoring.bstein.dev/restart-rev: "4"
     spec:
       serviceAccountName: default
       hostPID: true
diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py
index 4cbf6ca3..204e439c 100644
--- a/services/monitoring/scripts/jetson_tegrastats_exporter.py
+++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py
@@ -3,13 +3,12 @@ import os
 import re
 import socketserver
 import subprocess
-import threading
 from time import time
 
 PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100"))
 NODE_NAME = os.environ.get("NODE_NAME") or os.uname().nodename
 LOGFILE = "/tmp/tegrastats.log"
-METRICS = {
+BASE_METRICS = {
     "gr3d_freq_percent": 0.0,
     "gpu_temp_c": 0.0,
     "cpu_temp_c": 0.0,
@@ -18,9 +17,8 @@ METRICS = {
     "power_5v_in_mw": 0.0,
     "last_scrape_ts": 0.0,
 }
-LOCK = threading.Lock()
 
-def parse_line(line: str):
+def parse_line(line: str) -> dict:
     line = line.strip()
     updates = {}
     m = re.search(r"GR3D_FREQ\\s+(\\d+)%", line)
@@ -39,9 +37,7 @@ def parse_line(line: str):
     m = re.search(r"(?:POM_5V_IN|VDD_IN)\\s+(\\d+)/(\\d+)", line)
     if m:
         updates["power_5v_in_mw"] = float(m.group(1))
-    with LOCK:
-        METRICS.update(updates)
-        METRICS["last_scrape_ts"] = time()
+    return updates
 
 def start_tegrastats():
     subprocess.Popen(
@@ -52,19 +48,18 @@ def start_tegrastats():
     )
 
 
-def refresh_from_log():
+def read_latest_line() -> str:
     if not os.path.exists(LOGFILE):
-        return
+        return ""
     try:
         with open(LOGFILE, "rb") as handle:
             handle.seek(0, os.SEEK_END)
             size = handle.tell()
             handle.seek(max(size - 4096, 0), os.SEEK_SET)
             tail = handle.read().decode("utf-8", errors="ignore").splitlines()
-            if tail:
-                parse_line(tail[-1])
+            return tail[-1] if tail else ""
     except OSError:
-        return
+        return ""
 
 class Handler(http.server.BaseHTTPRequestHandler):
     def do_GET(self):
@@ -72,9 +67,11 @@ class Handler(http.server.BaseHTTPRequestHandler):
             self.send_response(404)
             self.end_headers()
             return
-        refresh_from_log()
-        with LOCK:
-            metrics = METRICS.copy()
+        metrics = BASE_METRICS.copy()
+        line = read_latest_line()
+        if line:
+            metrics.update(parse_line(line))
+        metrics["last_scrape_ts"] = time()
         out = []
         label = f'{{node="{NODE_NAME}"}}'
         for k, v in metrics.items():

From aacfc8f28ca6c793111eb402bafa61ab5a05d245 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 16:34:31 -0300
Subject: [PATCH 330/416] monitoring: read tegrastats per scrape

---
 .../jetson-tegrastats-exporter.yaml           |  2 +-
 .../scripts/jetson_tegrastats_exporter.py     | 32 ++++++++-----------
 2 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml
index d80d83eb..36799388 100644
--- a/services/monitoring/jetson-tegrastats-exporter.yaml
+++ b/services/monitoring/jetson-tegrastats-exporter.yaml
@@ -17,7 +17,7 @@ spec:
       annotations:
         prometheus.io/scrape: "true"
         prometheus.io/port: "9100"
-        monitoring.bstein.dev/restart-rev: "4"
+        monitoring.bstein.dev/restart-rev: "5"
     spec:
       serviceAccountName: default
       hostPID: true
diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py
index 204e439c..8314ad72 100644
--- a/services/monitoring/scripts/jetson_tegrastats_exporter.py
+++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py
@@ -7,7 +7,6 @@ from time import time
 
 PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100"))
 NODE_NAME = os.environ.get("NODE_NAME") or os.uname().nodename
-LOGFILE = "/tmp/tegrastats.log"
 BASE_METRICS = {
     "gr3d_freq_percent": 0.0,
     "gpu_temp_c": 0.0,
@@ -39,25 +38,21 @@ def parse_line(line: str) -> dict:
         updates["power_5v_in_mw"] = float(m.group(1))
     return updates
 
-def start_tegrastats():
-    subprocess.Popen(
-        ["/host/usr/bin/tegrastats", "--interval", "1000", "--logfile", LOGFILE],
-        stdout=subprocess.DEVNULL,
-        stderr=subprocess.STDOUT,
-        text=True,
-    )
-
-
 def read_latest_line() -> str:
-    if not os.path.exists(LOGFILE):
-        return ""
     try:
-        with open(LOGFILE, "rb") as handle:
-            handle.seek(0, os.SEEK_END)
-            size = handle.tell()
-            handle.seek(max(size - 4096, 0), os.SEEK_SET)
-            tail = handle.read().decode("utf-8", errors="ignore").splitlines()
-            return tail[-1] if tail else ""
+        proc = subprocess.Popen(
+            ["/host/usr/bin/tegrastats", "--interval", "1000"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+        )
+        line = proc.stdout.readline()
+        proc.terminate()
+        try:
+            proc.wait(timeout=1)
+        except subprocess.TimeoutExpired:
+            proc.kill()
+        return line
     except OSError:
         return ""
 
@@ -88,6 +83,5 @@ class Handler(http.server.BaseHTTPRequestHandler):
         return
 
 if __name__ == "__main__":
-    start_tegrastats()
     with socketserver.TCPServer(("", PORT), Handler) as httpd:
         httpd.serve_forever()

From 0a64708b3d3f54b535529da9f4d95927a5b3c419 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 16:38:09 -0300
Subject: [PATCH 331/416] monitoring: expose jetson scrape line length

---
 services/monitoring/jetson-tegrastats-exporter.yaml       | 2 +-
 services/monitoring/scripts/jetson_tegrastats_exporter.py | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml
index 36799388..6b0ce376 100644
--- a/services/monitoring/jetson-tegrastats-exporter.yaml
+++ b/services/monitoring/jetson-tegrastats-exporter.yaml
@@ -17,7 +17,7 @@ spec:
       annotations:
         prometheus.io/scrape: "true"
         prometheus.io/port: "9100"
-        monitoring.bstein.dev/restart-rev: "5"
+        monitoring.bstein.dev/restart-rev: "6"
     spec:
       serviceAccountName: default
       hostPID: true
diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py
index 8314ad72..284d5ce3 100644
--- a/services/monitoring/scripts/jetson_tegrastats_exporter.py
+++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py
@@ -14,6 +14,7 @@ BASE_METRICS = {
     "ram_used_mb": 0.0,
     "ram_total_mb": 0.0,
     "power_5v_in_mw": 0.0,
+    "log_line_len": 0.0,
     "last_scrape_ts": 0.0,
 }
 
@@ -33,7 +34,7 @@ def parse_line(line: str) -> dict:
     if m:
         updates["ram_used_mb"] = float(m.group(1))
         updates["ram_total_mb"] = float(m.group(2))
-    m = re.search(r"(?:POM_5V_IN|VDD_IN)\\s+(\\d+)/(\\d+)", line)
+    m = re.search(r"(?:POM_5V_IN|VDD_IN)\\s+(\\d+)(?:mW)?/(\\d+)(?:mW)?", line)
     if m:
         updates["power_5v_in_mw"] = float(m.group(1))
     return updates
@@ -66,6 +67,7 @@ class Handler(http.server.BaseHTTPRequestHandler):
         line = read_latest_line()
         if line:
             metrics.update(parse_line(line))
+            metrics["log_line_len"] = float(len(line))
         metrics["last_scrape_ts"] = time()
         out = []
         label = f'{{node="{NODE_NAME}"}}'

From c0073b08ccaee0e14e54133914d4f58855202937 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 16:44:00 -0300
Subject: [PATCH 332/416] monitoring: fix tegrastats regexes

---
 services/monitoring/jetson-tegrastats-exporter.yaml    |  2 +-
 .../monitoring/scripts/jetson_tegrastats_exporter.py   | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/services/monitoring/jetson-tegrastats-exporter.yaml b/services/monitoring/jetson-tegrastats-exporter.yaml
index 6b0ce376..ba25c9fd 100644
--- a/services/monitoring/jetson-tegrastats-exporter.yaml
+++ b/services/monitoring/jetson-tegrastats-exporter.yaml
@@ -17,7 +17,7 @@ spec:
       annotations:
         prometheus.io/scrape: "true"
         prometheus.io/port: "9100"
-        monitoring.bstein.dev/restart-rev: "6"
+        monitoring.bstein.dev/restart-rev: "7"
     spec:
       serviceAccountName: default
       hostPID: true
diff --git a/services/monitoring/scripts/jetson_tegrastats_exporter.py b/services/monitoring/scripts/jetson_tegrastats_exporter.py
index 284d5ce3..8b361111 100644
--- a/services/monitoring/scripts/jetson_tegrastats_exporter.py
+++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py
@@ -21,20 +21,20 @@ BASE_METRICS = {
 def parse_line(line: str) -> dict:
     line = line.strip()
     updates = {}
-    m = re.search(r"GR3D_FREQ\\s+(\\d+)%", line)
+    m = re.search(r"GR3D_FREQ\s+(\d+)%", line)
     if m:
         updates["gr3d_freq_percent"] = float(m.group(1))
-    m = re.search(r"GPU@(\\d+(?:\\.\\d+)?)C", line)
+    m = re.search(r"GPU@(\d+(?:\.\d+)?)C", line)
     if m:
         updates["gpu_temp_c"] = float(m.group(1))
-    m = re.search(r"CPU@(\\d+(?:\\.\\d+)?)C", line)
+    m = re.search(r"CPU@(\d+(?:\.\d+)?)C", line)
     if m:
         updates["cpu_temp_c"] = float(m.group(1))
-    m = re.search(r"RAM\\s+(\\d+)/(\\d+)MB", line)
+    m = re.search(r"RAM\s+(\d+)/(\d+)MB", line)
     if m:
         updates["ram_used_mb"] = float(m.group(1))
         updates["ram_total_mb"] = float(m.group(2))
-    m = re.search(r"(?:POM_5V_IN|VDD_IN)\\s+(\\d+)(?:mW)?/(\\d+)(?:mW)?", line)
+    m = re.search(r"(?:POM_5V_IN|VDD_IN)\s+(\d+)(?:mW)?/(\d+)(?:mW)?", line)
     if m:
         updates["power_5v_in_mw"] = float(m.group(1))
     return updates

From 86cd5194ead988a7ecc30a96681432c594eb54f6 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 17:51:13 -0300
Subject: [PATCH 333/416] monitoring: fix gpu idle share

---
 scripts/dashboards_render_atlas.py                  | 2 +-
 services/monitoring/dashboards/atlas-gpu.json       | 2 +-
 services/monitoring/dashboards/atlas-overview.json  | 2 +-
 services/monitoring/grafana-dashboard-gpu.yaml      | 2 +-
 services/monitoring/grafana-dashboard-overview.yaml | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index 6ad43218..34ded89e 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -266,7 +266,7 @@ def namespace_gpu_share_expr(scope_var):
     usage = namespace_gpu_usage_instant(scope_var)
     total = f"(sum({usage}) or on() vector(0))"
     share = f"100 * ({usage}) / clamp_min({total}, 1)"
-    idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)"
+    idle = 'label_replace(vector(100), "namespace", "idle", "", "") * on() (' + total + " == bool 0)"
     return f"({share}) or ({idle})"
 
 
diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json
index 36ab9e5f..f6801aa6 100644
--- a/services/monitoring/dashboards/atlas-gpu.json
+++ b/services/monitoring/dashboards/atlas-gpu.json
@@ -20,7 +20,7 @@
       },
       "targets": [
         {
-          "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == 0))",
+          "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 04352f93..1a507ece 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -1901,7 +1901,7 @@
       },
       "targets": [
         {
-          "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == 0))",
+          "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml
index bb395dbf..dc1025b6 100644
--- a/services/monitoring/grafana-dashboard-gpu.yaml
+++ b/services/monitoring/grafana-dashboard-gpu.yaml
@@ -29,7 +29,7 @@ data:
           },
           "targets": [
             {
-              "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum 
by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == 0))",
+              "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 9495647f..ed63da05 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -1910,7 +1910,7 @@ data:
           },
           "targets": [
             {
-              "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum 
by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == 0))",
+              "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }

From 0d4e1cac700d98c22f20697aaf187821f5a31418 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 18:08:19 -0300
Subject: [PATCH 334/416] atlasbot: make cluster answers more narrative

---
 services/comms/scripts/atlasbot/bot.py | 196 +++++++++++++++++++++----
 1 file changed, 165 insertions(+), 31 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index d36844bc..0dcfc606 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -181,6 +181,27 @@ CLUSTER_HINT_WORDS = {
     "arm64",
 }
 
+_INSIGHT_HINT_WORDS = {  # substring hints tested by _is_insight_query
+    "interesting",
+    "unconventional",
+    "surprising",
+    "weird",
+    "odd",
+    "fun",
+    "cool",
+    "unique",
+    "notable",
+}
+
+_OVERVIEW_HINT_WORDS = {  # substring phrases tested by _is_overview_query
+    "overview",
+    "summary",
+    "describe",
+    "explain",
+    "tell me about",
+    "what do you know",
+}
+
 _OLLAMA_LOCK = threading.Lock()
 
 HARDWARE_HINTS = {
@@ -1408,7 +1429,18 @@ def _nodes_summary_line(inventory: list[dict[str, Any]], snapshot: dict[str, Any
         not_ready = len([n for n in inventory if n.get("ready") is False])
     if total is None:
         return ""
-    return f"Atlas cluster has {total} nodes ({ready} ready, {not_ready} not ready)."
+    if not_ready:
+        names = []
+        summary_names = summary.get("not_ready_names") if isinstance(summary, dict) else []
+        if isinstance(summary_names, list):
+            names = [name for name in summary_names if isinstance(name, str)]
+        if not names and snapshot:
+            details = snapshot.get("nodes_detail") if isinstance(snapshot.get("nodes_detail"), list) else []
+            names = [node.get("name") for node in details if isinstance(node, dict) and node.get("ready") is False]
+        names = [name for name in names if isinstance(name, str) and name]
+        suffix = f" (not ready: {', '.join(names)})" if names else ""
+        return f"Atlas has {total} nodes; {ready} ready, {not_ready} not ready{suffix}."
+    return f"Atlas has {total} nodes and all are Ready."
 
 
 def _hardware_mix_line(inventory: list[dict[str, Any]]) -> str:
@@ -1422,7 +1454,7 @@ def _hardware_mix_line(inventory: list[dict[str, Any]]) -> str:
             parts.append(f"{key}={len(nodes)}")
     if not parts:
         return ""
-    return "Hardware mix: " + ", ".join(parts) + "."
+    return "Hardware mix includes " + ", ".join(parts) + "."
 
 
 def _os_mix_line(snapshot: dict[str, Any] | None) -> str:
@@ -1449,6 +1481,8 @@ def _pods_summary_line(metrics: dict[str, Any]) -> str:
     pending = metrics.get("pods_pending")
     failed = metrics.get("pods_failed")
     succeeded = metrics.get("pods_succeeded")
+    if running is None and pending is None and failed is None and succeeded is None:
+        return ""
     parts: list[str] = []
     if running is not None:
         parts.append(f"{running:.0f} running")
@@ -1458,9 +1492,7 @@ def _pods_summary_line(metrics: dict[str, Any]) -> str:
         parts.append(f"{failed:.0f} failed")
     if succeeded is not None:
         parts.append(f"{succeeded:.0f} succeeded")
-    if not parts:
-        return ""
-    return "Pods: " + ", ".join(parts) + "."
+    return "There are " + ", ".join(parts) + " pods."
 
 
 def _postgres_summary_line(metrics: dict[str, Any]) -> str:
@@ -1481,7 +1513,7 @@ def _postgres_summary_line(metrics: dict[str, Any]) -> str:
         parts.append(f"hottest {hottest.get('label')} ({hot_val_str})")
     if not parts:
         return ""
-    return "Postgres: " + ", ".join(parts) + "."
+    return "Postgres is at " + ", ".join(parts) + "."
 
 
 def _hottest_summary_line(metrics: dict[str, Any]) -> str:
@@ -1504,7 +1536,101 @@ def _hottest_summary_line(metrics: dict[str, Any]) -> str:
             parts.append(f"{key.upper()} {node} ({value_fmt})")
     if not parts:
         return ""
-    return "Hottest nodes: " + "; ".join(parts) + "."
+    return "Hot spots: " + "; ".join(parts) + "."
+
+
+def _is_insight_query(query: str) -> bool:
+    """Return True when the normalized query contains an insight hint."""
+    text = normalize_query(query)
+    if not text:
+        return False
+    if any(hint in text for hint in _INSIGHT_HINT_WORDS):
+        return True
+    oddities = ("unusual", "odd", "weird", "unconventional")
+    return "most" in text and any(term in text for term in oddities)
+
+
+def _is_overview_query(query: str) -> bool:
+    """Return True when the normalized query contains an overview hint phrase."""
+    text = normalize_query(query)
+    # An empty query never matches; bool() keeps the result a strict bool.
+    return bool(text) and any(phrase in text for phrase in _OVERVIEW_HINT_WORDS)
+
+
+def _doc_intent(query: str) -> bool:
+    """Return True when the normalized query mentions docs or how-to phrasing."""
+    text = normalize_query(query)
+    if not text:
+        return False
+    doc_phrases = (
+        "runbook",
+        "documentation",
+        "docs",
+        "guide",
+        "how do i",
+        "how to",
+        "instructions",
+        "playbook",
+    )
+    # Plain substring test, so a phrase also matches inside a longer word.
+    return any(phrase in text for phrase in doc_phrases)
+
+
+def _insight_candidates(
+    inventory: list[dict[str, Any]],
+    snapshot: dict[str, Any] | None,
+) -> list[tuple[str, str, str]]:
+    """Collect (key, sentence, confidence) insight candidates from a snapshot.
+
+    Order matters: _select_insight falls back to the first candidate.
+    """
+    metrics = _snapshot_metrics(snapshot)
+    found: list[tuple[str, str, str]] = []
+
+    availability = _nodes_summary_line(inventory, snapshot)
+    if availability and "not ready" in availability.lower():
+        found.append(("availability", availability, "high"))
+
+    # A missing or non-dict hottest_nodes yields no CPU/RAM candidates.
+    hot_raw = metrics.get("hottest_nodes")
+    hot = hot_raw if isinstance(hot_raw, dict) else {}
+    cpu = hot.get("cpu") if isinstance(hot.get("cpu"), dict) else {}
+    if cpu.get("node") and cpu.get("value") is not None:
+        cpu_pct = _format_metric_value(str(cpu.get("value")), percent=True)
+        found.append(("cpu", f"The busiest CPU right now is {cpu.get('node')} at about {cpu_pct}.", "high"))
+    ram = hot.get("ram") if isinstance(hot.get("ram"), dict) else {}
+    if ram.get("node") and ram.get("value") is not None:
+        ram_pct = _format_metric_value(str(ram.get("value")), percent=True)
+        found.append(("ram", f"RAM usage peaks on {ram.get('node')} at about {ram_pct}.", "high"))
+
+    summaries = (
+        ("postgres", _postgres_summary_line(metrics), "high"),
+        ("hardware", _hardware_mix_line(inventory), "medium"),
+        ("pods", _pods_summary_line(metrics), "high"),
+    )
+    # Keep only the summaries that produced text, in the order listed above.
+    found.extend(item for item in summaries if item[1])
+    return found
+
+
+def _select_insight(
+    prompt: str,
+    candidates: list[tuple[str, str, str]],
+) -> tuple[str, str] | None:
+    """Pick one (sentence, confidence) pair from candidates, or None if empty."""
+    if not candidates:
+        return None
+    text = normalize_query(prompt)
+    oddity = any(w in text for w in ("unconventional", "weird", "odd", "unique", "surprising"))
+    wants_other = any(w in text for w in ("another", "else", "different", "other"))
+    if wants_other and len(candidates) > 1:
+        # Follow-up style prompts get the second candidate instead of the first.
+        return candidates[1][1], candidates[1][2]
+    preferred = ("hardware", "availability") if oddity else ()
+    # First candidate with a preferred key wins; otherwise use the head.
+    chosen = next((c for c in candidates if c[0] in preferred), candidates[0])
+    _key, sentence, confidence = chosen
+    return sentence, confidence
 
 
 def cluster_overview_answer(
@@ -1517,31 +1643,21 @@ def cluster_overview_answer(
         return ""
     q = normalize_query(prompt)
     metrics = _snapshot_metrics(snapshot)
-    lines: list[str] = []
+    sentences: list[str] = []
 
     nodes_line = _nodes_summary_line(inventory, snapshot)
     if nodes_line:
-        lines.append(nodes_line)
+        sentences.append(nodes_line)
 
-    if any(word in q for word in ("hardware", "architecture", "nodes", "node", "cluster", "atlas", "titan", "lab")):
-        hw_line = _hardware_mix_line(inventory)
-        if hw_line:
-            lines.append(hw_line)
-        os_line = _os_mix_line(snapshot)
-        if os_line:
-            lines.append(os_line)
-
-    if any(
+    wants_overview = _is_overview_query(q) or any(word in q for word in ("atlas", "cluster", "titan", "lab"))
+    wants_hardware = any(word in q for word in ("hardware", "architecture", "nodes", "node")) or wants_overview
+    wants_metrics = any(
         word in q
         for word in (
-            "interesting",
             "status",
             "health",
             "overview",
             "summary",
-            "tell me",
-            "what do you know",
-            "about",
             "pods",
             "postgres",
             "connections",
@@ -1558,20 +1674,32 @@ def cluster_overview_answer(
             "usage",
             "utilization",
         )
-    ):
+    ) or wants_overview
+
+    if wants_hardware:
+        hw_line = _hardware_mix_line(inventory)
+        if hw_line:
+            sentences.append(hw_line)
+        os_line = _os_mix_line(snapshot)
+        if os_line:
+            sentences.append(os_line)
+
+    if wants_metrics:
         pods_line = _pods_summary_line(metrics)
         if pods_line:
-            lines.append(pods_line)
-        hottest_line = _hottest_summary_line(metrics)
-        if hottest_line:
-            lines.append(hottest_line)
+            sentences.append(pods_line)
         postgres_line = _postgres_summary_line(metrics)
         if postgres_line:
-            lines.append(postgres_line)
+            sentences.append(postgres_line)
+        hottest_line = _hottest_summary_line(metrics)
+        if hottest_line:
+            sentences.append(hottest_line)
 
-    if not lines:
+    if not sentences:
         return ""
-    return "Based on the snapshot, " + "\n".join(lines)
+    if len(sentences) > 3 and not wants_overview:
+        sentences = sentences[:3]
+    return "Based on the latest snapshot, " + " ".join(sentences)
 
 
 def cluster_answer(
@@ -1582,6 +1710,12 @@ def cluster_answer(
     workloads: list[dict[str, Any]] | None,
 ) -> str:
     metrics_summary = snapshot_context(prompt, snapshot)
+    if _is_insight_query(prompt):
+        candidates = _insight_candidates(inventory, snapshot)
+        selected = _select_insight(prompt, candidates)
+        if selected:
+            text, confidence = selected
+            return _format_confidence(text, confidence)
     structured = structured_answer(
         prompt,
         inventory=inventory,
@@ -1602,7 +1736,7 @@ def cluster_answer(
 
     overview = cluster_overview_answer(prompt, inventory=inventory, snapshot=snapshot)
     if overview:
-        kb_titles = kb_retrieve_titles(prompt, limit=4) if _knowledge_intent(prompt) else ""
+        kb_titles = kb_retrieve_titles(prompt, limit=4) if _doc_intent(prompt) else ""
         if kb_titles:
             overview = overview + "\n" + kb_titles
         return _format_confidence(overview, "medium")

From 59979a48e52a8553f16e0e8385e98206105c766e Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 18:10:30 -0300
Subject: [PATCH 335/416] comms: roll atlasbot after bot updates

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 301a4746..817e9361 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-51
+        checksum/atlasbot-configmap: manual-atlasbot-52
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From a39440e8721d07231e116bb01fe6bf6f575804dd Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Tue, 27 Jan 2026 21:11:24 +0000
Subject: [PATCH 336/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index bb9e5f09..68eea2cb 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -22,7 +22,7 @@ images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
     newTag: 0.1.1-161 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-161 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
   - name: chat-ai-gateway
     namespace: bstein-dev-home

From 7d94896bafa96303df61039b01272b4508ae87af Mon Sep 17 00:00:00 2001
From: flux-bot <ops@bstein.dev>
Date: Tue, 27 Jan 2026 21:11:27 +0000
Subject: [PATCH 337/416] chore(bstein-dev-home): automated image update

---
 services/bstein-dev-home/kustomization.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index 68eea2cb..a8132417 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -20,7 +20,7 @@ resources:
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-161 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
   - name: registry.bstein.dev/bstein/bstein-dev-home-backend
     newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:

From e05b627c71a97b8ec96c3bf50505758631ede33c Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 18:17:29 -0300
Subject: [PATCH 338/416] atlasbot: add narrative insights

---
 services/comms/scripts/atlasbot/bot.py | 50 ++++++++++++++++++++++----
 1 file changed, 43 insertions(+), 7 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 0dcfc606..ada8dd7f 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -1616,7 +1616,7 @@ def _insight_candidates(
 def _select_insight(
     prompt: str,
     candidates: list[tuple[str, str, str]],
-) -> tuple[str, str] | None:
+) -> tuple[str, str, str] | None:
     if not candidates:
         return None
     q = normalize_query(prompt)
@@ -1624,13 +1624,43 @@ def _select_insight(
     if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")):
         prefer_keys.extend(["hardware", "availability"])
     if any(word in q for word in ("another", "else", "different", "other")) and len(candidates) > 1:
-        return candidates[1][1], candidates[1][2]
+        return candidates[1]
     if prefer_keys:
         for key, text, conf in candidates:
             if key in prefer_keys:
-                return text, conf
-    key, text, conf = candidates[0]
-    return text, conf
+                return key, text, conf
+    return candidates[0]
+
+
+def _format_insight_text(key: str, text: str) -> str:
+    cleaned = text.strip().rstrip(".")
+    if not cleaned:
+        return ""
+    if key == "hardware":
+        counts = cleaned.replace("Hardware mix includes ", "")
+        return f"Atlas mixes Raspberry Pi, Jetson, and AMD64 nodes ({counts})."
+    if key == "postgres":
+        detail = cleaned.replace("Postgres is at ", "")
+        return f"Postgres looks healthy at {detail}."
+    if key == "pods":
+        detail = cleaned.replace("There are ", "")
+        return f"Pods look stable with {detail}."
+    if key == "availability":
+        return cleaned + "."
+    if key in ("cpu", "ram"):
+        return cleaned + "."
+    return cleaned + "."
+
+
+def _insight_prefix(prompt: str) -> str:
+    q = normalize_query(prompt)
+    if any(word in q for word in ("another", "else", "different", "other")):
+        return "Another interesting detail: "
+    if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")):
+        return "What stands out is that "
+    if any(word in q for word in ("interesting", "notable", "fun", "cool")):
+        return "One notable detail: "
+    return ""
 
 
 def cluster_overview_answer(
@@ -1714,8 +1744,14 @@ def cluster_answer(
         candidates = _insight_candidates(inventory, snapshot)
         selected = _select_insight(prompt, candidates)
         if selected:
-            text, confidence = selected
-            return _format_confidence(text, confidence)
+            key, raw_text, confidence = selected
+            formatted = _format_insight_text(key, raw_text)
+            if not formatted:
+                formatted = raw_text
+            prefix = _insight_prefix(prompt)
+            if prefix:
+                formatted = prefix + formatted
+            return _format_confidence(formatted, confidence)
     structured = structured_answer(
         prompt,
         inventory=inventory,

From 4589c65c2ba5ab57721998b7f8ece2fec9a0121d Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 18:18:06 -0300
Subject: [PATCH 339/416] comms: roll atlasbot for insight updates

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 817e9361..31e37332 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-52
+        checksum/atlasbot-configmap: manual-atlasbot-53
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From 66a42aaa9323f8bb87c75428d6a1827b8c999a17 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 18:32:27 -0300
Subject: [PATCH 340/416] atlasbot: use history for subjective follow-ups

---
 services/comms/scripts/atlasbot/bot.py | 94 ++++++++++++++++++++++++--
 1 file changed, 89 insertions(+), 5 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index ada8dd7f..a446a10e 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -191,6 +191,10 @@ _INSIGHT_HINT_WORDS = {
     "cool",
     "unique",
     "notable",
+    "coolest",
+    "favorite",
+    "favourite",
+    "trivia",
 }
 
 _OVERVIEW_HINT_WORDS = {
@@ -1550,6 +1554,21 @@ def _is_insight_query(query: str) -> bool:
     return False
 
 
+def _is_subjective_query(query: str) -> bool:
+    q = normalize_query(query)
+    if not q:
+        return False
+    return any(word in q for word in _INSIGHT_HINT_WORDS) or any(
+        phrase in q
+        for phrase in (
+            "what do you think",
+            "your favorite",
+            "your favourite",
+            "your opinion",
+        )
+    )
+
+
 def _is_overview_query(query: str) -> bool:
     q = normalize_query(query)
     if not q:
@@ -1602,9 +1621,9 @@ def _insight_candidates(
     if postgres_line:
         candidates.append(("postgres", postgres_line, "high"))
 
-    hardware_line = _hardware_mix_line(inventory)
-    if hardware_line:
-        candidates.append(("hardware", hardware_line, "medium"))
+    hardware_insight = _hardware_insight(inventory)
+    if hardware_insight:
+        candidates.append(("hardware", hardware_insight, "medium"))
 
     pods_line = _pods_summary_line(metrics)
     if pods_line:
@@ -1613,6 +1632,29 @@ def _insight_candidates(
     return candidates
 
 
+def _hardware_insight(inventory: list[dict[str, Any]]) -> str:
+    if not inventory:
+        return ""
+    groups = _group_nodes(inventory)
+    jetsons = groups.get("jetson") or []
+    rpi5 = groups.get("rpi5") or []
+    rpi4 = groups.get("rpi4") or []
+    amd64 = groups.get("amd64") or []
+    if jetsons:
+        jetson_names = ", ".join(jetsons[:2])
+        return (
+            f"Atlas mixes tiny Raspberry Pi nodes with Jetson accelerators ({jetson_names}) "
+            f"and AMD64 servers, which is unusual for a homelab cluster."
+        )
+    if amd64 and (rpi5 or rpi4):
+        return (
+            "Atlas mixes small ARM boards with a couple of AMD64 machines, "
+            "so workloads can land on either low-power or high-power nodes."
+        )
+    line = _hardware_mix_line(inventory)
+    return line.replace("Hardware mix includes ", "Atlas mixes ") if line else ""
+
+
 def _select_insight(
     prompt: str,
     candidates: list[tuple[str, str, str]],
@@ -1623,6 +1665,8 @@ def _select_insight(
     prefer_keys: list[str] = []
     if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")):
         prefer_keys.extend(["hardware", "availability"])
+    if any(word in q for word in ("coolest", "favorite", "favourite", "trivia", "fun")):
+        prefer_keys.extend(["hardware", "cpu", "ram"])
     if any(word in q for word in ("another", "else", "different", "other")) and len(candidates) > 1:
         return candidates[1]
     if prefer_keys:
@@ -2284,7 +2328,24 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
         snapshot = _snapshot_state()
         inventory = _snapshot_inventory(snapshot) or node_inventory_live()
         workloads = _snapshot_workloads(snapshot)
-        cluster_query = _is_cluster_query(cleaned, inventory=inventory, workloads=workloads)
+        history_payload = payload.get("history") or []
+        history_lines: list[str] = []
+        if isinstance(history_payload, list):
+            for item in history_payload[-10:]:
+                if isinstance(item, dict):
+                    content = item.get("content") or item.get("message") or ""
+                    if isinstance(content, str) and content.strip():
+                        history_lines.append(content.strip())
+                elif isinstance(item, str) and item.strip():
+                    history_lines.append(item.strip())
+        history_cluster = _history_mentions_cluster(
+            history_lines,
+            inventory=inventory,
+            workloads=workloads,
+        )
+        cluster_query = _is_cluster_query(cleaned, inventory=inventory, workloads=workloads) or (
+            _is_subjective_query(cleaned) and history_cluster
+        )
         context = ""
         if cluster_query:
             context = build_context(
@@ -2329,6 +2390,22 @@ history = collections.defaultdict(list)  # (room_id, sender|None) -> list[str] (
 def key_for(room_id: str, sender: str, is_dm: bool):
     return (room_id, None) if is_dm else (room_id, sender)
 
+
+def _history_mentions_cluster(
+    history_lines: list[str],
+    *,
+    inventory: list[dict[str, Any]] | None = None,
+    workloads: list[dict[str, Any]] | None = None,
+) -> bool:
+    recent = [line for line in history_lines[-8:] if isinstance(line, str)]
+    for line in recent:
+        cleaned = normalize_query(line)
+        if not cleaned:
+            continue
+        if _is_cluster_query(cleaned, inventory=inventory, workloads=workloads):
+            return True
+    return False
+
 def build_context(
     prompt: str,
     *,
@@ -2734,7 +2811,14 @@ def sync_loop(token: str, room_id: str):
                 if not inventory:
                     inventory = _snapshot_inventory(snapshot)
                 workloads = _snapshot_workloads(snapshot)
-                cluster_query = _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads)
+                history_cluster = _history_mentions_cluster(
+                    history[hist_key],
+                    inventory=inventory,
+                    workloads=workloads,
+                )
+                cluster_query = _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads) or (
+                    _is_subjective_query(cleaned_body) and history_cluster
+                )
                 context = ""
                 if cluster_query:
                     context = build_context(

From 9dbea9dd0b99be379f97bdfb2211308ecc1f5048 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 18:32:54 -0300
Subject: [PATCH 341/416] comms: roll atlasbot after history update

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 31e37332..03e9dc23 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-53
+        checksum/atlasbot-configmap: manual-atlasbot-54
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From 9ef1cdc7a94ab0a8b7d2c9dfcf85d1d37fc4fb7f Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 18:43:03 -0300
Subject: [PATCH 342/416] atlasbot: improve insight voice and avoid repeats

---
 services/comms/atlasbot-deployment.yaml |  2 +-
 services/comms/scripts/atlasbot/bot.py  | 86 ++++++++++++++++++++-----
 2 files changed, 70 insertions(+), 18 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 03e9dc23..dc1b0bbe 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-54
+        checksum/atlasbot-configmap: manual-atlasbot-55
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index a446a10e..2616cb1b 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -1640,27 +1640,49 @@ def _hardware_insight(inventory: list[dict[str, Any]]) -> str:
     rpi5 = groups.get("rpi5") or []
     rpi4 = groups.get("rpi4") or []
     amd64 = groups.get("amd64") or []
+    parts: list[str] = []
+    if rpi5:
+        parts.append(f"rpi5={len(rpi5)}")
+    if rpi4:
+        parts.append(f"rpi4={len(rpi4)}")
     if jetsons:
         jetson_names = ", ".join(jetsons[:2])
-        return (
-            f"Atlas mixes tiny Raspberry Pi nodes with Jetson accelerators ({jetson_names}) "
-            f"and AMD64 servers, which is unusual for a homelab cluster."
-        )
-    if amd64 and (rpi5 or rpi4):
-        return (
-            "Atlas mixes small ARM boards with a couple of AMD64 machines, "
-            "so workloads can land on either low-power or high-power nodes."
-        )
-    line = _hardware_mix_line(inventory)
-    return line.replace("Hardware mix includes ", "Atlas mixes ") if line else ""
+        parts.append(f"jetson={len(jetsons)} ({jetson_names})")
+    if amd64:
+        parts.append(f"amd64={len(amd64)}")
+    return ", ".join(parts)
+
+
+def _recent_insight_keys(history_lines: list[str]) -> set[str]:
+    used: set[str] = set()
+    for line in history_lines[-10:]:
+        lower = normalize_query(line)
+        if not lower:
+            continue
+        if "postgres" in lower or "connections" in lower:
+            used.add("postgres")
+        if "atlas mixes" in lower or "hardware" in lower or "rpi" in lower or "jetson" in lower:
+            used.add("hardware")
+        if "busiest cpu" in lower or "cpu right now" in lower or "cpu " in lower:
+            used.add("cpu")
+        if "ram usage" in lower or "memory" in lower:
+            used.add("ram")
+        if "pods" in lower:
+            used.add("pods")
+        if "not ready" in lower:
+            used.add("availability")
+    return used
 
 
 def _select_insight(
     prompt: str,
     candidates: list[tuple[str, str, str]],
+    *,
+    used_keys: set[str] | None = None,
 ) -> tuple[str, str, str] | None:
     if not candidates:
         return None
+    used = used_keys or set()
     q = normalize_query(prompt)
     prefer_keys: list[str] = []
     if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")):
@@ -1668,11 +1690,21 @@ def _select_insight(
     if any(word in q for word in ("coolest", "favorite", "favourite", "trivia", "fun")):
         prefer_keys.extend(["hardware", "cpu", "ram"])
     if any(word in q for word in ("another", "else", "different", "other")) and len(candidates) > 1:
+        for candidate in candidates:
+            if candidate[0] not in used:
+                return candidate
         return candidates[1]
     if prefer_keys:
+        for key, text, conf in candidates:
+            if key in prefer_keys and key not in used:
+                return key, text, conf
         for key, text, conf in candidates:
             if key in prefer_keys:
                 return key, text, conf
+    if used:
+        for candidate in candidates:
+            if candidate[0] not in used:
+                return candidate
     return candidates[0]
 
 
@@ -1681,29 +1713,45 @@ def _format_insight_text(key: str, text: str) -> str:
     if not cleaned:
         return ""
     if key == "hardware":
-        counts = cleaned.replace("Hardware mix includes ", "")
-        return f"Atlas mixes Raspberry Pi, Jetson, and AMD64 nodes ({counts})."
+        counts = (
+            cleaned.replace("Hardware mix includes ", "")
+            .replace("Atlas mixes tiny ", "")
+            .replace("Atlas mixes ", "")
+            .replace("which is unusual for a homelab cluster", "")
+            .strip()
+            .strip(".")
+        )
+        return f"the mixed hardware stack ({counts}) is a bit unconventional for a homelab."
     if key == "postgres":
         detail = cleaned.replace("Postgres is at ", "")
-        return f"Postgres looks healthy at {detail}."
+        return f"Postgres looks healthy at {detail}; that suggests moderate load."
     if key == "pods":
         detail = cleaned.replace("There are ", "")
         return f"Pods look stable with {detail}."
     if key == "availability":
         return cleaned + "."
     if key in ("cpu", "ram"):
-        return cleaned + "."
+        suffix = " That likely marks the busiest workload right now." if key == "cpu" else " That box is carrying the heaviest memory load."
+        return cleaned + "." + suffix
     return cleaned + "."
 
 
 def _insight_prefix(prompt: str) -> str:
     q = normalize_query(prompt)
+    if "coolest" in q:
+        return "If I had to pick the coolest detail, it's "
+    if "favorite" in q or "favourite" in q:
+        return "My favorite detail is "
+    if "trivia" in q:
+        return "A bit of trivia I like: "
+    if "most interesting" in q:
+        return "The most interesting detail to me is "
     if any(word in q for word in ("another", "else", "different", "other")):
         return "Another interesting detail: "
     if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")):
         return "What stands out is that "
     if any(word in q for word in ("interesting", "notable", "fun", "cool")):
-        return "One notable detail: "
+        return "One thing I'd highlight is "
     return ""
 
 
@@ -1782,11 +1830,13 @@ def cluster_answer(
     inventory: list[dict[str, Any]],
     snapshot: dict[str, Any] | None,
     workloads: list[dict[str, Any]] | None,
+    history_lines: list[str] | None = None,
 ) -> str:
     metrics_summary = snapshot_context(prompt, snapshot)
     if _is_insight_query(prompt):
         candidates = _insight_candidates(inventory, snapshot)
-        selected = _select_insight(prompt, candidates)
+        used_keys = _recent_insight_keys(history_lines or [])
+        selected = _select_insight(prompt, candidates, used_keys=used_keys)
         if selected:
             key, raw_text, confidence = selected
             formatted = _format_insight_text(key, raw_text)
@@ -2363,6 +2413,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
                 inventory=inventory,
                 snapshot=snapshot,
                 workloads=workloads,
+                history_lines=history_lines,
             )
             if not answer:
                 answer = fallback
@@ -2843,6 +2894,7 @@ def sync_loop(token: str, room_id: str):
                         inventory=inventory,
                         snapshot=snapshot,
                         workloads=workloads,
+                        history_lines=history[hist_key],
                     )
                     if not reply:
                         reply = fallback

From 577e2a158d394ae1c3d60ad3426f00fdc86720aa Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 18:44:58 -0300
Subject: [PATCH 343/416] monitoring: keep idle label in gpu share

---
 scripts/dashboards_render_atlas.py                  | 2 +-
 services/monitoring/dashboards/atlas-gpu.json       | 2 +-
 services/monitoring/dashboards/atlas-overview.json  | 2 +-
 services/monitoring/grafana-dashboard-gpu.yaml      | 2 +-
 services/monitoring/grafana-dashboard-overview.yaml | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index 34ded89e..445de94b 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -266,7 +266,7 @@ def namespace_gpu_share_expr(scope_var):
     usage = namespace_gpu_usage_instant(scope_var)
     total = f"(sum({usage}) or on() vector(0))"
     share = f"100 * ({usage}) / clamp_min({total}, 1)"
-    idle = 'label_replace(vector(100), "namespace", "idle", "", "") * on() (' + total + " == bool 0)"
+    idle = 'label_replace(vector(100), "namespace", "idle", "", "") * scalar(' + total + " == bool 0)"
     return f"({share}) or ({idle})"
 
 
diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json
index f6801aa6..132f2766 100644
--- a/services/monitoring/dashboards/atlas-gpu.json
+++ b/services/monitoring/dashboards/atlas-gpu.json
@@ -20,7 +20,7 @@
       },
       "targets": [
         {
-          "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))",
+          "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 1a507ece..b212c8cd 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -1901,7 +1901,7 @@
       },
       "targets": [
         {
-          "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))",
+          "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml
index dc1025b6..55f63e84 100644
--- a/services/monitoring/grafana-dashboard-gpu.yaml
+++ b/services/monitoring/grafana-dashboard-gpu.yaml
@@ -29,7 +29,7 @@ data:
           },
           "targets": [
             {
-              "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))",
+              "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum 
by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index ed63da05..a8990024 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -1910,7 +1910,7 @@ data:
           },
           "targets": [
             {
-              "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * on() ((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))",
+              "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum 
by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }

From ff5cfd27a1837f462c7e78d5abdaa71041387052 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 18:45:49 -0300
Subject: [PATCH 344/416] atlasbot: tighten insight phrasing

---
 services/comms/atlasbot-deployment.yaml |  2 +-
 services/comms/scripts/atlasbot/bot.py  | 18 +++++++++++-------
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index dc1b0bbe..4a3949da 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-55
+        checksum/atlasbot-configmap: manual-atlasbot-56
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 2616cb1b..9beff7f6 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -1721,17 +1721,21 @@ def _format_insight_text(key: str, text: str) -> str:
             .strip()
             .strip(".")
         )
-        return f"the mixed hardware stack ({counts}) is a bit unconventional for a homelab."
+        return f"mixed hardware stack ({counts}), which is unusual for a homelab."
     if key == "postgres":
         detail = cleaned.replace("Postgres is at ", "")
-        return f"Postgres looks healthy at {detail}; that suggests moderate load."
+        return f"Postgres is at {detail}; that suggests moderate load."
     if key == "pods":
         detail = cleaned.replace("There are ", "")
         return f"Pods look stable with {detail}."
     if key == "availability":
         return cleaned + "."
     if key in ("cpu", "ram"):
-        suffix = " That likely marks the busiest workload right now." if key == "cpu" else " That box is carrying the heaviest memory load."
+        suffix = (
+            " That likely marks the busiest workload right now."
+            if key == "cpu"
+            else " That box is carrying the heaviest memory load."
+        )
         return cleaned + "." + suffix
     return cleaned + "."
 
@@ -1739,19 +1743,19 @@ def _format_insight_text(key: str, text: str) -> str:
 def _insight_prefix(prompt: str) -> str:
     q = normalize_query(prompt)
     if "coolest" in q:
-        return "If I had to pick the coolest detail, it's "
+        return "If I had to pick the coolest detail: "
     if "favorite" in q or "favourite" in q:
-        return "My favorite detail is "
+        return "My favorite detail: "
     if "trivia" in q:
         return "A bit of trivia I like: "
     if "most interesting" in q:
-        return "The most interesting detail to me is "
+        return "The most interesting detail to me: "
     if any(word in q for word in ("another", "else", "different", "other")):
         return "Another interesting detail: "
     if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")):
         return "What stands out is that "
     if any(word in q for word in ("interesting", "notable", "fun", "cool")):
-        return "One thing I'd highlight is "
+        return "One thing I'd highlight: "
     return ""
 
 

From c02973e5a6aecf63a70051a6324182761b11b61d Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 18:48:35 -0300
Subject: [PATCH 345/416] atlasbot: add more opinionated hardware insight

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 services/comms/scripts/atlasbot/bot.py  | 8 +++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 4a3949da..d02255e3 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-56
+        checksum/atlasbot-configmap: manual-atlasbot-57
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 9beff7f6..54434e71 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -1721,7 +1721,9 @@ def _format_insight_text(key: str, text: str) -> str:
             .strip()
             .strip(".")
         )
-        return f"mixed hardware stack ({counts}), which is unusual for a homelab."
+        detail = f"mixed hardware stack ({counts})"
+        flavor = "It blends low-power Pis with Jetson accelerators and a couple of AMD64 nodes."
+        return f"{detail}. {flavor}"
     if key == "postgres":
         detail = cleaned.replace("Postgres is at ", "")
         return f"Postgres is at {detail}; that suggests moderate load."
@@ -1732,9 +1734,9 @@ def _format_insight_text(key: str, text: str) -> str:
         return cleaned + "."
     if key in ("cpu", "ram"):
         suffix = (
-            " That likely marks the busiest workload right now."
+            " If you're chasing hotspots, that's the busiest workload right now."
             if key == "cpu"
-            else " That box is carrying the heaviest memory load."
+            else " That box is carrying the heaviest memory load right now."
         )
         return cleaned + "." + suffix
     return cleaned + "."

From 5553871d3351f81b7d27d65f63a598aa75eb1eb9 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 18:51:00 -0300
Subject: [PATCH 346/416] atlasbot: make insights sound more human

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 services/comms/scripts/atlasbot/bot.py  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index d02255e3..2c0b84d6 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-57
+        checksum/atlasbot-configmap: manual-atlasbot-58
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 54434e71..659ea495 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -1729,9 +1729,9 @@ def _format_insight_text(key: str, text: str) -> str:
         return f"Postgres is at {detail}; that suggests moderate load."
     if key == "pods":
         detail = cleaned.replace("There are ", "")
-        return f"Pods look stable with {detail}."
+        return f"Pods look steady ({detail}); the workload mix looks healthy."
     if key == "availability":
-        return cleaned + "."
+        return cleaned + " That suggests the cluster is stable right now."
     if key in ("cpu", "ram"):
         suffix = (
             " If you're chasing hotspots, that's the busiest workload right now."

From f175273a6cf947fc112c43020f5a2c096e2c9fb2 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 18:54:05 -0300
Subject: [PATCH 347/416] atlasbot: use hottest node labels for insights

---
 services/comms/atlasbot-deployment.yaml |  2 +-
 services/comms/scripts/atlasbot/bot.py  | 20 ++++++++++++++++----
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 2c0b84d6..1212505c 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-58
+        checksum/atlasbot-configmap: manual-atlasbot-59
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 659ea495..7f92d8ec 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -1608,14 +1608,26 @@ def _insight_candidates(
 
     hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {}
     if hottest:
+        def _hot_node(entry: dict[str, Any]) -> str:
+            if not isinstance(entry, dict):
+                return ""
+            return (
+                entry.get("node")
+                or entry.get("label")
+                or (entry.get("metric") or {}).get("node")
+                or ""
+            )
+
         cpu = hottest.get("cpu") if isinstance(hottest.get("cpu"), dict) else {}
-        if cpu.get("node") and cpu.get("value") is not None:
+        cpu_node = _hot_node(cpu)
+        if cpu_node and cpu.get("value") is not None:
             value_fmt = _format_metric_value(str(cpu.get("value")), percent=True)
-            candidates.append(("cpu", f"The busiest CPU right now is {cpu.get('node')} at about {value_fmt}.", "high"))
+            candidates.append(("cpu", f"The busiest CPU right now is {cpu_node} at about {value_fmt}.", "high"))
         ram = hottest.get("ram") if isinstance(hottest.get("ram"), dict) else {}
-        if ram.get("node") and ram.get("value") is not None:
+        ram_node = _hot_node(ram)
+        if ram_node and ram.get("value") is not None:
             value_fmt = _format_metric_value(str(ram.get("value")), percent=True)
-            candidates.append(("ram", f"RAM usage peaks on {ram.get('node')} at about {value_fmt}.", "high"))
+            candidates.append(("ram", f"RAM usage peaks on {ram_node} at about {value_fmt}.", "high"))
 
     postgres_line = _postgres_summary_line(metrics)
     if postgres_line:

From 39fb7e5eb47bb3be71cc78124df981f9ac6c3932 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 18:56:14 -0300
Subject: [PATCH 348/416] atlasbot: prioritize hardware for subjective prompts

---
 services/comms/atlasbot-deployment.yaml |  2 +-
 services/comms/scripts/atlasbot/bot.py  | 14 ++++++++------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 1212505c..cbc79e54 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-59
+        checksum/atlasbot-configmap: manual-atlasbot-60
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 7f92d8ec..613b0c6f 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -1707,12 +1707,14 @@ def _select_insight(
                 return candidate
         return candidates[1]
     if prefer_keys:
-        for key, text, conf in candidates:
-            if key in prefer_keys and key not in used:
-                return key, text, conf
-        for key, text, conf in candidates:
-            if key in prefer_keys:
-                return key, text, conf
+        for prefer in prefer_keys:
+            for key, text, conf in candidates:
+                if key == prefer and key not in used:
+                    return key, text, conf
+        for prefer in prefer_keys:
+            for key, text, conf in candidates:
+                if key == prefer:
+                    return key, text, conf
     if used:
         for candidate in candidates:
             if candidate[0] not in used:

From d35cb0c6c39e9e6cb1489556a678b329169afcff Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 18:58:59 -0300
Subject: [PATCH 349/416] atlasbot: keep coolest answers opinionated

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 services/comms/scripts/atlasbot/bot.py  | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index cbc79e54..ef6b88b6 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-60
+        checksum/atlasbot-configmap: manual-atlasbot-61
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 613b0c6f..9434e913 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -1701,6 +1701,7 @@ def _select_insight(
         prefer_keys.extend(["hardware", "availability"])
     if any(word in q for word in ("coolest", "favorite", "favourite", "trivia", "fun")):
         prefer_keys.extend(["hardware", "cpu", "ram"])
+    avoid_used = any(word in q for word in ("another", "else", "different", "other")) or "most interesting" in q
     if any(word in q for word in ("another", "else", "different", "other")) and len(candidates) > 1:
         for candidate in candidates:
             if candidate[0] not in used:
@@ -1709,13 +1710,13 @@ def _select_insight(
     if prefer_keys:
         for prefer in prefer_keys:
             for key, text, conf in candidates:
-                if key == prefer and key not in used:
+                if key == prefer and (not avoid_used or key not in used):
                     return key, text, conf
         for prefer in prefer_keys:
             for key, text, conf in candidates:
                 if key == prefer:
                     return key, text, conf
-    if used:
+    if used and avoid_used:
         for candidate in candidates:
             if candidate[0] not in used:
                 return candidate

From 61de9d400b7cf4f828b22bd64a3db9c95838b03e Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 19:01:16 -0300
Subject: [PATCH 350/416] atlasbot: prefer hardware for general interest

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 services/comms/scripts/atlasbot/bot.py  | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index ef6b88b6..e8e22a36 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-61
+        checksum/atlasbot-configmap: manual-atlasbot-62
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 9434e913..f9e6b818 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -1701,6 +1701,8 @@ def _select_insight(
         prefer_keys.extend(["hardware", "availability"])
     if any(word in q for word in ("coolest", "favorite", "favourite", "trivia", "fun")):
         prefer_keys.extend(["hardware", "cpu", "ram"])
+    if "interesting" in q and "most interesting" not in q:
+        prefer_keys.extend(["hardware", "postgres", "cpu", "ram"])
     avoid_used = any(word in q for word in ("another", "else", "different", "other")) or "most interesting" in q
     if any(word in q for word in ("another", "else", "different", "other")) and len(candidates) > 1:
         for candidate in candidates:

From 9361d003ff5cee8b77bac5a270fcc9cb024c876a Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 19:04:29 -0300
Subject: [PATCH 351/416] atlasbot: treat hardware prompts as cluster queries

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 services/comms/scripts/atlasbot/bot.py  | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index e8e22a36..36bb1dbf 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-62
+        checksum/atlasbot-configmap: manual-atlasbot-63
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index f9e6b818..4ca3b2ee 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -139,6 +139,8 @@ CLUSTER_HINT_WORDS = {
     "kubernetes",
     "node",
     "nodes",
+    "hardware",
+    "architecture",
     "worker",
     "workers",
     "pod",

From c5b24119d301361f1eba4f271fd2bb652d3be0e0 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 19:06:44 -0300
Subject: [PATCH 352/416] atlasbot: answer hardware mix queries

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 services/comms/scripts/atlasbot/bot.py  | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 36bb1dbf..9cc0a1e9 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-63
+        checksum/atlasbot-configmap: manual-atlasbot-64
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 4ca3b2ee..570bc26f 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -1292,6 +1292,11 @@ def structured_answer(
     if not op and entity == "node":
         op = "list" if (include_hw or exclude_hw or nodes_in_query) else "count"
 
+    if entity == "node" and ("hardware mix" in q or "architecture" in q):
+        hw_line = _hardware_mix_line(inventory)
+        if hw_line:
+            return _format_confidence(hw_line, "high")
+
     if op == "top" and metric is None:
         metric = "cpu"
 

From d721368f5109e7ddc4e7eb27d5b0246251508325 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 19:10:02 -0300
Subject: [PATCH 353/416] atlasbot: expand hardware and entity detection

---
 services/comms/atlasbot-deployment.yaml |  2 +-
 services/comms/scripts/atlasbot/bot.py  | 17 ++++++++++++++---
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 9cc0a1e9..72503b81 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-64
+        checksum/atlasbot-configmap: manual-atlasbot-65
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 570bc26f..2b3657a9 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -197,6 +197,8 @@ _INSIGHT_HINT_WORDS = {
     "favorite",
     "favourite",
     "trivia",
+    "stand out",
+    "stands out",
 }
 
 _OVERVIEW_HINT_WORDS = {
@@ -213,8 +215,8 @@ _OLLAMA_LOCK = threading.Lock()
 HARDWARE_HINTS = {
     "amd64": ("amd64", "x86", "x86_64", "x86-64"),
     "jetson": ("jetson",),
-    "rpi4": ("rpi4",),
-    "rpi5": ("rpi5",),
+    "rpi4": ("rpi4", "raspberry pi 4", "raspberry pi-4"),
+    "rpi5": ("rpi5", "raspberry pi 5", "raspberry pi-5"),
     "rpi": ("rpi", "raspberry"),
     "arm64": ("arm64", "aarch64"),
 }
@@ -559,7 +561,16 @@ def _detect_role_filters(q: str) -> set[str]:
     return roles
 
 def _detect_entity(q: str) -> str | None:
-    if "node" in q or "nodes" in q or "worker" in q or "hardware" in q or "architecture" in q or TITAN_NODE_RE.search(q):
+    if (
+        "node" in q
+        or "nodes" in q
+        or "worker" in q
+        or "hardware" in q
+        or "architecture" in q
+        or "machine" in q
+        or "machines" in q
+        or TITAN_NODE_RE.search(q)
+    ):
         return "node"
     if "pod" in q or "pods" in q:
         return "pod"

From a3fdf20e39d7c82cf03fe5ab482c3e37c043113c Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 19:13:31 -0300
Subject: [PATCH 354/416] atlasbot: refine node and postgres query handling

---
 services/comms/atlasbot-deployment.yaml |  2 +-
 services/comms/scripts/atlasbot/bot.py  | 23 +++++++++++++++++++++--
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 72503b81..e1ff2bb4 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-65
+        checksum/atlasbot-configmap: manual-atlasbot-66
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 2b3657a9..abdcbf25 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -538,7 +538,17 @@ def _detect_metric(q: str) -> str | None:
 def _detect_hardware_filters(q: str) -> tuple[set[str], set[str]]:
     include: set[str] = set()
     exclude: set[str] = set()
-    rpi_specific = "rpi4" in q or "rpi5" in q
+    rpi_specific = any(
+        phrase in q
+        for phrase in (
+            "rpi4",
+            "rpi5",
+            "raspberry pi 4",
+            "raspberry pi 5",
+            "raspberry pi-4",
+            "raspberry pi-5",
+        )
+    )
     for hardware, phrases in HARDWARE_HINTS.items():
         if hardware == "rpi" and rpi_specific:
             continue
@@ -1226,7 +1236,11 @@ def snapshot_metric_answer(
         hottest = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {}
         parts: list[str] = []
         if used is not None and max_conn is not None:
-            parts.append(f"Postgres connections: {used:.0f} used / {max_conn:.0f} max.")
+            free = max_conn - used
+            if any(word in q for word in ("free", "available", "remaining")):
+                parts.append(f"Postgres connections: {used:.0f} used / {max_conn:.0f} max ({free:.0f} free).")
+            else:
+                parts.append(f"Postgres connections: {used:.0f} used / {max_conn:.0f} max.")
         if hottest.get("label"):
             hot_val = hottest.get("value")
             hot_val_str = _format_metric_value(str(hot_val), percent=False) if hot_val is not None else ""
@@ -1303,6 +1317,11 @@ def structured_answer(
     if not op and entity == "node":
         op = "list" if (include_hw or exclude_hw or nodes_in_query) else "count"
 
+    if entity == "node" and "total" in q and "ready" in q:
+        summary = _nodes_summary_line(inventory, snapshot)
+        if summary:
+            return _format_confidence(summary, "high")
+
     if entity == "node" and ("hardware mix" in q or "architecture" in q):
         hw_line = _hardware_mix_line(inventory)
         if hw_line:

From 3a131fa1fc1b922601d9d48b203f5f1c9cd2022e Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 19:37:20 -0300
Subject: [PATCH 355/416] atlasbot: strengthen subjective insights

---
 services/comms/atlasbot-deployment.yaml |  2 +-
 services/comms/scripts/atlasbot/bot.py  | 58 +++++++++++++++++--------
 2 files changed, 40 insertions(+), 20 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index e1ff2bb4..4ac3582d 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-66
+        checksum/atlasbot-configmap: manual-atlasbot-67
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index abdcbf25..0d0f92be 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -579,6 +579,10 @@ def _detect_entity(q: str) -> str | None:
         or "architecture" in q
         or "machine" in q
         or "machines" in q
+        or "host" in q
+        or "hosts" in q
+        or "hostname" in q
+        or "hostnames" in q
         or TITAN_NODE_RE.search(q)
     ):
         return "node"
@@ -1775,20 +1779,29 @@ def _format_insight_text(key: str, text: str) -> str:
             .strip()
             .strip(".")
         )
+        has_jetson = "jetson=" in counts
+        has_amd64 = "amd64=" in counts
         detail = f"mixed hardware stack ({counts})"
-        flavor = "It blends low-power Pis with Jetson accelerators and a couple of AMD64 nodes."
+        if has_jetson and has_amd64:
+            flavor = "It blends low-power Pis with Jetson accelerators and a couple of AMD64 boxes."
+        elif has_jetson:
+            flavor = "It pairs low-power Pis with Jetson accelerators for edge and AI workloads."
+        elif has_amd64:
+            flavor = "It mixes low-power Pis with a couple of heavier AMD64 nodes."
+        else:
+            flavor = "It is a pretty uniform hardware stack, which is rare for a homelab."
         return f"{detail}. {flavor}"
     if key == "postgres":
         detail = cleaned.replace("Postgres is at ", "")
-        return f"Postgres is at {detail}; that suggests moderate load."
+        return f"Postgres is at {detail}; that feels like healthy, steady load rather than strain."
     if key == "pods":
         detail = cleaned.replace("There are ", "")
-        return f"Pods look steady ({detail}); the workload mix looks healthy."
+        return f"Pods look steady ({detail}); nothing looks stuck or unhealthy."
     if key == "availability":
-        return cleaned + " That suggests the cluster is stable right now."
+        return cleaned + " That is the kind of stability I like to see."
     if key in ("cpu", "ram"):
         suffix = (
-            " If you're chasing hotspots, that's the busiest workload right now."
+            " If you're chasing hotspots, that's the node I'd watch first."
             if key == "cpu"
             else " That box is carrying the heaviest memory load right now."
         )
@@ -1799,19 +1812,19 @@ def _format_insight_text(key: str, text: str) -> str:
 def _insight_prefix(prompt: str) -> str:
     q = normalize_query(prompt)
     if "coolest" in q:
-        return "If I had to pick the coolest detail: "
+        return "If I had to pick the coolest detail, I'd say "
     if "favorite" in q or "favourite" in q:
-        return "My favorite detail: "
+        return "My favorite detail is "
     if "trivia" in q:
         return "A bit of trivia I like: "
     if "most interesting" in q:
-        return "The most interesting detail to me: "
+        return "The most interesting detail to me is "
     if any(word in q for word in ("another", "else", "different", "other")):
         return "Another interesting detail: "
     if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")):
-        return "What stands out is that "
+        return "What stands out to me is that "
     if any(word in q for word in ("interesting", "notable", "fun", "cool")):
-        return "One thing I'd highlight: "
+        return "One thing I'd call out is "
     return ""
 
 
@@ -2389,6 +2402,21 @@ def _normalize_reply(value: Any) -> str:
     return _ensure_confidence(text)
 
 
+def _history_payload_lines(history_payload: list[Any]) -> list[str]:
+    lines: list[str] = []
+    if not isinstance(history_payload, list):
+        return lines
+    for item in history_payload[-12:]:
+        if isinstance(item, dict):
+            for key in ("content", "message", "text", "prompt", "question", "body", "answer", "reply", "response"):
+                val = item.get(key)
+                if isinstance(val, str) and val.strip():
+                    lines.append(val.strip())
+        elif isinstance(item, str) and item.strip():
+            lines.append(item.strip())
+    return [line for line in lines if line]
+
+
 # Internal HTTP endpoint for cluster answers (website uses this).
 class _AtlasbotHandler(BaseHTTPRequestHandler):
     server_version = "AtlasbotHTTP/1.0"
@@ -2439,15 +2467,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
         inventory = _snapshot_inventory(snapshot) or node_inventory_live()
         workloads = _snapshot_workloads(snapshot)
         history_payload = payload.get("history") or []
-        history_lines: list[str] = []
-        if isinstance(history_payload, list):
-            for item in history_payload[-10:]:
-                if isinstance(item, dict):
-                    content = item.get("content") or item.get("message") or ""
-                    if isinstance(content, str) and content.strip():
-                        history_lines.append(content.strip())
-                elif isinstance(item, str) and item.strip():
-                    history_lines.append(item.strip())
+        history_lines = _history_payload_lines(history_payload)
         history_cluster = _history_mentions_cluster(
             history_lines,
             inventory=inventory,

From 52d28dcc6d407935314302a72f8d0aca06748d9e Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 19:42:04 -0300
Subject: [PATCH 356/416] atlasbot: refine insight tone and status

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 services/comms/scripts/atlasbot/bot.py  | 7 ++++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 4ac3582d..609c2450 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-67
+        checksum/atlasbot-configmap: manual-atlasbot-68
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 0d0f92be..db0f5609 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -118,7 +118,7 @@ CONFIDENCE_RE = re.compile(r"confidence\s*:\s*(high|medium|low)\b", re.IGNORECAS
 OPERATION_HINTS = {
     "count": ("how many", "count", "number", "total"),
     "list": ("list", "which", "what are", "show", "names"),
-    "top": ("top", "hottest", "highest", "most", "largest", "max", "maximum"),
+    "top": ("top", "hottest", "highest", "most", "largest", "max", "maximum", "busiest", "busy"),
     "status": ("ready", "not ready", "unready", "down", "missing", "status"),
 }
 
@@ -1414,6 +1414,11 @@ def structured_answer(
     names = [node["name"] for node in filtered]
 
     if op == "status":
+        if "missing" in q and ("ready" in q or "readiness" in q):
+            return _format_confidence(
+                "Not ready nodes: " + (", ".join(names) if names else "none") + ".",
+                "high",
+            )
         if "missing" in q and expected_workers:
             missing = sorted(set(expected_workers) - {n["name"] for n in inventory})
             return _format_confidence(

From b7d957ecd84b51eed8dc1699980dc09b255a9cf5 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 20:02:09 -0300
Subject: [PATCH 357/416] atlasbot: route subjective queries to LLM

---
 services/comms/atlasbot-deployment.yaml |  2 +-
 services/comms/scripts/atlasbot/bot.py  | 60 +++++++++++++++++--------
 2 files changed, 42 insertions(+), 20 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 609c2450..d8ce3ee8 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-68
+        checksum/atlasbot-configmap: manual-atlasbot-69
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index db0f5609..141b9714 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -1911,19 +1911,6 @@ def cluster_answer(
     history_lines: list[str] | None = None,
 ) -> str:
     metrics_summary = snapshot_context(prompt, snapshot)
-    if _is_insight_query(prompt):
-        candidates = _insight_candidates(inventory, snapshot)
-        used_keys = _recent_insight_keys(history_lines or [])
-        selected = _select_insight(prompt, candidates, used_keys=used_keys)
-        if selected:
-            key, raw_text, confidence = selected
-            formatted = _format_insight_text(key, raw_text)
-            if not formatted:
-                formatted = raw_text
-            prefix = _insight_prefix(prompt)
-            if prefix:
-                formatted = prefix + formatted
-            return _format_confidence(formatted, confidence)
     structured = structured_answer(
         prompt,
         inventory=inventory,
@@ -2422,6 +2409,17 @@ def _history_payload_lines(history_payload: list[Any]) -> list[str]:
     return [line for line in lines if line]
 
 
+def _append_history_context(context: str, history_lines: list[str]) -> str:
+    lines = [line.strip() for line in history_lines if isinstance(line, str) and line.strip()]
+    if not lines:
+        return context
+    snippet = "\n".join(lines[-6:])
+    combined = context + "\nRecent chat:\n" + snippet if context else "Recent chat:\n" + snippet
+    if len(combined) > MAX_CONTEXT_CHARS:
+        combined = combined[: MAX_CONTEXT_CHARS - 3].rstrip() + "..."
+    return combined
+
+
 # Internal HTTP endpoint for cluster answers (website uses this).
 class _AtlasbotHandler(BaseHTTPRequestHandler):
     server_version = "AtlasbotHTTP/1.0"
@@ -2493,15 +2491,25 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
             )
         fallback = "I don't have enough data to answer that."
         if cluster_query:
-            answer = cluster_answer(
+            facts_answer = cluster_answer(
                 cleaned,
                 inventory=inventory,
                 snapshot=snapshot,
                 workloads=workloads,
                 history_lines=history_lines,
             )
-            if not answer:
-                answer = fallback
+            open_ended = _is_subjective_query(cleaned) or _knowledge_intent(cleaned)
+            if open_ended:
+                llm_context = _append_history_context(context, history_lines)
+                answer = ollama_reply(
+                    ("http", "internal"),
+                    cleaned,
+                    context=llm_context,
+                    fallback=facts_answer or fallback,
+                    use_history=False,
+                )
+            else:
+                answer = facts_answer or fallback
         else:
             llm_prompt = cleaned
             answer = ollama_reply(
@@ -2761,11 +2769,13 @@ def _ollama_call(hist_key, prompt: str, *, context: str, use_history: bool = Tru
         "When a cluster snapshot is provided, never answer about unrelated meanings of 'Atlas' (maps, mythology, Apache Atlas, etc). "
         "Treat 'hottest' as highest utilization (CPU/RAM/NET/IO) rather than temperature. "
         "If you infer or synthesize, say 'Based on the snapshot' and keep it brief. "
+        "For subjective prompts (interesting, favorite, unconventional), pick one or two observations from the context, explain why they stand out in 1-2 sentences, and avoid repeating the same observation as the last response if you can. "
         "Prefer exact repo paths and Kubernetes resource names when relevant. "
         "Never include or request secret values. "
         "Do not suggest commands unless explicitly asked. "
         "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. "
         "Translate metrics into natural language instead of echoing raw label/value pairs. "
+        "Avoid bare lists unless the user asked for a list; weave numbers into sentences. "
         "Do not answer by only listing runbooks; if the question is about Atlas/Othrys, summarize the cluster first and mention docs only if useful. "
         "If the question is not about Atlas/Othrys and no cluster context is provided, answer using general knowledge and say when you are unsure. "
         "If the answer is not grounded in the provided context or tool data, say you do not know. "
@@ -2974,15 +2984,27 @@ def sync_loop(token: str, room_id: str):
                 fallback = "I don't have enough data to answer that."
 
                 if cluster_query:
-                    reply = cluster_answer(
+                    facts_answer = cluster_answer(
                         cleaned_body,
                         inventory=inventory,
                         snapshot=snapshot,
                         workloads=workloads,
                         history_lines=history[hist_key],
                     )
-                    if not reply:
-                        reply = fallback
+                    open_ended = _is_subjective_query(cleaned_body) or _knowledge_intent(cleaned_body)
+                    if open_ended:
+                        llm_context = _append_history_context(context, history[hist_key])
+                        reply = ollama_reply_with_thinking(
+                            token,
+                            rid,
+                            hist_key,
+                            cleaned_body,
+                            context=llm_context,
+                            fallback=facts_answer or fallback,
+                            use_history=False,
+                        )
+                    else:
+                        reply = facts_answer or fallback
                 else:
                     llm_prompt = cleaned_body
                     reply = ollama_reply_with_thinking(

From df56eeddb32e7a0ab503a2bf10c92da70bdcb0d6 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 21:02:20 -0300
Subject: [PATCH 358/416] atlasbot: refine open-ended reasoning pipeline

---
 services/comms/atlasbot-deployment.yaml |   6 +-
 services/comms/scripts/atlasbot/bot.py  | 446 +++++++++++++++++++++---
 2 files changed, 401 insertions(+), 51 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index d8ce3ee8..cc628dd9 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-69
+        checksum/atlasbot-configmap: manual-atlasbot-70
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
@@ -78,11 +78,11 @@ spec:
             - name: BOT_USER
               value: atlasbot
             - name: BOT_MENTIONS
-              value: atlasbot,aatlasbot
+              value: atlasbot,aatlasbot,atlas_quick,atlas_smart
             - name: OLLAMA_URL
               value: http://ollama.ai.svc.cluster.local:11434
             - name: OLLAMA_MODEL
-              value: qwen2.5:14b-instruct-q4_0
+              value: qwen2.5:14b-instruct
             - name: OLLAMA_TIMEOUT_SEC
               value: "600"
             - name: ATLASBOT_THINKING_INTERVAL_SEC
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 141b9714..aa7e6148 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -333,6 +333,19 @@ def _strip_bot_mention(text: str) -> str:
     return cleaned or text.strip()
 
 
+def _detect_mode_from_body(body: str, *, default: str = "deep") -> str:
+    lower = normalize_query(body or "")
+    if "atlas_quick" in lower or "atlas-quick" in lower:
+        return "fast"
+    if "atlas_smart" in lower or "atlas-smart" in lower:
+        return "deep"
+    if lower.startswith("quick ") or lower.startswith("fast "):
+        return "fast"
+    if lower.startswith("smart ") or lower.startswith("deep "):
+        return "deep"
+    return default
+
+
 # Matrix HTTP helper.
 def req(method: str, path: str, token: str | None = None, body=None, timeout=60, base: str | None = None):
     url = (base or BASE) + path
@@ -2420,6 +2433,300 @@ def _append_history_context(context: str, history_lines: list[str]) -> str:
     return combined
 
 
+class ThoughtState:
+    def __init__(self, total_steps: int = 0):
+        self._lock = threading.Lock()
+        self.stage = "starting"
+        self.note = ""
+        self.step = 0
+        self.total_steps = total_steps
+
+    def update(self, stage: str, *, note: str = "", step: int | None = None) -> None:
+        with self._lock:
+            self.stage = stage
+            if note:
+                self.note = note
+            if step is not None:
+                self.step = step
+
+    def status_line(self) -> str:
+        with self._lock:
+            stage = self.stage
+            note = self.note
+            step = self.step
+            total = self.total_steps
+        step_part = f"{step}/{total}" if total else str(step) if step else ""
+        detail = f"Stage {step_part}: {stage}".strip()
+        if note:
+            return f"Still thinking ({detail}). Latest insight: {note}"
+        return f"Still thinking ({detail})."
+
+
+def _ollama_json_call(prompt: str, *, context: str, retries: int = 2) -> dict[str, Any]:
+    system = (
+        "System: You are Atlas, a reasoning assistant. "
+        "Return strict JSON only (no code fences, no trailing commentary). "
+        "If you cannot comply, return {}. "
+        "Only use facts from the provided context. "
+        "If you make an inference, label it as 'inference' in the JSON."
+    )
+    last_exc: Exception | None = None
+    for attempt in range(max(1, retries + 1)):
+        try:
+            raw = _ollama_call(
+                ("json", "internal"),
+                prompt,
+                context=context,
+                use_history=False,
+                system_override=system,
+            )
+            cleaned = _strip_code_fence(raw).strip()
+            if cleaned.startswith("{") and cleaned.endswith("}"):
+                return json.loads(cleaned)
+            last = json.loads(_strip_code_fence(cleaned))
+            if isinstance(last, dict):
+                return last
+        except Exception as exc:  # noqa: BLE001
+            last_exc = exc
+            time.sleep(min(2, 2 ** attempt))
+    if last_exc:
+        return {}
+    return {}
+
+
+def _fact_pack_lines(
+    prompt: str,
+    *,
+    inventory: list[dict[str, Any]],
+    snapshot: dict[str, Any] | None,
+    workloads: list[dict[str, Any]] | None,
+) -> list[str]:
+    raw = facts_context(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads)
+    lines: list[str] = []
+    for line in raw.splitlines():
+        trimmed = line.strip()
+        if not trimmed or trimmed.lower().startswith("facts"):
+            continue
+        lines.append(trimmed)
+    return lines
+
+
+def _fact_pack_text(lines: list[str]) -> str:
+    labeled = [f"F{idx + 1}: {line}" for idx, line in enumerate(lines)]
+    return "Fact pack:\n" + "\n".join(labeled)
+
+
+def _open_ended_system() -> str:
+    return (
+        "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. "
+        "Use ONLY the provided fact pack and recent chat as your evidence. "
+        "You may draw light inferences if you label them as such. "
+        "Write concise, human sentences, not a list. "
+        "If the question is subjective, share a light opinion grounded in facts. "
+        "If the question is ambiguous, pick a reasonable interpretation and state it briefly. "
+        "Avoid repeating the exact same observation as the last response if possible. "
+        "Do not invent numbers or facts. "
+        "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100)."
+    )
+
+
+def _candidate_note(candidate: dict[str, Any]) -> str:
+    claim = str(candidate.get("claim") or candidate.get("summary") or "")
+    return claim[:160] + ("…" if len(claim) > 160 else "")
+
+
+def _ensure_scores(answer: str) -> str:
+    text = answer.strip()
+    lines = [line for line in text.splitlines() if line.strip()]
+    has_relevance = any(line.lower().startswith("relevance:") for line in lines)
+    has_satisfaction = any(line.lower().startswith("satisfaction:") for line in lines)
+    has_confidence = any("confidence:" in line.lower() for line in lines)
+    if not has_confidence:
+        lines.append("Confidence: medium")
+    if not has_relevance:
+        lines.append("Relevance: 70")
+    if not has_satisfaction:
+        lines.append("Satisfaction: 70")
+    return "\n".join(lines)
+
+
+def _open_ended_fast(
+    prompt: str,
+    *,
+    fact_pack: str,
+    history_lines: list[str],
+    state: ThoughtState | None = None,
+) -> str:
+    if state:
+        state.update("synthesizing", step=2)
+    synthesis_prompt = (
+        "You are given a question and a fact pack. "
+        "Answer in 2-4 sentences, using only facts from the pack. "
+        "Pick one or two facts that best fit the question and explain why they matter. "
+        "If the question is subjective, add a light opinion grounded in those facts. "
+        "Do not list raw facts; speak naturally. "
+        "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100).\n"
+        f"Question: {prompt}"
+    )
+    context = _append_history_context(fact_pack, history_lines)
+    reply = _ollama_call(
+        ("fast", "open"),
+        synthesis_prompt,
+        context=context,
+        use_history=False,
+        system_override=_open_ended_system(),
+    )
+    return _ensure_scores(reply)
+
+
+def _interpret_open_question(
+    prompt: str,
+    *,
+    fact_pack: str,
+    history_lines: list[str],
+) -> dict[str, Any]:
+    prompt_text = (
+        "Analyze the question against the fact pack. "
+        "Return JSON: {\"focus\":\"...\",\"preference\":\"balanced|novelty|utilization|stability|risk\","
+        "\"notes\":\"...\"}. "
+        "Use only the fact pack."
+    )
+    context = _append_history_context(fact_pack, history_lines)
+    analysis = _ollama_json_call(prompt_text + f" Question: {prompt}", context=context)
+    if not isinstance(analysis, dict):
+        return {"focus": "cluster snapshot", "preference": "balanced", "notes": ""}
+    preference = analysis.get("preference") or "balanced"
+    if preference not in ("balanced", "novelty", "utilization", "stability", "risk"):
+        preference = "balanced"
+    analysis["preference"] = preference
+    analysis.setdefault("focus", "cluster snapshot")
+    analysis.setdefault("notes", "")
+    return analysis
+
+
+def _select_insights(
+    prompt: str,
+    *,
+    fact_pack: str,
+    history_lines: list[str],
+    state: ThoughtState,
+) -> list[dict[str, Any]]:
+    insight_prompt = (
+        "From the fact pack, select 3-5 candidate insights that could answer the question. "
+        "Return JSON: {\"insights\":[{\"summary\":\"...\",\"fact_ids\":[\"F1\"],"
+        "\"relevance\":0-1,\"novelty\":0-1,\"rationale\":\"...\"}]}. "
+        "Use only the fact pack."
+    )
+    state.update("drafting candidates", step=2)
+    context = _append_history_context(fact_pack, history_lines)
+    result = _ollama_json_call(insight_prompt + f" Question: {prompt}", context=context)
+    insights = result.get("insights") if isinstance(result, dict) else None
+    if not isinstance(insights, list):
+        return []
+    cleaned: list[dict[str, Any]] = []
+    for item in insights:
+        if not isinstance(item, dict):
+            continue
+        if not item.get("summary") or not item.get("fact_ids"):
+            continue
+        cleaned.append(item)
+        state.update("drafting candidates", step=2, note=_candidate_note(item))
+    return cleaned
+
+
+def _score_insight(insight: dict[str, Any], preference: str) -> float:
+    relevance = insight.get("relevance") if isinstance(insight.get("relevance"), (int, float)) else 0.0
+    novelty = insight.get("novelty") if isinstance(insight.get("novelty"), (int, float)) else 0.0
+    if preference == "novelty":
+        return 0.4 * relevance + 0.6 * novelty
+    if preference == "utilization":
+        return 0.7 * relevance + 0.3 * novelty
+    if preference == "stability":
+        return 0.7 * relevance + 0.3 * novelty
+    if preference == "risk":
+        return 0.6 * relevance + 0.4 * novelty
+    return 0.6 * relevance + 0.4 * novelty
+
+
+def _open_ended_deep(
+    prompt: str,
+    *,
+    fact_pack: str,
+    fact_ids: set[str],
+    history_lines: list[str],
+    state: ThoughtState | None = None,
+) -> str:
+    state = state or ThoughtState()
+    if not fact_ids:
+        return _ensure_scores("I don't have enough data to answer that.")
+    state.total_steps = 6
+    state.update("planning", step=1)
+    analysis = _interpret_open_question(prompt, fact_pack=fact_pack, history_lines=history_lines)
+    state.update("planning", step=1, note=str(analysis.get("focus") or ""))
+
+    candidates = _select_insights(prompt, fact_pack=fact_pack, history_lines=history_lines, state=state)
+    state.update("verifying", step=3)
+    filtered: list[dict[str, Any]] = []
+    for cand in candidates:
+        cites = cand.get("fact_ids") if isinstance(cand.get("fact_ids"), list) else []
+        if cites and not all(cite in fact_ids for cite in cites):
+            continue
+        filtered.append(cand)
+    if not filtered:
+        filtered = candidates
+
+    preference = analysis.get("preference", "balanced")
+    ranked = sorted(filtered, key=lambda item: _score_insight(item, preference), reverse=True)
+    top = ranked[:2]
+    state.update("synthesizing", step=4)
+    synth_prompt = (
+        "Use the question, fact pack, and selected insights to craft a concise answer. "
+        "Write 2-4 sentences. Explain why the selected insights stand out. "
+        "If the question is subjective, include a light opinion grounded in facts. "
+        "Avoid repeating the same observation as the last response if possible. "
+        "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100).\n"
+        f"Question: {prompt}\n"
+        f"Interpretation: {json.dumps(analysis, ensure_ascii=False)}\n"
+        f"Selected: {json.dumps(top, ensure_ascii=False)}"
+    )
+    context = _append_history_context(fact_pack, history_lines)
+    reply = _ollama_call(
+        ("deep", "open"),
+        synth_prompt,
+        context=context,
+        use_history=False,
+        system_override=_open_ended_system(),
+    )
+    state.update("done", step=6)
+    return _ensure_scores(reply)
+
+
+def open_ended_answer(
+    prompt: str,
+    *,
+    inventory: list[dict[str, Any]],
+    snapshot: dict[str, Any] | None,
+    workloads: list[dict[str, Any]],
+    history_lines: list[str],
+    mode: str,
+    state: ThoughtState | None = None,
+) -> str:
+    lines = _fact_pack_lines(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads)
+    if not lines:
+        return _ensure_scores("I don't have enough data to answer that.")
+    fact_pack = _fact_pack_text(lines)
+    fact_ids = {f"F{i+1}" for i in range(len(lines))}
+    if mode == "fast":
+        return _open_ended_fast(prompt, fact_pack=fact_pack, history_lines=history_lines, state=state)
+    return _open_ended_deep(prompt, fact_pack=fact_pack, fact_ids=fact_ids, history_lines=history_lines, state=state)
+
+
+def _non_cluster_reply(prompt: str) -> str:
+    return _ensure_scores(
+        "I focus on the Atlas/Othrys cluster and don't have enough data to answer that."
+    )
+
+
 # Internal HTTP endpoint for cluster answers (website uses this).
 class _AtlasbotHandler(BaseHTTPRequestHandler):
     server_version = "AtlasbotHTTP/1.0"
@@ -2466,6 +2773,9 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
             self._write_json(400, {"error": "missing_prompt"})
             return
         cleaned = _strip_bot_mention(prompt)
+        mode = str(payload.get("mode") or "fast").lower()
+        if mode not in ("fast", "deep"):
+            mode = "fast"
         snapshot = _snapshot_state()
         inventory = _snapshot_inventory(snapshot) or node_inventory_live()
         workloads = _snapshot_workloads(snapshot)
@@ -2491,34 +2801,30 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
             )
         fallback = "I don't have enough data to answer that."
         if cluster_query:
-            facts_answer = cluster_answer(
-                cleaned,
-                inventory=inventory,
-                snapshot=snapshot,
-                workloads=workloads,
-                history_lines=history_lines,
-            )
             open_ended = _is_subjective_query(cleaned) or _knowledge_intent(cleaned)
             if open_ended:
-                llm_context = _append_history_context(context, history_lines)
-                answer = ollama_reply(
-                    ("http", "internal"),
+                answer = open_ended_answer(
                     cleaned,
-                    context=llm_context,
-                    fallback=facts_answer or fallback,
-                    use_history=False,
+                    inventory=inventory,
+                    snapshot=snapshot,
+                    workloads=workloads,
+                    history_lines=history_lines,
+                    mode=mode,
+                    state=None,
                 )
             else:
-                answer = facts_answer or fallback
+                answer = (
+                    cluster_answer(
+                        cleaned,
+                        inventory=inventory,
+                        snapshot=snapshot,
+                        workloads=workloads,
+                        history_lines=history_lines,
+                    )
+                    or fallback
+                )
         else:
-            llm_prompt = cleaned
-            answer = ollama_reply(
-                ("http", "internal"),
-                llm_prompt,
-                context=context,
-                fallback=fallback,
-                use_history=False,
-            )
+            answer = _non_cluster_reply(cleaned)
         self._write_json(200, {"answer": answer})
 
 
@@ -2760,8 +3066,15 @@ def knowledge_summary(prompt: str, inventory: list[dict[str, Any]]) -> str:
     summary = "\n".join(parts).strip()
     return _format_confidence(summary, "medium") if summary else ""
 
-def _ollama_call(hist_key, prompt: str, *, context: str, use_history: bool = True) -> str:
-    system = (
+def _ollama_call(
+    hist_key,
+    prompt: str,
+    *,
+    context: str,
+    use_history: bool = True,
+    system_override: str | None = None,
+) -> str:
+    system = system_override or (
         "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. "
         "Be helpful, direct, and concise. "
         "Use the provided context and facts as your source of truth. "
@@ -2877,6 +3190,47 @@ def ollama_reply_with_thinking(
     thread.join(timeout=1)
     return result["reply"] or fallback or "Model backend is busy. Try again in a moment."
 
+
+def open_ended_with_thinking(
+    token: str,
+    room: str,
+    prompt: str,
+    *,
+    inventory: list[dict[str, Any]],
+    snapshot: dict[str, Any] | None,
+    workloads: list[dict[str, Any]],
+    history_lines: list[str],
+    mode: str,
+) -> str:
+    result: dict[str, str] = {"reply": ""}
+    done = threading.Event()
+    total_steps = 2 if mode == "fast" else 6
+    state = ThoughtState(total_steps=total_steps)
+
+    def worker():
+        result["reply"] = open_ended_answer(
+            prompt,
+            inventory=inventory,
+            snapshot=snapshot,
+            workloads=workloads,
+            history_lines=history_lines,
+            mode=mode,
+            state=state,
+        )
+        done.set()
+
+    thread = threading.Thread(target=worker, daemon=True)
+    thread.start()
+    if not done.wait(2.0):
+        send_msg(token, room, "Thinking…")
+        heartbeat = max(10, THINKING_INTERVAL_SEC)
+        next_heartbeat = time.monotonic() + heartbeat
+        while not done.wait(max(0, next_heartbeat - time.monotonic())):
+            send_msg(token, room, state.status_line())
+            next_heartbeat += heartbeat
+    thread.join(timeout=1)
+    return result["reply"] or "Model backend is busy. Try again in a moment."
+
 def sync_loop(token: str, room_id: str):
     since = None
     try:
@@ -2931,6 +3285,7 @@ def sync_loop(token: str, room_id: str):
 
                 cleaned_body = _strip_bot_mention(body)
                 lower_body = cleaned_body.lower()
+                mode = _detect_mode_from_body(body, default="deep" if is_dm else "deep")
 
                 # Only do live cluster introspection in DMs.
                 allow_tools = is_dm
@@ -2984,39 +3339,34 @@ def sync_loop(token: str, room_id: str):
                 fallback = "I don't have enough data to answer that."
 
                 if cluster_query:
-                    facts_answer = cluster_answer(
-                        cleaned_body,
-                        inventory=inventory,
-                        snapshot=snapshot,
-                        workloads=workloads,
-                        history_lines=history[hist_key],
-                    )
                     open_ended = _is_subjective_query(cleaned_body) or _knowledge_intent(cleaned_body)
                     if open_ended:
-                        llm_context = _append_history_context(context, history[hist_key])
-                        reply = ollama_reply_with_thinking(
+                        reply = open_ended_with_thinking(
                             token,
                             rid,
-                            hist_key,
                             cleaned_body,
-                            context=llm_context,
-                            fallback=facts_answer or fallback,
-                            use_history=False,
+                            inventory=inventory,
+                            snapshot=snapshot,
+                            workloads=workloads,
+                            history_lines=history[hist_key],
+                            mode=mode if mode in ("fast", "deep") else "deep",
                         )
                     else:
-                        reply = facts_answer or fallback
+                        reply = (
+                            cluster_answer(
+                                cleaned_body,
+                                inventory=inventory,
+                                snapshot=snapshot,
+                                workloads=workloads,
+                                history_lines=history[hist_key],
+                            )
+                            or fallback
+                        )
                 else:
-                    llm_prompt = cleaned_body
-                    reply = ollama_reply_with_thinking(
-                        token,
-                        rid,
-                        hist_key,
-                        llm_prompt,
-                        context=context,
-                        fallback=fallback,
-                        use_history=False,
-                    )
+                    reply = _non_cluster_reply(cleaned_body)
                 send_msg(token, rid, reply)
+                history[hist_key].append(f"Atlas: {reply}")
+                history[hist_key] = history[hist_key][-80:]
 
 def login_with_retry():
     last_err = None

From 2cab1d2f280d4306f4d34d4b04562a1af3ea5be6 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 21:09:48 -0300
Subject: [PATCH 359/416] atlasbot: guard open-ended LLM calls

---
 services/comms/scripts/atlasbot/bot.py | 28 ++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index aa7e6148..47458ea3 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -2530,6 +2530,26 @@ def _open_ended_system() -> str:
     )
 
 
+def _ollama_call_safe(
+    hist_key,
+    prompt: str,
+    *,
+    context: str,
+    fallback: str,
+    system_override: str | None = None,
+) -> str:
+    try:
+        return _ollama_call(
+            hist_key,
+            prompt,
+            context=context,
+            use_history=False,
+            system_override=system_override,
+        )
+    except Exception:
+        return fallback
+
+
 def _candidate_note(candidate: dict[str, Any]) -> str:
     claim = str(candidate.get("claim") or candidate.get("summary") or "")
     return claim[:160] + ("…" if len(claim) > 160 else "")
@@ -2569,11 +2589,11 @@ def _open_ended_fast(
         f"Question: {prompt}"
     )
     context = _append_history_context(fact_pack, history_lines)
-    reply = _ollama_call(
+    reply = _ollama_call_safe(
         ("fast", "open"),
         synthesis_prompt,
         context=context,
-        use_history=False,
+        fallback="I don't have enough data to answer that.",
         system_override=_open_ended_system(),
     )
     return _ensure_scores(reply)
@@ -2690,11 +2710,11 @@ def _open_ended_deep(
         f"Selected: {json.dumps(top, ensure_ascii=False)}"
     )
     context = _append_history_context(fact_pack, history_lines)
-    reply = _ollama_call(
+    reply = _ollama_call_safe(
         ("deep", "open"),
         synth_prompt,
         context=context,
-        use_history=False,
+        fallback="I don't have enough data to answer that.",
         system_override=_open_ended_system(),
     )
     state.update("done", step=6)

From 9c042c78361fe9f7d13aa4ccf6da059b771c05f8 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 21:11:58 -0300
Subject: [PATCH 360/416] atlasbot: bump rollout checksum

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index cc628dd9..97567eb6 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-70
+        checksum/atlasbot-configmap: manual-atlasbot-71
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From 056f512d6769b7980aeab77fd0ddc698f33137a7 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 21:16:47 -0300
Subject: [PATCH 361/416] atlasbot: add model fallback and rollout

---
 services/comms/atlasbot-deployment.yaml |  4 +++-
 services/comms/scripts/atlasbot/bot.py  | 24 +++++++++++++++++-------
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 97567eb6..7414f1e0 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-71
+        checksum/atlasbot-configmap: manual-atlasbot-72
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
@@ -83,6 +83,8 @@ spec:
               value: http://ollama.ai.svc.cluster.local:11434
             - name: OLLAMA_MODEL
               value: qwen2.5:14b-instruct
+            - name: OLLAMA_FALLBACK_MODEL
+              value: qwen2.5:14b-instruct-q4_0
             - name: OLLAMA_TIMEOUT_SEC
               value: "600"
             - name: ATLASBOT_THINKING_INTERVAL_SEC
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 47458ea3..2c93b759 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -17,6 +17,7 @@ ROOM_ALIAS = "#othrys:live.bstein.dev"
 
 OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/")
 MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b-instruct-q4_0")
+FALLBACK_MODEL = os.environ.get("OLLAMA_FALLBACK_MODEL", "")
 API_KEY = os.environ.get("CHAT_API_KEY", "")
 OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480"))
 ATLASBOT_HTTP_PORT = int(os.environ.get("ATLASBOT_HTTP_PORT", "8090"))
@@ -3133,14 +3134,23 @@ def _ollama_call(
     if lock:
         lock.acquire()
     try:
-        with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp:
-            data = json.loads(resp.read().decode())
-            msg = data.get("message") if isinstance(data, dict) else None
-            if isinstance(msg, dict):
-                raw_reply = msg.get("content")
+        try:
+            with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp:
+                data = json.loads(resp.read().decode())
+        except error.HTTPError as exc:
+            if exc.code == 404 and FALLBACK_MODEL and FALLBACK_MODEL != payload["model"]:
+                payload["model"] = FALLBACK_MODEL
+                r = request.Request(endpoint, data=json.dumps(payload).encode(), headers=headers)
+                with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp:
+                    data = json.loads(resp.read().decode())
             else:
-                raw_reply = data.get("response") or data.get("reply") or data
-            reply = _normalize_reply(raw_reply) or "I'm here to help."
+                raise
+        msg = data.get("message") if isinstance(data, dict) else None
+        if isinstance(msg, dict):
+            raw_reply = msg.get("content")
+        else:
+            raw_reply = data.get("response") or data.get("reply") or data
+        reply = _normalize_reply(raw_reply) or "I'm here to help."
         if use_history:
             history[hist_key].append(f"Atlas: {reply}")
         return reply

From 1f6bbceb2446856fc2f9ffc19ac188b1d2129900 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 21:27:19 -0300
Subject: [PATCH 362/416] atlasbot: improve metric parsing and cluster intent

---
 services/comms/scripts/atlasbot/bot.py | 48 +++++++++++++++++++++++---
 1 file changed, 43 insertions(+), 5 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 2c93b759..b9bc0e64 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -190,6 +190,8 @@ _INSIGHT_HINT_WORDS = {
     "surprising",
     "weird",
     "odd",
+    "unusual",
+    "outlier",
     "fun",
     "cool",
     "unique",
@@ -540,6 +542,13 @@ def _detect_operation(q: str) -> str | None:
 
 def _detect_metric(q: str) -> str | None:
     tokens = set(_tokens(q))
+    expanded: set[str] = set(tokens)
+    for token in list(tokens):
+        for part in re.split(r"[-_]", token):
+            part = part.strip()
+            if len(part) >= 2:
+                expanded.add(part)
+    tokens = expanded
     for metric, phrases in METRIC_HINTS.items():
         for phrase in phrases:
             if " " in phrase:
@@ -1271,6 +1280,19 @@ def snapshot_metric_answer(
         pending = metrics.get("pods_pending")
         failed = metrics.get("pods_failed")
         succeeded = metrics.get("pods_succeeded")
+        status_terms = ("running", "pending", "failed", "succeeded", "completed")
+        if sum(1 for term in status_terms if term in q) > 1:
+            parts = []
+            if running is not None:
+                parts.append(f"running {running:.0f}")
+            if pending is not None:
+                parts.append(f"pending {pending:.0f}")
+            if failed is not None:
+                parts.append(f"failed {failed:.0f}")
+            if succeeded is not None:
+                parts.append(f"succeeded {succeeded:.0f}")
+            if parts:
+                return _format_confidence(f"Pods: {', '.join(parts)}.", "high")
         if "pending" in q and pending is not None:
             return _format_confidence(f"Pending pods: {pending:.0f}.", "high")
         if "failed" in q and failed is not None:
@@ -1345,7 +1367,17 @@ def structured_answer(
         if hw_line:
             return _format_confidence(hw_line, "high")
 
-    if op == "top" and metric is None:
+    if entity == "node" and op == "status" and metric is None:
+        summary = _nodes_summary_line(inventory, snapshot)
+        if summary:
+            return _format_confidence(summary, "high")
+
+    if entity == "node" and metric is None and any(word in q for word in ("hardware", "architecture", "class", "mix")):
+        hw_line = _hardware_mix_line(inventory)
+        if hw_line:
+            return _format_confidence(hw_line, "medium")
+
+    if op == "top" and metric is None and not any(word in q for word in ("hardware", "architecture", "class")):
         metric = "cpu"
 
     # Metrics-first when a metric or top operation is requested.
@@ -2807,8 +2839,11 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
             inventory=inventory,
             workloads=workloads,
         )
-        cluster_query = _is_cluster_query(cleaned, inventory=inventory, workloads=workloads) or (
-            _is_subjective_query(cleaned) and history_cluster
+        cluster_query = (
+            _is_cluster_query(cleaned, inventory=inventory, workloads=workloads)
+            or history_cluster
+            or _knowledge_intent(cleaned)
+            or _is_subjective_query(cleaned)
         )
         context = ""
         if cluster_query:
@@ -3347,8 +3382,11 @@ def sync_loop(token: str, room_id: str):
                     inventory=inventory,
                     workloads=workloads,
                 )
-                cluster_query = _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads) or (
-                    _is_subjective_query(cleaned_body) and history_cluster
+                cluster_query = (
+                    _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads)
+                    or history_cluster
+                    or _knowledge_intent(cleaned_body)
+                    or _is_subjective_query(cleaned_body)
                 )
                 context = ""
                 if cluster_query:

From fdf4896f7c10037cfc90d565bb8d107ca04794b1 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 21:27:52 -0300
Subject: [PATCH 363/416] atlasbot: roll pod after metric parsing update

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 7414f1e0..4e27b5a7 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-72
+        checksum/atlasbot-configmap: manual-atlasbot-73
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From b4f5fbeb2b3d3b6c50c7c5150637600efe7e4325 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 21:43:37 -0300
Subject: [PATCH 364/416] monitoring: unify gpu namespace usage

---
 scripts/dashboards_render_atlas.py            | 47 ++++++++++++++-----
 services/monitoring/dashboards/atlas-gpu.json |  6 +--
 .../monitoring/dashboards/atlas-overview.json |  2 +-
 .../monitoring/grafana-dashboard-gpu.yaml     |  6 +--
 .../grafana-dashboard-overview.yaml           |  2 +-
 5 files changed, 42 insertions(+), 21 deletions(-)

diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index 445de94b..2e5c73b6 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -208,32 +208,53 @@ def namespace_ram_raw(scope_var):
 
 
 def namespace_gpu_usage_instant(scope_var):
-    dcgm = f"sum(DCGM_FI_DEV_GPU_UTIL{{{namespace_gpu_selector(scope_var)}}}) by (namespace)"
-    jetson = jetson_gpu_usage_by_namespace(scope_var)
-    merged = (
-        f'label_replace({dcgm}, "source", "dcgm", "", "") '
-        f'or label_replace({jetson}, "source", "jetson", "", "")'
-    )
-    return f"sum by (namespace) ({merged})"
+    return gpu_usage_by_namespace(scope_var)
 
 
 def jetson_gpu_util_by_node():
     return 'max by (node) (jetson_gr3d_freq_percent{node!=""})'
 
 
-def jetson_gpu_util_by_hostname():
+def dcgm_gpu_util_by_node():
+    dcgm_pod = 'label_replace(DCGM_FI_DEV_GPU_UTIL, "pod", "$1", "Hostname", "(.*)")'
+    dcgm_ns = 'label_replace(' + dcgm_pod + ', "namespace", "monitoring", "", "")'
     return (
-        'label_replace(max by (node) (jetson_gr3d_freq_percent{node!=""}), '
-        '"Hostname", "$1", "node", "(.*)")'
+        "avg by (node) ("
+        f"{dcgm_ns} * on(namespace,pod) group_left(node) "
+        'kube_pod_info{namespace="monitoring"}'
+        ")"
     )
 
 
-def jetson_gpu_requests(scope_var):
+def gpu_util_by_node():  # PromQL fragment: DCGM per-node utilisation, falling back to the Jetson per-node series
+    return f"({dcgm_gpu_util_by_node()} or {jetson_gpu_util_by_node()})"  # parenthesised: PromQL "or" binds looser than "*" and "/", so a bare union would escape a caller's "* on(node) group_left()" join
+
+
+def gpu_util_by_hostname():
+    return 'label_replace(' + gpu_util_by_node() + ', "Hostname", "$1", "node", "(.*)")'
+
+
+def gpu_node_labels():  # PromQL fragment: kube_node_labels series for nodes with a non-empty accelerator label or label_jetson="true"
+    return '(kube_node_labels{label_accelerator=~".+"} or kube_node_labels{label_jetson="true"})'  # parenthesised: PromQL "or" binds looser than "*", so a bare union would leave the Jetson selector outside a caller's "* on(node) group_left()" join
+
+
+def gpu_requests_by_namespace_node(scope_var):
     return (
         "sum by (namespace,node) ("
         f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} '
         "* on(namespace,pod) group_left(node) kube_pod_info "
-        '* on(node) group_left(label_jetson) kube_node_labels{label_jetson="true"}'
+        f"* on(node) group_left() {gpu_node_labels()}"
+        ")"
+    )
+
+
+def gpu_usage_by_namespace(scope_var):
+    requests_by_ns = gpu_requests_by_namespace_node(scope_var)
+    total_by_node = f"sum by (node) ({requests_by_ns})"
+    return (
+        "sum by (namespace) ("
+        f"({requests_by_ns}) / clamp_min({total_by_node}, 1) "
+        f"* on(node) group_left() {gpu_util_by_node()}"
         ")"
     )
 
@@ -2695,7 +2716,7 @@ def build_gpu_dashboard():
         timeseries_panel(
             3,
             "GPU Util by Node",
-            f'(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{{pod!=""}})) or ({jetson_gpu_util_by_hostname()})',
+            gpu_util_by_hostname(),
             {"h": 8, "w": 12, "x": 0, "y": 8},
             unit="percent",
             legend="{{Hostname}}",
diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json
index 132f2766..8542c5e7 100644
--- a/services/monitoring/dashboards/atlas-gpu.json
+++ b/services/monitoring/dashboards/atlas-gpu.json
@@ -20,7 +20,7 @@
       },
       "targets": [
         {
-          "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))",
+          "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -89,7 +89,7 @@
       },
       "targets": [
         {
-          "expr": "sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))",
+          "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -126,7 +126,7 @@
       },
       "targets": [
         {
-          "expr": "(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})) or (label_replace(max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\"))",
+          "expr": "label_replace(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\")",
           "refId": "A",
           "legendFormat": "{{Hostname}}"
         }
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index b212c8cd..31b78674 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -1901,7 +1901,7 @@
       },
       "targets": [
         {
-          "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))",
+          "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml
index 55f63e84..8d3a3dd5 100644
--- a/services/monitoring/grafana-dashboard-gpu.yaml
+++ b/services/monitoring/grafana-dashboard-gpu.yaml
@@ -29,7 +29,7 @@ data:
           },
           "targets": [
             {
-              "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum 
by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))",
+              "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum 
by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -98,7 +98,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))",
+              "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -135,7 +135,7 @@ data:
           },
           "targets": [
             {
-              "expr": "(sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})) or (label_replace(max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\"))",
+              "expr": "label_replace(avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}), \"Hostname\", \"$1\", \"node\", \"(.*)\")",
               "refId": "A",
               "legendFormat": "{{Hostname}}"
             }
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index a8990024..2a7cc2b7 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -1910,7 +1910,7 @@ data:
           },
           "targets": [
             {
-              "expr": "(100 * (sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) / clamp_min((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) (label_replace(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace), \"source\", \"dcgm\", \"\", \"\") or label_replace(sum 
by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left(label_jetson) kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() max by (node) (jetson_gr3d_freq_percent{node!=\"\"})), \"source\", \"jetson\", \"\", \"\"))) or on() vector(0)) == bool 0))",
+              "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum 
by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }

From a255c60aed6d900500a397a03834cf0fef7a1fa3 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 21:46:58 -0300
Subject: [PATCH 365/416] monitoring: fix gpu idle label

---
 scripts/dashboards_render_atlas.py                  | 4 ++--
 services/monitoring/dashboards/atlas-gpu.json       | 4 ++--
 services/monitoring/dashboards/atlas-overview.json  | 2 +-
 services/monitoring/grafana-dashboard-gpu.yaml      | 4 ++--
 services/monitoring/grafana-dashboard-overview.yaml | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index 2e5c73b6..5db798d7 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -243,7 +243,7 @@ def gpu_requests_by_namespace_node(scope_var):
         "sum by (namespace,node) ("
         f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} '
         "* on(namespace,pod) group_left(node) kube_pod_info "
-        f"* on(node) group_left() {gpu_node_labels()}"
+        f"* on(node) group_left() ({gpu_node_labels()})"
         ")"
     )
 
@@ -254,7 +254,7 @@ def gpu_usage_by_namespace(scope_var):
     return (
         "sum by (namespace) ("
         f"({requests_by_ns}) / clamp_min({total_by_node}, 1) "
-        f"* on(node) group_left() {gpu_util_by_node()}"
+        f"* on(node) group_left() ({gpu_util_by_node()})"
         ")"
     )
 
diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json
index 8542c5e7..6f993d9b 100644
--- a/services/monitoring/dashboards/atlas-gpu.json
+++ b/services/monitoring/dashboards/atlas-gpu.json
@@ -20,7 +20,7 @@
       },
       "targets": [
         {
-          "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))",
+          "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * 
scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -89,7 +89,7 @@
       },
       "targets": [
         {
-          "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))",
+          "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 31b78674..1f8635bc 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -1901,7 +1901,7 @@
       },
       "targets": [
         {
-          "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))",
+          "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * 
scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml
index 8d3a3dd5..34079636 100644
--- a/services/monitoring/grafana-dashboard-gpu.yaml
+++ b/services/monitoring/grafana-dashboard-gpu.yaml
@@ -29,7 +29,7 @@ data:
           },
           "targets": [
             {
-              "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum 
by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))",
+              "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * 
scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -98,7 +98,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))",
+              "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index 2a7cc2b7..fdfe1a70 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -1910,7 +1910,7 @@ data:
           },
           "targets": [
             {
-              "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum 
by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"})), 1) * on(node) group_left() avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"}))) or on() vector(0)) == bool 0))",
+              "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * 
scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }

From ee13e8da30edb98cd85f36f0e01c8a98070d0c0d Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 21:52:07 -0300
Subject: [PATCH 366/416] atlasbot: refine open-ended reasoning

---
 services/comms/atlasbot-deployment.yaml |   2 +-
 services/comms/scripts/atlasbot/bot.py  | 410 ++++++++++++++++++++++--
 2 files changed, 378 insertions(+), 34 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 4e27b5a7..5e5bc05d 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-73
+        checksum/atlasbot-configmap: manual-atlasbot-74
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index b9bc0e64..01762934 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -138,6 +138,7 @@ CLUSTER_HINT_WORDS = {
     "cluster",
     "k8s",
     "kubernetes",
+    "health",
     "node",
     "nodes",
     "hardware",
@@ -211,6 +212,7 @@ _OVERVIEW_HINT_WORDS = {
     "explain",
     "tell me about",
     "what do you know",
+    "health",
 }
 
 _OLLAMA_LOCK = threading.Lock()
@@ -1220,6 +1222,8 @@ def snapshot_metric_answer(
     q = normalize_query(prompt)
     metric = _detect_metric(q)
     op = _detect_operation(q)
+    if op == "list" and metric in {"cpu", "ram", "net", "io"}:
+        op = "top"
     include_hw, exclude_hw = _detect_hardware_filters(q)
     nodes_in_query = _extract_titan_nodes(q)
     only_workers = "worker" in q or "workers" in q
@@ -1340,6 +1344,8 @@ def structured_answer(
     tokens = _tokens(q)
     op = _detect_operation(q)
     metric = _detect_metric(q)
+    if op == "list" and metric in {"cpu", "ram", "net", "io"}:
+        op = "top"
     entity = _detect_entity(q)
     include_hw, exclude_hw = _detect_hardware_filters(q)
     nodes_in_query = _extract_titan_nodes(q)
@@ -1646,6 +1652,37 @@ def _is_insight_query(query: str) -> bool:
     return False
 
 
+_FOLLOWUP_HINTS = (
+    "what about",
+    "how about",
+    "and what",
+    "and how",
+    "tell me more",
+    "anything else",
+    "something else",
+    "that one",
+    "those",
+    "them",
+    "it",
+    "this",
+    "that",
+    "else",
+    "another",
+    "again",
+)
+
+
+def _is_followup_query(query: str) -> bool:
+    q = normalize_query(query)
+    if not q:
+        return False
+    if any(hint in q for hint in _FOLLOWUP_HINTS):
+        return True
+    if len(q.split()) <= 3 and not any(word in q for word in _INSIGHT_HINT_WORDS):
+        return True
+    return False
+
+
 def _is_subjective_query(query: str) -> bool:
     q = normalize_query(query)
     if not q:
@@ -2541,6 +2578,12 @@ def _fact_pack_lines(
         if not trimmed or trimmed.lower().startswith("facts"):
             continue
         lines.append(trimmed)
+    if _knowledge_intent(prompt) or _doc_intent(prompt) or _is_overview_query(prompt):
+        kb_titles = kb_retrieve_titles(prompt, limit=4)
+        if kb_titles:
+            for kb_line in kb_titles.splitlines():
+                if kb_line.strip():
+                    lines.append(kb_line.strip())
     return lines
 
 
@@ -2549,12 +2592,194 @@ def _fact_pack_text(lines: list[str]) -> str:
     return "Fact pack:\n" + "\n".join(labeled)
 
 
+_ALLOWED_INSIGHT_TAGS = {
+    "availability",
+    "architecture",
+    "database",
+    "hardware",
+    "inventory",
+    "node_detail",
+    "os",
+    "pods",
+    "utilization",
+    "workloads",
+    "workers",
+}
+
+_DYNAMIC_TAGS = {"availability", "database", "pods", "utilization", "workloads"}
+_INVENTORY_TAGS = {"hardware", "architecture", "inventory", "workers", "node_detail", "os"}
+
+
+def _fact_line_tags(line: str) -> set[str]:
+    text = (line or "").lower()
+    tags: set[str] = set()
+    if any(key in text for key in ("nodes_total", "ready", "not_ready", "workers_ready", "workers_not_ready")):
+        tags.add("availability")
+    if "nodes_by_arch" in text or "arch " in text or "architecture" in text:
+        tags.add("architecture")
+    if any(key in text for key in ("rpi", "jetson", "amd64", "arm64", "non_raspberry_pi")):
+        tags.update({"hardware", "inventory"})
+    if "control_plane_nodes" in text or "worker_nodes" in text:
+        tags.add("inventory")
+    if any(key in text for key in ("hottest_", "node_usage", "cpu=", "ram=", "net=", "io=")):
+        tags.add("utilization")
+    if "postgres_" in text or "postgres connections" in text:
+        tags.add("database")
+    if "pods_" in text or "pod phases" in text:
+        tags.add("pods")
+    if "workloads" in text or "primary_node" in text:
+        tags.add("workloads")
+    if "node_details" in text:
+        tags.add("node_detail")
+    if "os mix" in text or "os" in text:
+        tags.add("os")
+    return tags & _ALLOWED_INSIGHT_TAGS
+
+
+def _fact_pack_meta(lines: list[str]) -> dict[str, dict[str, Any]]:
+    meta: dict[str, dict[str, Any]] = {}
+    for idx, line in enumerate(lines):
+        fid = f"F{idx + 1}"
+        tags = sorted(_fact_line_tags(line))
+        meta[fid] = {"tags": tags}
+    return meta
+
+
+def _history_tags(history_lines: list[str]) -> set[str]:
+    tags: set[str] = set()
+    for line in history_lines[-6:]:
+        tags.update(_fact_line_tags(line))
+    return tags & _ALLOWED_INSIGHT_TAGS
+
+
+def _seed_insights(
+    lines: list[str],
+    fact_meta: dict[str, dict[str, Any]],
+    *,
+    limit: int = 6,
+) -> list[dict[str, Any]]:
+    priority = [
+        "utilization",
+        "database",
+        "pods",
+        "workloads",
+        "availability",
+        "hardware",
+        "architecture",
+        "inventory",
+    ]
+    seeds: list[dict[str, Any]] = []
+    used_tags: set[str] = set()
+    for tag in priority:
+        for idx, line in enumerate(lines):
+            fid = f"F{idx + 1}"
+            tags = set(fact_meta.get(fid, {}).get("tags") or [])
+            if tag not in tags or fid in {s["fact_ids"][0] for s in seeds}:
+                continue
+            summary = line.lstrip("- ").strip()
+            seeds.append(
+                {
+                    "summary": summary,
+                    "fact_ids": [fid],
+                    "relevance": 0.5,
+                    "novelty": 0.5,
+                    "rationale": "seeded from fact pack",
+                    "tags": sorted(tags),
+                }
+            )
+            used_tags.update(tags)
+            if len(seeds) >= limit:
+                return seeds
+    return seeds
+
+
+def _insight_tags(insight: dict[str, Any], fact_meta: dict[str, dict[str, Any]]) -> set[str]:
+    tags: set[str] = set()
+    for fid in insight.get("fact_ids") if isinstance(insight.get("fact_ids"), list) else []:
+        tags.update(fact_meta.get(fid, {}).get("tags") or [])
+    raw_tags = insight.get("tags") if isinstance(insight.get("tags"), list) else []
+    tags.update(t for t in raw_tags if isinstance(t, str))
+    summary = insight.get("summary") or insight.get("claim") or ""
+    if isinstance(summary, str):
+        tags.update(_fact_line_tags(summary))
+    return tags & _ALLOWED_INSIGHT_TAGS
+
+
+def _insight_score(
+    insight: dict[str, Any],
+    *,
+    preference: str,
+    prefer_tags: set[str],
+    avoid_tags: set[str],
+    history_tags: set[str],
+    fact_meta: dict[str, dict[str, Any]],
+) -> float:
+    base = _score_insight(insight, preference)
+    tags = _insight_tags(insight, fact_meta)
+    if prefer_tags and tags:
+        base += 0.15 * len(tags & prefer_tags)
+    if avoid_tags and tags:
+        base -= 0.12 * len(tags & avoid_tags)
+    if history_tags and tags:
+        base -= 0.08 * len(tags & history_tags)
+    if preference == "novelty":
+        if tags & _DYNAMIC_TAGS:
+            base += 0.12
+        if tags & _INVENTORY_TAGS:
+            base -= 0.08
+    return base
+
+
+def _select_diverse_insights(
+    candidates: list[dict[str, Any]],
+    *,
+    preference: str,
+    prefer_tags: set[str],
+    avoid_tags: set[str],
+    history_tags: set[str],
+    fact_meta: dict[str, dict[str, Any]],
+    count: int = 2,
+) -> list[dict[str, Any]]:
+    scored: list[tuple[float, dict[str, Any]]] = []
+    for item in candidates:
+        tags = _insight_tags(item, fact_meta)
+        item["tags"] = sorted(tags)
+        score = _insight_score(
+            item,
+            preference=preference,
+            prefer_tags=prefer_tags,
+            avoid_tags=avoid_tags,
+            history_tags=history_tags,
+            fact_meta=fact_meta,
+        )
+        scored.append((score, item))
+    scored.sort(key=lambda pair: pair[0], reverse=True)
+    picked: list[dict[str, Any]] = []
+    used_tags: set[str] = set()
+    for _, item in scored:
+        tags = set(item.get("tags") or [])
+        if used_tags and tags and tags <= used_tags and len(picked) < count:
+            continue
+        picked.append(item)
+        used_tags.update(tags)
+        if len(picked) >= count:
+            break
+    if len(picked) < count:
+        for _, item in scored:
+            if item in picked:
+                continue
+            picked.append(item)
+            if len(picked) >= count:
+                break
+    return picked
+
+
 def _open_ended_system() -> str:
     return (
         "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. "
         "Use ONLY the provided fact pack and recent chat as your evidence. "
         "You may draw light inferences if you label them as such. "
-        "Write concise, human sentences, not a list. "
+        "Write concise, human sentences with a helpful, calm tone (not a list). "
         "If the question is subjective, share a light opinion grounded in facts. "
         "If the question is ambiguous, pick a reasonable interpretation and state it briefly. "
         "Avoid repeating the exact same observation as the last response if possible. "
@@ -2608,18 +2833,52 @@ def _open_ended_fast(
     *,
     fact_pack: str,
     history_lines: list[str],
+    fact_lines: list[str],
+    fact_meta: dict[str, dict[str, Any]],
+    tags_available: set[str],
+    history_tags: set[str],
     state: ThoughtState | None = None,
 ) -> str:
     if state:
-        state.update("synthesizing", step=2)
+        state.update("planning", step=1)
+    analysis = _interpret_open_question(
+        prompt,
+        fact_pack=fact_pack,
+        history_lines=history_lines,
+        tags_available=tags_available,
+        avoid_tags=history_tags,
+        state=state,
+    )
+    candidates = _select_insights(
+        prompt,
+        fact_pack=fact_pack,
+        history_lines=history_lines,
+        state=state or ThoughtState(),
+        analysis=analysis,
+        fact_lines=fact_lines,
+        fact_meta=fact_meta,
+        avoid_tags=history_tags,
+    )
+    prefer_tags = {t for t in analysis.get("tags", []) if isinstance(t, str)}
+    selected = _select_diverse_insights(
+        candidates,
+        preference=analysis.get("preference", "balanced"),
+        prefer_tags=prefer_tags,
+        avoid_tags=history_tags,
+        history_tags=history_tags,
+        fact_meta=fact_meta,
+        count=2,
+    )
+    if state:
+        state.update("synthesizing", step=3)
     synthesis_prompt = (
-        "You are given a question and a fact pack. "
-        "Answer in 2-4 sentences, using only facts from the pack. "
-        "Pick one or two facts that best fit the question and explain why they matter. "
-        "If the question is subjective, add a light opinion grounded in those facts. "
-        "Do not list raw facts; speak naturally. "
+        "Use the question, fact pack, and selected insights to answer in 2-4 sentences. "
+        "Speak naturally, not as a list. "
+        "If the question is subjective, add a light opinion grounded in facts. "
+        "Avoid repeating the exact same observation as the most recent response if possible. "
         "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100).\n"
-        f"Question: {prompt}"
+        f"Question: {prompt}\n"
+        f"Selected: {json.dumps(selected, ensure_ascii=False)}"
     )
     context = _append_history_context(fact_pack, history_lines)
     reply = _ollama_call_safe(
@@ -2637,23 +2896,36 @@ def _interpret_open_question(
     *,
     fact_pack: str,
     history_lines: list[str],
+    tags_available: set[str],
+    avoid_tags: set[str],
+    state: ThoughtState | None = None,
 ) -> dict[str, Any]:
+    tags_list = ", ".join(sorted(tags_available)) if tags_available else "none"
+    avoid_list = ", ".join(sorted(avoid_tags)) if avoid_tags else "none"
     prompt_text = (
         "Analyze the question against the fact pack. "
         "Return JSON: {\"focus\":\"...\",\"preference\":\"balanced|novelty|utilization|stability|risk\","
-        "\"notes\":\"...\"}. "
+        "\"tags\":[\"...\"] ,\"notes\":\"...\"}. "
+        "If the question implies interesting/unique/unconventional/cool, set preference to novelty "
+        "and prefer dynamic tags (utilization/pods/database/availability) when possible. "
+        f"Use only these tags if relevant: {tags_list}. Avoid tags: {avoid_list}. "
         "Use only the fact pack."
     )
     context = _append_history_context(fact_pack, history_lines)
     analysis = _ollama_json_call(prompt_text + f" Question: {prompt}", context=context)
     if not isinstance(analysis, dict):
-        return {"focus": "cluster snapshot", "preference": "balanced", "notes": ""}
+        analysis = {"focus": "cluster snapshot", "preference": "balanced", "notes": "", "tags": []}
     preference = analysis.get("preference") or "balanced"
     if preference not in ("balanced", "novelty", "utilization", "stability", "risk"):
         preference = "balanced"
     analysis["preference"] = preference
     analysis.setdefault("focus", "cluster snapshot")
     analysis.setdefault("notes", "")
+    tags = analysis.get("tags") if isinstance(analysis.get("tags"), list) else []
+    clean_tags = {t for t in tags if isinstance(t, str)}
+    analysis["tags"] = sorted(clean_tags & tags_available)
+    if state:
+        state.update("planning", step=1, note=str(analysis.get("focus") or ""))
     return analysis
 
 
@@ -2663,27 +2935,41 @@ def _select_insights(
     fact_pack: str,
     history_lines: list[str],
     state: ThoughtState,
+    analysis: dict[str, Any],
+    fact_lines: list[str],
+    fact_meta: dict[str, dict[str, Any]],
+    avoid_tags: set[str],
 ) -> list[dict[str, Any]]:
+    preferred_tags = analysis.get("tags") if isinstance(analysis.get("tags"), list) else []
+    prefer_list = ", ".join(sorted({t for t in preferred_tags if isinstance(t, str)}))
+    avoid_list = ", ".join(sorted(avoid_tags)) if avoid_tags else "none"
+    available_list = ", ".join(sorted({t for t in _ALLOWED_INSIGHT_TAGS}))
     insight_prompt = (
         "From the fact pack, select 3-5 candidate insights that could answer the question. "
         "Return JSON: {\"insights\":[{\"summary\":\"...\",\"fact_ids\":[\"F1\"],"
-        "\"relevance\":0-1,\"novelty\":0-1,\"rationale\":\"...\"}]}. "
-        "Use only the fact pack."
+        "\"relevance\":0-1,\"novelty\":0-1,\"rationale\":\"...\",\"tags\":[\"...\"]}]}. "
+        f"Available tags: {available_list}. Prefer tags: {prefer_list or 'none'}. Avoid tags: {avoid_list}. "
+        "Use only the fact pack and provided tags."
     )
     state.update("drafting candidates", step=2)
     context = _append_history_context(fact_pack, history_lines)
     result = _ollama_json_call(insight_prompt + f" Question: {prompt}", context=context)
     insights = result.get("insights") if isinstance(result, dict) else None
     if not isinstance(insights, list):
-        return []
+        insights = []
     cleaned: list[dict[str, Any]] = []
     for item in insights:
         if not isinstance(item, dict):
             continue
         if not item.get("summary") or not item.get("fact_ids"):
             continue
+        tags = _insight_tags(item, fact_meta)
+        item["tags"] = sorted(tags)
         cleaned.append(item)
         state.update("drafting candidates", step=2, note=_candidate_note(item))
+    seeds = _seed_insights(fact_lines, fact_meta)
+    for seed in seeds:
+        cleaned.append(seed)
     return cleaned
 
 
@@ -2707,18 +2993,36 @@ def _open_ended_deep(
     fact_pack: str,
     fact_ids: set[str],
     history_lines: list[str],
+    fact_lines: list[str],
+    fact_meta: dict[str, dict[str, Any]],
+    tags_available: set[str],
+    history_tags: set[str],
     state: ThoughtState | None = None,
 ) -> str:
     state = state or ThoughtState()
     if not fact_ids:
         return _ensure_scores("I don't have enough data to answer that.")
-    state.total_steps = 6
-    state.update("planning", step=1)
-    analysis = _interpret_open_question(prompt, fact_pack=fact_pack, history_lines=history_lines)
-    state.update("planning", step=1, note=str(analysis.get("focus") or ""))
+    state.total_steps = 7
+    analysis = _interpret_open_question(
+        prompt,
+        fact_pack=fact_pack,
+        history_lines=history_lines,
+        tags_available=tags_available,
+        avoid_tags=history_tags,
+        state=state,
+    )
 
-    candidates = _select_insights(prompt, fact_pack=fact_pack, history_lines=history_lines, state=state)
-    state.update("verifying", step=3)
+    candidates = _select_insights(
+        prompt,
+        fact_pack=fact_pack,
+        history_lines=history_lines,
+        state=state,
+        analysis=analysis,
+        fact_lines=fact_lines,
+        fact_meta=fact_meta,
+        avoid_tags=history_tags,
+    )
+    state.update("verifying", step=3, note="scoring insights")
     filtered: list[dict[str, Any]] = []
     for cand in candidates:
         cites = cand.get("fact_ids") if isinstance(cand.get("fact_ids"), list) else []
@@ -2729,9 +3033,17 @@ def _open_ended_deep(
         filtered = candidates
 
     preference = analysis.get("preference", "balanced")
-    ranked = sorted(filtered, key=lambda item: _score_insight(item, preference), reverse=True)
-    top = ranked[:2]
-    state.update("synthesizing", step=4)
+    prefer_tags = {t for t in analysis.get("tags", []) if isinstance(t, str)}
+    top = _select_diverse_insights(
+        filtered,
+        preference=preference,
+        prefer_tags=prefer_tags,
+        avoid_tags=history_tags,
+        history_tags=history_tags,
+        fact_meta=fact_meta,
+        count=2,
+    )
+    state.update("synthesizing", step=4, note="composing response")
     synth_prompt = (
         "Use the question, fact pack, and selected insights to craft a concise answer. "
         "Write 2-4 sentences. Explain why the selected insights stand out. "
@@ -2740,6 +3052,7 @@ def _open_ended_deep(
         "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100).\n"
         f"Question: {prompt}\n"
         f"Interpretation: {json.dumps(analysis, ensure_ascii=False)}\n"
+        f"Recent tags: {', '.join(sorted(history_tags)) if history_tags else 'none'}\n"
         f"Selected: {json.dumps(top, ensure_ascii=False)}"
     )
     context = _append_history_context(fact_pack, history_lines)
@@ -2750,7 +3063,7 @@ def _open_ended_deep(
         fallback="I don't have enough data to answer that.",
         system_override=_open_ended_system(),
     )
-    state.update("done", step=6)
+    state.update("done", step=7)
     return _ensure_scores(reply)
 
 
@@ -2769,9 +3082,31 @@ def open_ended_answer(
         return _ensure_scores("I don't have enough data to answer that.")
     fact_pack = _fact_pack_text(lines)
     fact_ids = {f"F{i+1}" for i in range(len(lines))}
+    fact_meta = _fact_pack_meta(lines)
+    tags_available = {tag for entry in fact_meta.values() for tag in entry.get("tags", [])}
+    history_tags = _history_tags(history_lines)
     if mode == "fast":
-        return _open_ended_fast(prompt, fact_pack=fact_pack, history_lines=history_lines, state=state)
-    return _open_ended_deep(prompt, fact_pack=fact_pack, fact_ids=fact_ids, history_lines=history_lines, state=state)
+        return _open_ended_fast(
+            prompt,
+            fact_pack=fact_pack,
+            history_lines=history_lines,
+            fact_lines=lines,
+            fact_meta=fact_meta,
+            tags_available=tags_available,
+            history_tags=history_tags,
+            state=state,
+        )
+    return _open_ended_deep(
+        prompt,
+        fact_pack=fact_pack,
+        fact_ids=fact_ids,
+        history_lines=history_lines,
+        fact_lines=lines,
+        fact_meta=fact_meta,
+        tags_available=tags_available,
+        history_tags=history_tags,
+        state=state,
+    )
 
 
 def _non_cluster_reply(prompt: str) -> str:
@@ -2826,9 +3161,9 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
             self._write_json(400, {"error": "missing_prompt"})
             return
         cleaned = _strip_bot_mention(prompt)
-        mode = str(payload.get("mode") or "fast").lower()
+        mode = str(payload.get("mode") or "deep").lower()
         if mode not in ("fast", "deep"):
-            mode = "fast"
+            mode = "deep"
         snapshot = _snapshot_state()
         inventory = _snapshot_inventory(snapshot) or node_inventory_live()
         workloads = _snapshot_workloads(snapshot)
@@ -2839,11 +3174,12 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
             inventory=inventory,
             workloads=workloads,
         )
+        followup = _is_followup_query(cleaned)
         cluster_query = (
             _is_cluster_query(cleaned, inventory=inventory, workloads=workloads)
-            or history_cluster
             or _knowledge_intent(cleaned)
             or _is_subjective_query(cleaned)
+            or (history_cluster and followup)
         )
         context = ""
         if cluster_query:
@@ -2857,7 +3193,11 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
             )
         fallback = "I don't have enough data to answer that."
         if cluster_query:
-            open_ended = _is_subjective_query(cleaned) or _knowledge_intent(cleaned)
+            open_ended = (
+                _is_subjective_query(cleaned)
+                or _knowledge_intent(cleaned)
+                or _is_overview_query(cleaned)
+            )
             if open_ended:
                 answer = open_ended_answer(
                     cleaned,
@@ -3068,7 +3408,6 @@ def _knowledge_intent(prompt: str) -> bool:
             "summary",
             "describe",
             "explain",
-            "what is",
         )
     )
 
@@ -3269,7 +3608,7 @@ def open_ended_with_thinking(
 ) -> str:
     result: dict[str, str] = {"reply": ""}
     done = threading.Event()
-    total_steps = 2 if mode == "fast" else 6
+    total_steps = 4 if mode == "fast" else 7
     state = ThoughtState(total_steps=total_steps)
 
     def worker():
@@ -3382,11 +3721,12 @@ def sync_loop(token: str, room_id: str):
                     inventory=inventory,
                     workloads=workloads,
                 )
+                followup = _is_followup_query(cleaned_body)
                 cluster_query = (
                     _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads)
-                    or history_cluster
                     or _knowledge_intent(cleaned_body)
                     or _is_subjective_query(cleaned_body)
+                    or (history_cluster and followup)
                 )
                 context = ""
                 if cluster_query:
@@ -3407,7 +3747,11 @@ def sync_loop(token: str, room_id: str):
                 fallback = "I don't have enough data to answer that."
 
                 if cluster_query:
-                    open_ended = _is_subjective_query(cleaned_body) or _knowledge_intent(cleaned_body)
+                    open_ended = (
+                        _is_subjective_query(cleaned_body)
+                        or _knowledge_intent(cleaned_body)
+                        or _is_overview_query(cleaned_body)
+                    )
                     if open_ended:
                         reply = open_ended_with_thinking(
                             token,

From 7b43e8654f5a820009e1a1723922fe2f159e9467 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 22:00:19 -0300
Subject: [PATCH 367/416] monitoring: send grafana alerts via postmark

---
 services/monitoring/helmrelease.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index 8e225d49..6185e595 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -339,7 +339,7 @@ spec:
       GF_AUTH_ANONYMOUS_ORG_NAME: "Overview"
       GF_AUTH_ANONYMOUS_ORG_ROLE: "Viewer"
       GF_SMTP_ENABLED: "true"
-      GF_SMTP_HOST: "mail.bstein.dev:587"
+      GF_SMTP_HOST: "smtp.postmarkapp.com:587"
       GF_SMTP_FROM: "no-reply-grafana@bstein.dev"
       GF_SMTP_FROM_NAME: "Atlas Grafana"
       GRAFANA_ALERT_EMAILS: "brad@bstein.dev"

From 1113b1625e6ecfd8dbabcfca70844de3710a390f Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 22:22:50 -0300
Subject: [PATCH 368/416] atlasbot: overhaul open-ended reasoning

---
 services/comms/atlasbot-deployment.yaml |   2 +-
 services/comms/scripts/atlasbot/bot.py  | 697 +++++++++---------------
 2 files changed, 253 insertions(+), 446 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 5e5bc05d..17e2cb2f 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-74
+        checksum/atlasbot-configmap: manual-atlasbot-75
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 01762934..06685217 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -198,6 +198,8 @@ _INSIGHT_HINT_WORDS = {
     "unique",
     "notable",
     "coolest",
+    "risk",
+    "risky",
     "favorite",
     "favourite",
     "trivia",
@@ -1641,17 +1643,6 @@ def _hottest_summary_line(metrics: dict[str, Any]) -> str:
     return "Hot spots: " + "; ".join(parts) + "."
 
 
-def _is_insight_query(query: str) -> bool:
-    q = normalize_query(query)
-    if not q:
-        return False
-    if any(word in q for word in _INSIGHT_HINT_WORDS):
-        return True
-    if "most" in q and any(word in q for word in ("unusual", "odd", "weird", "unconventional")):
-        return True
-    return False
-
-
 _FOLLOWUP_HINTS = (
     "what about",
     "how about",
@@ -1724,198 +1715,6 @@ def _doc_intent(query: str) -> bool:
     )
 
 
-def _insight_candidates(
-    inventory: list[dict[str, Any]],
-    snapshot: dict[str, Any] | None,
-) -> list[tuple[str, str, str]]:
-    metrics = _snapshot_metrics(snapshot)
-    candidates: list[tuple[str, str, str]] = []
-
-    nodes_line = _nodes_summary_line(inventory, snapshot)
-    if nodes_line and "not ready" in nodes_line.lower():
-        candidates.append(("availability", nodes_line, "high"))
-
-    hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {}
-    if hottest:
-        def _hot_node(entry: dict[str, Any]) -> str:
-            if not isinstance(entry, dict):
-                return ""
-            return (
-                entry.get("node")
-                or entry.get("label")
-                or (entry.get("metric") or {}).get("node")
-                or ""
-            )
-
-        cpu = hottest.get("cpu") if isinstance(hottest.get("cpu"), dict) else {}
-        cpu_node = _hot_node(cpu)
-        if cpu_node and cpu.get("value") is not None:
-            value_fmt = _format_metric_value(str(cpu.get("value")), percent=True)
-            candidates.append(("cpu", f"The busiest CPU right now is {cpu_node} at about {value_fmt}.", "high"))
-        ram = hottest.get("ram") if isinstance(hottest.get("ram"), dict) else {}
-        ram_node = _hot_node(ram)
-        if ram_node and ram.get("value") is not None:
-            value_fmt = _format_metric_value(str(ram.get("value")), percent=True)
-            candidates.append(("ram", f"RAM usage peaks on {ram_node} at about {value_fmt}.", "high"))
-
-    postgres_line = _postgres_summary_line(metrics)
-    if postgres_line:
-        candidates.append(("postgres", postgres_line, "high"))
-
-    hardware_insight = _hardware_insight(inventory)
-    if hardware_insight:
-        candidates.append(("hardware", hardware_insight, "medium"))
-
-    pods_line = _pods_summary_line(metrics)
-    if pods_line:
-        candidates.append(("pods", pods_line, "high"))
-
-    return candidates
-
-
-def _hardware_insight(inventory: list[dict[str, Any]]) -> str:
-    if not inventory:
-        return ""
-    groups = _group_nodes(inventory)
-    jetsons = groups.get("jetson") or []
-    rpi5 = groups.get("rpi5") or []
-    rpi4 = groups.get("rpi4") or []
-    amd64 = groups.get("amd64") or []
-    parts: list[str] = []
-    if rpi5:
-        parts.append(f"rpi5={len(rpi5)}")
-    if rpi4:
-        parts.append(f"rpi4={len(rpi4)}")
-    if jetsons:
-        jetson_names = ", ".join(jetsons[:2])
-        parts.append(f"jetson={len(jetsons)} ({jetson_names})")
-    if amd64:
-        parts.append(f"amd64={len(amd64)}")
-    return ", ".join(parts)
-
-
-def _recent_insight_keys(history_lines: list[str]) -> set[str]:
-    used: set[str] = set()
-    for line in history_lines[-10:]:
-        lower = normalize_query(line)
-        if not lower:
-            continue
-        if "postgres" in lower or "connections" in lower:
-            used.add("postgres")
-        if "atlas mixes" in lower or "hardware" in lower or "rpi" in lower or "jetson" in lower:
-            used.add("hardware")
-        if "busiest cpu" in lower or "cpu right now" in lower or "cpu " in lower:
-            used.add("cpu")
-        if "ram usage" in lower or "memory" in lower:
-            used.add("ram")
-        if "pods" in lower:
-            used.add("pods")
-        if "not ready" in lower:
-            used.add("availability")
-    return used
-
-
-def _select_insight(
-    prompt: str,
-    candidates: list[tuple[str, str, str]],
-    *,
-    used_keys: set[str] | None = None,
-) -> tuple[str, str, str] | None:
-    if not candidates:
-        return None
-    used = used_keys or set()
-    q = normalize_query(prompt)
-    prefer_keys: list[str] = []
-    if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")):
-        prefer_keys.extend(["hardware", "availability"])
-    if any(word in q for word in ("coolest", "favorite", "favourite", "trivia", "fun")):
-        prefer_keys.extend(["hardware", "cpu", "ram"])
-    if "interesting" in q and "most interesting" not in q:
-        prefer_keys.extend(["hardware", "postgres", "cpu", "ram"])
-    avoid_used = any(word in q for word in ("another", "else", "different", "other")) or "most interesting" in q
-    if any(word in q for word in ("another", "else", "different", "other")) and len(candidates) > 1:
-        for candidate in candidates:
-            if candidate[0] not in used:
-                return candidate
-        return candidates[1]
-    if prefer_keys:
-        for prefer in prefer_keys:
-            for key, text, conf in candidates:
-                if key == prefer and (not avoid_used or key not in used):
-                    return key, text, conf
-        for prefer in prefer_keys:
-            for key, text, conf in candidates:
-                if key == prefer:
-                    return key, text, conf
-    if used and avoid_used:
-        for candidate in candidates:
-            if candidate[0] not in used:
-                return candidate
-    return candidates[0]
-
-
-def _format_insight_text(key: str, text: str) -> str:
-    cleaned = text.strip().rstrip(".")
-    if not cleaned:
-        return ""
-    if key == "hardware":
-        counts = (
-            cleaned.replace("Hardware mix includes ", "")
-            .replace("Atlas mixes tiny ", "")
-            .replace("Atlas mixes ", "")
-            .replace("which is unusual for a homelab cluster", "")
-            .strip()
-            .strip(".")
-        )
-        has_jetson = "jetson=" in counts
-        has_amd64 = "amd64=" in counts
-        detail = f"mixed hardware stack ({counts})"
-        if has_jetson and has_amd64:
-            flavor = "It blends low-power Pis with Jetson accelerators and a couple of AMD64 boxes."
-        elif has_jetson:
-            flavor = "It pairs low-power Pis with Jetson accelerators for edge and AI workloads."
-        elif has_amd64:
-            flavor = "It mixes low-power Pis with a couple of heavier AMD64 nodes."
-        else:
-            flavor = "It is a pretty uniform hardware stack, which is rare for a homelab."
-        return f"{detail}. {flavor}"
-    if key == "postgres":
-        detail = cleaned.replace("Postgres is at ", "")
-        return f"Postgres is at {detail}; that feels like healthy, steady load rather than strain."
-    if key == "pods":
-        detail = cleaned.replace("There are ", "")
-        return f"Pods look steady ({detail}); nothing looks stuck or unhealthy."
-    if key == "availability":
-        return cleaned + " That is the kind of stability I like to see."
-    if key in ("cpu", "ram"):
-        suffix = (
-            " If you're chasing hotspots, that's the node I'd watch first."
-            if key == "cpu"
-            else " That box is carrying the heaviest memory load right now."
-        )
-        return cleaned + "." + suffix
-    return cleaned + "."
-
-
-def _insight_prefix(prompt: str) -> str:
-    q = normalize_query(prompt)
-    if "coolest" in q:
-        return "If I had to pick the coolest detail, I'd say "
-    if "favorite" in q or "favourite" in q:
-        return "My favorite detail is "
-    if "trivia" in q:
-        return "A bit of trivia I like: "
-    if "most interesting" in q:
-        return "The most interesting detail to me is "
-    if any(word in q for word in ("another", "else", "different", "other")):
-        return "Another interesting detail: "
-    if any(word in q for word in ("unconventional", "weird", "odd", "unique", "surprising")):
-        return "What stands out to me is that "
-    if any(word in q for word in ("interesting", "notable", "fun", "cool")):
-        return "One thing I'd call out is "
-    return ""
-
-
 def cluster_overview_answer(
     prompt: str,
     *,
@@ -2784,7 +2583,7 @@ def _open_ended_system() -> str:
         "If the question is ambiguous, pick a reasonable interpretation and state it briefly. "
         "Avoid repeating the exact same observation as the last response if possible. "
         "Do not invent numbers or facts. "
-        "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100)."
+        "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), HallucinationRisk (low|medium|high)."
     )
 
 
@@ -2809,263 +2608,284 @@ def _ollama_call_safe(
 
 
 def _candidate_note(candidate: dict[str, Any]) -> str:
-    claim = str(candidate.get("claim") or candidate.get("summary") or "")
+    claim = str(candidate.get("focus") or candidate.get("answer") or "")
     return claim[:160] + ("…" if len(claim) > 160 else "")
 
 
 def _ensure_scores(answer: str) -> str:
     text = answer.strip()
     lines = [line for line in text.splitlines() if line.strip()]
-    has_relevance = any(line.lower().startswith("relevance:") for line in lines)
-    has_satisfaction = any(line.lower().startswith("satisfaction:") for line in lines)
-    has_confidence = any("confidence:" in line.lower() for line in lines)
+    has_relevance = any(line.lower().startswith("relevance") for line in lines)
+    has_satisfaction = any(line.lower().startswith("satisfaction") for line in lines)
+    has_confidence = any(line.lower().startswith("confidence") for line in lines)
+    has_risk = any(line.lower().startswith("hallucinationrisk") for line in lines)
     if not has_confidence:
         lines.append("Confidence: medium")
     if not has_relevance:
         lines.append("Relevance: 70")
     if not has_satisfaction:
         lines.append("Satisfaction: 70")
+    if not has_risk:
+        lines.append("HallucinationRisk: low")
     return "\n".join(lines)
 
 
+def _open_ended_plan(  # ask the model for up to `count` distinct answer angles, strongest first
+    prompt: str,
+    *,
+    fact_pack: str,
+    history_lines: list[str],
+    count: int,
+    state: ThoughtState | None,
+) -> list[dict[str, Any]]:
+    if state:
+        state.update("planning", step=1, note="mapping angles")
+    count = max(1, count)  # always request at least one angle
+    prompt_text = (
+        "Analyze the question and propose up to "
+        f"{count} distinct answer angles that can be supported by the fact pack. "
+        "Keep them diverse (e.g., metrics, hardware, workload placement, recent changes). "
+        "If the question is subjective, propose at least one angle that surfaces a standout detail. "
+        "Return JSON: {\"angles\":[{\"focus\":\"...\",\"reason\":\"...\",\"priority\":1-5}]}; priority 5 is the strongest angle."
+    )
+    context = _append_history_context(fact_pack, history_lines)
+    result = _ollama_json_call(prompt_text + f" Question: {prompt}", context=context)
+    angles = result.get("angles") if isinstance(result, dict) else None
+    cleaned: list[dict[str, Any]] = []
+    seen: set[str] = set()  # lower-cased focus strings already accepted
+    if isinstance(angles, list):
+        for item in angles:
+            if not isinstance(item, dict):
+                continue
+            focus = str(item.get("focus") or "").strip()
+            if not focus or focus.lower() in seen:  # drop empty and case-insensitive duplicate angles
+                continue
+            seen.add(focus.lower())
+            priority = item.get("priority")
+            if isinstance(priority, bool) or not isinstance(priority, (int, float)) or priority != priority:
+                priority = 3  # bool, NaN or non-numeric priority falls back to the middle of the range
+            cleaned.append(
+                {
+                    "focus": focus,
+                    "reason": str(item.get("reason") or ""),
+                    "priority": int(max(1, min(5, priority))),  # clamp into 1..5
+                }
+            )
+    if not cleaned:  # model returned nothing usable: fall back to a single generic angle
+        cleaned = [{"focus": "Direct answer", "reason": "Default fallback", "priority": 3}]
+    cleaned.sort(key=lambda item: item.get("priority", 3), reverse=True)  # 5 first, matching the prompt wording
+    if state:
+        state.update("planning", step=1, note=_candidate_note(cleaned[0]))
+    return cleaned
+
+
+def _normalize_score(value: Any, *, default: int = 60) -> int:  # clamp a model-reported score into 0..100
+    if isinstance(value, (int, float)) and not isinstance(value, bool) and value == value:  # reject bool and NaN
+        return int(max(0, min(100, value)))
+    return default  # anything non-numeric (None, str, bool, NaN) uses the caller's default
+
+
+def _confidence_score(value: Any) -> int:
+    text = str(value or "").strip().lower()
+    if text.startswith("high"):
+        return 85
+    if text.startswith("low"):
+        return 35
+    return 60
+
+
+def _risk_penalty(value: Any) -> int:
+    text = str(value or "").strip().lower()
+    if text.startswith("high"):
+        return 20
+    if text.startswith("medium"):
+        return 10
+    return 0
+
+
+def _open_ended_candidate(  # draft one scored candidate answer for a single angle (`focus`)
+    prompt: str,
+    *,
+    focus: str,
+    fact_pack: str,
+    history_lines: list[str],
+    state: ThoughtState | None,
+    step: int,
+) -> dict[str, Any]:
+    if state:
+        state.update("drafting", step=step, note=focus)
+    prompt_text = (
+        "Using ONLY the fact pack, answer the question focusing on this angle: "
+        f"{focus}. "
+        "Write 2-4 sentences in plain prose (not a list). "
+        "If you infer, label it as inference. "
+        "Return JSON: {\"answer\":\"...\",\"confidence\":\"high|medium|low\","
+        "\"relevance\":0-100,\"satisfaction\":0-100,\"risk\":\"low|medium|high\"}."
+    )
+    context = _append_history_context(fact_pack, history_lines)
+    result = _ollama_json_call(prompt_text + f" Question: {prompt}", context=context)
+    if not isinstance(result, dict):  # non-dict reply: continue with defaults only
+        result = {}
+    answer = str(result.get("answer") or "").strip()
+    if not answer:
+        answer = "I don't have enough data to answer that from the current snapshot."
+    candidate = {
+        "focus": focus,
+        "answer": answer,
+        "confidence": result.get("confidence") or "medium",  # explicit null/empty counts as medium, same as a missing key
+        "relevance": _normalize_score(result.get("relevance"), default=60),
+        "satisfaction": _normalize_score(result.get("satisfaction"), default=60),
+        "risk": result.get("risk") or "medium",  # a null risk must not score as zero-penalty "low"
+    }
+    candidate["score"] = _candidate_score(candidate)  # ranking key read by _select_candidates
+    return candidate
+
+
+def _candidate_score(candidate: dict[str, Any]) -> float:
+    relevance = _normalize_score(candidate.get("relevance"), default=60)
+    satisfaction = _normalize_score(candidate.get("satisfaction"), default=60)
+    confidence = _confidence_score(candidate.get("confidence"))
+    score = relevance * 0.45 + satisfaction * 0.35 + confidence * 0.2
+    return score - _risk_penalty(candidate.get("risk"))
+
+
+def _select_candidates(candidates: list[dict[str, Any]], *, count: int) -> list[dict[str, Any]]:
+    if not candidates:
+        return []
+    ranked = sorted(candidates, key=lambda item: item.get("score", 0), reverse=True)
+    picked: list[dict[str, Any]] = []
+    seen_focus: set[str] = set()
+    for item in ranked:
+        focus = str(item.get("focus") or "").strip().lower()
+        if focus and focus in seen_focus:
+            continue
+        picked.append(item)
+        if focus:
+            seen_focus.add(focus)
+        if len(picked) >= count:
+            break
+    return picked or ranked[:count]
+
+
+def _open_ended_synthesize(
+    prompt: str,
+    *,
+    fact_pack: str,
+    history_lines: list[str],
+    candidates: list[dict[str, Any]],
+    state: ThoughtState | None,
+    step: int,
+) -> str:
+    if state:
+        state.update("synthesizing", step=step, note="composing answer")
+    synth_prompt = (
+        "Compose the final answer to the question using the candidate answers below. "
+        "Select the best 1-2 candidates, blend them if helpful, and keep 2-4 sentences. "
+        "Use only the fact pack as evidence. "
+        "If you infer, label it as inference. "
+        "Avoid repeating the last response if possible. "
+        "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), "
+        "HallucinationRisk (low|medium|high).\n"
+        f"Question: {prompt}\n"
+        f"Candidates: {json.dumps(candidates, ensure_ascii=False)}"
+    )
+    context = _append_history_context(fact_pack, history_lines)
+    reply = _ollama_call_safe(
+        ("open", "synth"),
+        synth_prompt,
+        context=context,
+        fallback="I don't have enough data to answer that.",
+        system_override=_open_ended_system(),
+    )
+    return _ensure_scores(reply)
+
+
+def _open_ended_multi(
+    prompt: str,
+    *,
+    fact_pack: str,
+    history_lines: list[str],
+    mode: str,
+    state: ThoughtState | None = None,
+) -> str:
+    angle_count = 2 if mode == "fast" else 4
+    total_steps = 1 + angle_count + 2
+    if state:
+        state.total_steps = total_steps
+    angles = _open_ended_plan(
+        prompt,
+        fact_pack=fact_pack,
+        history_lines=history_lines,
+        count=angle_count,
+        state=state,
+    )
+    candidates: list[dict[str, Any]] = []
+    step = 2
+    for angle in angles[:angle_count]:
+        candidates.append(
+            _open_ended_candidate(
+                prompt,
+                focus=str(angle.get("focus") or "Direct answer"),
+                fact_pack=fact_pack,
+                history_lines=history_lines,
+                state=state,
+                step=step,
+            )
+        )
+        step += 1
+    if state:
+        state.update("evaluating", step=step, note="ranking candidates")
+    selected = _select_candidates(candidates, count=1 if mode == "fast" else 2)
+    step += 1
+    reply = _open_ended_synthesize(
+        prompt,
+        fact_pack=fact_pack,
+        history_lines=history_lines,
+        candidates=selected or candidates,
+        state=state,
+        step=step,
+    )
+    if state:
+        state.update("done", step=total_steps)
+    return reply
+
+
+def _open_ended_total_steps(mode: str) -> int:
+    angle_count = 2 if mode == "fast" else 4
+    return 1 + angle_count + 2
+
+
 def _open_ended_fast(
     prompt: str,
     *,
     fact_pack: str,
     history_lines: list[str],
-    fact_lines: list[str],
-    fact_meta: dict[str, dict[str, Any]],
-    tags_available: set[str],
-    history_tags: set[str],
     state: ThoughtState | None = None,
 ) -> str:
-    if state:
-        state.update("planning", step=1)
-    analysis = _interpret_open_question(
+    return _open_ended_multi(
         prompt,
         fact_pack=fact_pack,
         history_lines=history_lines,
-        tags_available=tags_available,
-        avoid_tags=history_tags,
+        mode="fast",
         state=state,
     )
-    candidates = _select_insights(
-        prompt,
-        fact_pack=fact_pack,
-        history_lines=history_lines,
-        state=state or ThoughtState(),
-        analysis=analysis,
-        fact_lines=fact_lines,
-        fact_meta=fact_meta,
-        avoid_tags=history_tags,
-    )
-    prefer_tags = {t for t in analysis.get("tags", []) if isinstance(t, str)}
-    selected = _select_diverse_insights(
-        candidates,
-        preference=analysis.get("preference", "balanced"),
-        prefer_tags=prefer_tags,
-        avoid_tags=history_tags,
-        history_tags=history_tags,
-        fact_meta=fact_meta,
-        count=2,
-    )
-    if state:
-        state.update("synthesizing", step=3)
-    synthesis_prompt = (
-        "Use the question, fact pack, and selected insights to answer in 2-4 sentences. "
-        "Speak naturally, not as a list. "
-        "If the question is subjective, add a light opinion grounded in facts. "
-        "Avoid repeating the exact same observation as the most recent response if possible. "
-        "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100).\n"
-        f"Question: {prompt}\n"
-        f"Selected: {json.dumps(selected, ensure_ascii=False)}"
-    )
-    context = _append_history_context(fact_pack, history_lines)
-    reply = _ollama_call_safe(
-        ("fast", "open"),
-        synthesis_prompt,
-        context=context,
-        fallback="I don't have enough data to answer that.",
-        system_override=_open_ended_system(),
-    )
-    return _ensure_scores(reply)
-
-
-def _interpret_open_question(
-    prompt: str,
-    *,
-    fact_pack: str,
-    history_lines: list[str],
-    tags_available: set[str],
-    avoid_tags: set[str],
-    state: ThoughtState | None = None,
-) -> dict[str, Any]:
-    tags_list = ", ".join(sorted(tags_available)) if tags_available else "none"
-    avoid_list = ", ".join(sorted(avoid_tags)) if avoid_tags else "none"
-    prompt_text = (
-        "Analyze the question against the fact pack. "
-        "Return JSON: {\"focus\":\"...\",\"preference\":\"balanced|novelty|utilization|stability|risk\","
-        "\"tags\":[\"...\"] ,\"notes\":\"...\"}. "
-        "If the question implies interesting/unique/unconventional/cool, set preference to novelty "
-        "and prefer dynamic tags (utilization/pods/database/availability) when possible. "
-        f"Use only these tags if relevant: {tags_list}. Avoid tags: {avoid_list}. "
-        "Use only the fact pack."
-    )
-    context = _append_history_context(fact_pack, history_lines)
-    analysis = _ollama_json_call(prompt_text + f" Question: {prompt}", context=context)
-    if not isinstance(analysis, dict):
-        analysis = {"focus": "cluster snapshot", "preference": "balanced", "notes": "", "tags": []}
-    preference = analysis.get("preference") or "balanced"
-    if preference not in ("balanced", "novelty", "utilization", "stability", "risk"):
-        preference = "balanced"
-    analysis["preference"] = preference
-    analysis.setdefault("focus", "cluster snapshot")
-    analysis.setdefault("notes", "")
-    tags = analysis.get("tags") if isinstance(analysis.get("tags"), list) else []
-    clean_tags = {t for t in tags if isinstance(t, str)}
-    analysis["tags"] = sorted(clean_tags & tags_available)
-    if state:
-        state.update("planning", step=1, note=str(analysis.get("focus") or ""))
-    return analysis
-
-
-def _select_insights(
-    prompt: str,
-    *,
-    fact_pack: str,
-    history_lines: list[str],
-    state: ThoughtState,
-    analysis: dict[str, Any],
-    fact_lines: list[str],
-    fact_meta: dict[str, dict[str, Any]],
-    avoid_tags: set[str],
-) -> list[dict[str, Any]]:
-    preferred_tags = analysis.get("tags") if isinstance(analysis.get("tags"), list) else []
-    prefer_list = ", ".join(sorted({t for t in preferred_tags if isinstance(t, str)}))
-    avoid_list = ", ".join(sorted(avoid_tags)) if avoid_tags else "none"
-    available_list = ", ".join(sorted({t for t in _ALLOWED_INSIGHT_TAGS}))
-    insight_prompt = (
-        "From the fact pack, select 3-5 candidate insights that could answer the question. "
-        "Return JSON: {\"insights\":[{\"summary\":\"...\",\"fact_ids\":[\"F1\"],"
-        "\"relevance\":0-1,\"novelty\":0-1,\"rationale\":\"...\",\"tags\":[\"...\"]}]}. "
-        f"Available tags: {available_list}. Prefer tags: {prefer_list or 'none'}. Avoid tags: {avoid_list}. "
-        "Use only the fact pack and provided tags."
-    )
-    state.update("drafting candidates", step=2)
-    context = _append_history_context(fact_pack, history_lines)
-    result = _ollama_json_call(insight_prompt + f" Question: {prompt}", context=context)
-    insights = result.get("insights") if isinstance(result, dict) else None
-    if not isinstance(insights, list):
-        insights = []
-    cleaned: list[dict[str, Any]] = []
-    for item in insights:
-        if not isinstance(item, dict):
-            continue
-        if not item.get("summary") or not item.get("fact_ids"):
-            continue
-        tags = _insight_tags(item, fact_meta)
-        item["tags"] = sorted(tags)
-        cleaned.append(item)
-        state.update("drafting candidates", step=2, note=_candidate_note(item))
-    seeds = _seed_insights(fact_lines, fact_meta)
-    for seed in seeds:
-        cleaned.append(seed)
-    return cleaned
-
-
-def _score_insight(insight: dict[str, Any], preference: str) -> float:
-    relevance = insight.get("relevance") if isinstance(insight.get("relevance"), (int, float)) else 0.0
-    novelty = insight.get("novelty") if isinstance(insight.get("novelty"), (int, float)) else 0.0
-    if preference == "novelty":
-        return 0.4 * relevance + 0.6 * novelty
-    if preference == "utilization":
-        return 0.7 * relevance + 0.3 * novelty
-    if preference == "stability":
-        return 0.7 * relevance + 0.3 * novelty
-    if preference == "risk":
-        return 0.6 * relevance + 0.4 * novelty
-    return 0.6 * relevance + 0.4 * novelty
 
 
 def _open_ended_deep(
     prompt: str,
     *,
     fact_pack: str,
-    fact_ids: set[str],
     history_lines: list[str],
-    fact_lines: list[str],
-    fact_meta: dict[str, dict[str, Any]],
-    tags_available: set[str],
-    history_tags: set[str],
     state: ThoughtState | None = None,
 ) -> str:
-    state = state or ThoughtState()
-    if not fact_ids:
-        return _ensure_scores("I don't have enough data to answer that.")
-    state.total_steps = 7
-    analysis = _interpret_open_question(
+    return _open_ended_multi(
         prompt,
         fact_pack=fact_pack,
         history_lines=history_lines,
-        tags_available=tags_available,
-        avoid_tags=history_tags,
+        mode="deep",
         state=state,
     )
 
-    candidates = _select_insights(
-        prompt,
-        fact_pack=fact_pack,
-        history_lines=history_lines,
-        state=state,
-        analysis=analysis,
-        fact_lines=fact_lines,
-        fact_meta=fact_meta,
-        avoid_tags=history_tags,
-    )
-    state.update("verifying", step=3, note="scoring insights")
-    filtered: list[dict[str, Any]] = []
-    for cand in candidates:
-        cites = cand.get("fact_ids") if isinstance(cand.get("fact_ids"), list) else []
-        if cites and not all(cite in fact_ids for cite in cites):
-            continue
-        filtered.append(cand)
-    if not filtered:
-        filtered = candidates
-
-    preference = analysis.get("preference", "balanced")
-    prefer_tags = {t for t in analysis.get("tags", []) if isinstance(t, str)}
-    top = _select_diverse_insights(
-        filtered,
-        preference=preference,
-        prefer_tags=prefer_tags,
-        avoid_tags=history_tags,
-        history_tags=history_tags,
-        fact_meta=fact_meta,
-        count=2,
-    )
-    state.update("synthesizing", step=4, note="composing response")
-    synth_prompt = (
-        "Use the question, fact pack, and selected insights to craft a concise answer. "
-        "Write 2-4 sentences. Explain why the selected insights stand out. "
-        "If the question is subjective, include a light opinion grounded in facts. "
-        "Avoid repeating the same observation as the last response if possible. "
-        "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100).\n"
-        f"Question: {prompt}\n"
-        f"Interpretation: {json.dumps(analysis, ensure_ascii=False)}\n"
-        f"Recent tags: {', '.join(sorted(history_tags)) if history_tags else 'none'}\n"
-        f"Selected: {json.dumps(top, ensure_ascii=False)}"
-    )
-    context = _append_history_context(fact_pack, history_lines)
-    reply = _ollama_call_safe(
-        ("deep", "open"),
-        synth_prompt,
-        context=context,
-        fallback="I don't have enough data to answer that.",
-        system_override=_open_ended_system(),
-    )
-    state.update("done", step=7)
-    return _ensure_scores(reply)
-
 
 def open_ended_answer(
     prompt: str,
@@ -3081,30 +2901,17 @@ def open_ended_answer(
     if not lines:
         return _ensure_scores("I don't have enough data to answer that.")
     fact_pack = _fact_pack_text(lines)
-    fact_ids = {f"F{i+1}" for i in range(len(lines))}
-    fact_meta = _fact_pack_meta(lines)
-    tags_available = {tag for entry in fact_meta.values() for tag in entry.get("tags", [])}
-    history_tags = _history_tags(history_lines)
     if mode == "fast":
         return _open_ended_fast(
             prompt,
             fact_pack=fact_pack,
             history_lines=history_lines,
-            fact_lines=lines,
-            fact_meta=fact_meta,
-            tags_available=tags_available,
-            history_tags=history_tags,
             state=state,
         )
     return _open_ended_deep(
         prompt,
         fact_pack=fact_pack,
-        fact_ids=fact_ids,
         history_lines=history_lines,
-        fact_lines=lines,
-        fact_meta=fact_meta,
-        tags_available=tags_available,
-        history_tags=history_tags,
         state=state,
     )
 
@@ -3175,12 +2982,12 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
             workloads=workloads,
         )
         followup = _is_followup_query(cleaned)
-        cluster_query = (
-            _is_cluster_query(cleaned, inventory=inventory, workloads=workloads)
-            or _knowledge_intent(cleaned)
-            or _is_subjective_query(cleaned)
-            or (history_cluster and followup)
-        )
+        cleaned_q = normalize_query(cleaned)
+        cluster_affinity = _is_cluster_query(cleaned, inventory=inventory, workloads=workloads)
+        subjective = _is_subjective_query(cleaned)
+        followup_affinity = subjective or any(word in cleaned_q for word in METRIC_HINT_WORDS)
+        contextual = history_cluster and (followup or followup_affinity)
+        cluster_query = cluster_affinity or contextual
         context = ""
         if cluster_query:
             context = build_context(
@@ -3608,7 +3415,7 @@ def open_ended_with_thinking(
 ) -> str:
     result: dict[str, str] = {"reply": ""}
     done = threading.Event()
-    total_steps = 4 if mode == "fast" else 7
+    total_steps = _open_ended_total_steps(mode)
     state = ThoughtState(total_steps=total_steps)
 
     def worker():
@@ -3722,12 +3529,12 @@ def sync_loop(token: str, room_id: str):
                     workloads=workloads,
                 )
                 followup = _is_followup_query(cleaned_body)
-                cluster_query = (
-                    _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads)
-                    or _knowledge_intent(cleaned_body)
-                    or _is_subjective_query(cleaned_body)
-                    or (history_cluster and followup)
-                )
+                cleaned_q = normalize_query(cleaned_body)
+                cluster_affinity = _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads)
+                subjective = _is_subjective_query(cleaned_body)
+                followup_affinity = subjective or any(word in cleaned_q for word in METRIC_HINT_WORDS)
+                contextual = history_cluster and (followup or followup_affinity)
+                cluster_query = cluster_affinity or contextual
                 context = ""
                 if cluster_query:
                     context = build_context(

From 32884e0b7e43cbda1acaa51fe99faf602f100bf1 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 22:28:37 -0300
Subject: [PATCH 369/416] monitoring: fix grafana smtp from address

---
 services/monitoring/helmrelease.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index 6185e595..78eaf3c4 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -340,7 +340,7 @@ spec:
       GF_AUTH_ANONYMOUS_ORG_ROLE: "Viewer"
       GF_SMTP_ENABLED: "true"
       GF_SMTP_HOST: "smtp.postmarkapp.com:587"
-      GF_SMTP_FROM: "no-reply-grafana@bstein.dev"
+      GF_SMTP_FROM_ADDRESS: "no-reply-grafana@bstein.dev"
       GF_SMTP_FROM_NAME: "Atlas Grafana"
       GRAFANA_ALERT_EMAILS: "brad@bstein.dev"
       GF_SECURITY_ALLOW_EMBEDDING: "true"

From d1611c4f4f31929aab6f3b32b992e7b01fee5162 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 22:32:25 -0300
Subject: [PATCH 370/416] atlasbot: fix score formatting

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 services/comms/scripts/atlasbot/bot.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 17e2cb2f..7ad44d4b 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-75
+        checksum/atlasbot-configmap: manual-atlasbot-76
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 06685217..9ecd06d9 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -2614,7 +2614,7 @@ def _candidate_note(candidate: dict[str, Any]) -> str:
 
 def _ensure_scores(answer: str) -> str:
     text = answer.strip()
-    lines = [line for line in text.splitlines() if line.strip()]
+    lines = [line.strip() for line in text.splitlines() if line.strip()]
     has_relevance = any(line.lower().startswith("relevance") for line in lines)
     has_satisfaction = any(line.lower().startswith("satisfaction") for line in lines)
     has_confidence = any(line.lower().startswith("confidence") for line in lines)

From 2952b2a7c3448c208e9dff196b39852c8a39801f Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 22:44:49 -0300
Subject: [PATCH 371/416] atlasbot: refine cluster intent handling

---
 services/comms/scripts/atlasbot/bot.py | 92 ++++++++++++++++++++++++--
 1 file changed, 87 insertions(+), 5 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 9ecd06d9..f85b81a0 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -152,6 +152,16 @@ CLUSTER_HINT_WORDS = {
     "deployment",
     "daemonset",
     "statefulset",
+    "snapshot",
+    "anomaly",
+    "anomalies",
+    "monitor",
+    "monitoring",
+    "runbook",
+    "runbooks",
+    "documentation",
+    "docs",
+    "playbook",
     "grafana",
     "victoria",
     "prometheus",
@@ -203,6 +213,12 @@ _INSIGHT_HINT_WORDS = {
     "favorite",
     "favourite",
     "trivia",
+    "anomaly",
+    "anomalies",
+    "monitor",
+    "monitoring",
+    "alert",
+    "alerts",
     "stand out",
     "stands out",
 }
@@ -532,7 +548,14 @@ def _humanize_rate(value: str, *, unit: str) -> str:
     return f"{val:.2f} B/s"
 
 def _has_any(text: str, phrases: tuple[str, ...]) -> bool:
-    return any(p in text for p in phrases)
+    for phrase in phrases:
+        if " " in phrase:
+            if phrase in text:
+                return True
+        else:
+            if re.search(rf"\\b{re.escape(phrase)}\\b", text):
+                return True
+    return False
 
 def _detect_operation(q: str) -> str | None:
     if _has_any(q, OPERATION_HINTS["top"]):
@@ -552,6 +575,8 @@ def _detect_metric(q: str) -> str | None:
             part = part.strip()
             if len(part) >= 2:
                 expanded.add(part)
+            if part.endswith("s") and len(part) >= 4:
+                expanded.add(part[:-1])
     tokens = expanded
     for metric, phrases in METRIC_HINTS.items():
         for phrase in phrases:
@@ -565,6 +590,8 @@ def _detect_metric(q: str) -> str | None:
 def _detect_hardware_filters(q: str) -> tuple[set[str], set[str]]:
     include: set[str] = set()
     exclude: set[str] = set()
+    if any(term in q for term in ("gpu", "gpus", "accelerator", "accelerators", "cuda", "nvidia")):
+        include.add("jetson")
     rpi_specific = any(
         phrase in q
         for phrase in (
@@ -1287,6 +1314,10 @@ def snapshot_metric_answer(
         failed = metrics.get("pods_failed")
         succeeded = metrics.get("pods_succeeded")
         status_terms = ("running", "pending", "failed", "succeeded", "completed")
+        if "not running" in q or "not in running" in q or "non running" in q:
+            parts = [v for v in (pending, failed, succeeded) if isinstance(v, (int, float))]
+            if parts:
+                return _format_confidence(f"Pods not running: {sum(parts):.0f}.", "high")
         if sum(1 for term in status_terms if term in q) > 1:
             parts = []
             if running is not None:
@@ -1350,6 +1381,8 @@ def structured_answer(
         op = "top"
     entity = _detect_entity(q)
     include_hw, exclude_hw = _detect_hardware_filters(q)
+    if entity is None and (include_hw or exclude_hw):
+        entity = "node"
     nodes_in_query = _extract_titan_nodes(q)
     only_workers = "worker" in q or "workers" in q
     role_filters = _detect_role_filters(q)
@@ -1385,6 +1418,20 @@ def structured_answer(
         if hw_line:
             return _format_confidence(hw_line, "medium")
 
+    if (
+        entity == "node"
+        and any(term in q for term in ("arm64", "amd64"))
+        and any(term in q for term in ("mostly", "majority", "more"))
+    ):
+        arm64_count = len([n for n in inventory if n.get("arch") == "arm64"])
+        amd64_count = len([n for n in inventory if n.get("arch") == "amd64"])
+        if arm64_count or amd64_count:
+            majority = "arm64" if arm64_count >= amd64_count else "amd64"
+            return _format_confidence(
+                f"arm64 nodes: {arm64_count}, amd64 nodes: {amd64_count}. Mostly {majority}.",
+                "high",
+            )
+
     if op == "top" and metric is None and not any(word in q for word in ("hardware", "architecture", "class")):
         metric = "cpu"
 
@@ -1491,6 +1538,27 @@ def structured_answer(
             )
 
     if op == "count":
+        if only_workers and "ready" in q and ("total" in q or "vs" in q or "versus" in q):
+            total_workers = _inventory_filter(
+                inventory,
+                include_hw=include_hw,
+                exclude_hw=exclude_hw,
+                only_workers=True,
+                only_ready=None,
+                nodes_in_query=nodes_in_query,
+            )
+            ready_workers = _inventory_filter(
+                inventory,
+                include_hw=include_hw,
+                exclude_hw=exclude_hw,
+                only_workers=True,
+                only_ready=True,
+                nodes_in_query=nodes_in_query,
+            )
+            return _format_confidence(
+                f"Worker nodes ready: {len(ready_workers)} / {len(total_workers)} total.",
+                "high",
+            )
         if expected_workers and ("expected" in q or "should" in q):
             missing = sorted(set(expected_workers) - {n["name"] for n in inventory})
             msg = f"Grafana inventory expects {len(expected_workers)} worker nodes."
@@ -1711,6 +1779,15 @@ def _doc_intent(query: str) -> bool:
             "how to",
             "instructions",
             "playbook",
+            "next step",
+            "next steps",
+            "what should",
+            "what do i",
+            "what to do",
+            "troubleshoot",
+            "triage",
+            "recover",
+            "remediate",
         )
     )
 
@@ -2615,10 +2692,13 @@ def _candidate_note(candidate: dict[str, Any]) -> str:
 def _ensure_scores(answer: str) -> str:
     text = answer.strip()
     lines = [line.strip() for line in text.splitlines() if line.strip()]
-    has_relevance = any(line.lower().startswith("relevance") for line in lines)
-    has_satisfaction = any(line.lower().startswith("satisfaction") for line in lines)
-    has_confidence = any(line.lower().startswith("confidence") for line in lines)
-    has_risk = any(line.lower().startswith("hallucinationrisk") for line in lines)
+    def _score_key(line: str) -> str:
+        cleaned = line.strip().lstrip("-•* ").strip()
+        return cleaned.lower()
+    has_relevance = any(_score_key(line).startswith("relevance") for line in lines)
+    has_satisfaction = any(_score_key(line).startswith("satisfaction") for line in lines)
+    has_confidence = any(_score_key(line).startswith("confidence") for line in lines)
+    has_risk = any(_score_key(line).startswith("hallucinationrisk") for line in lines)
     if not has_confidence:
         lines.append("Confidence: medium")
     if not has_relevance:
@@ -3004,6 +3084,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
                 _is_subjective_query(cleaned)
                 or _knowledge_intent(cleaned)
                 or _is_overview_query(cleaned)
+                or _doc_intent(cleaned)
             )
             if open_ended:
                 answer = open_ended_answer(
@@ -3558,6 +3639,7 @@ def sync_loop(token: str, room_id: str):
                         _is_subjective_query(cleaned_body)
                         or _knowledge_intent(cleaned_body)
                         or _is_overview_query(cleaned_body)
+                        or _doc_intent(cleaned_body)
                     )
                     if open_ended:
                         reply = open_ended_with_thinking(

From 269b5bdca80952dcb70e340deb99df8150967688 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 22:45:17 -0300
Subject: [PATCH 372/416] chore: bump atlasbot config checksum

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 7ad44d4b..01aebef8 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-76
+        checksum/atlasbot-configmap: manual-atlasbot-77
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From c95a580f84ff8d616f7e76a121082a335aed8953 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 22:55:00 -0300
Subject: [PATCH 373/416] atlasbot: tighten scoring and readiness logic

---
 services/comms/scripts/atlasbot/bot.py | 97 +++++++++++++++++++++-----
 1 file changed, 81 insertions(+), 16 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index f85b81a0..29f53751 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -1297,7 +1297,7 @@ def snapshot_metric_answer(
         parts: list[str] = []
         if used is not None and max_conn is not None:
             free = max_conn - used
-            if any(word in q for word in ("free", "available", "remaining")):
+            if any(word in q for word in ("free", "available", "remaining", "remain", "left")):
                 parts.append(f"Postgres connections: {used:.0f} used / {max_conn:.0f} max ({free:.0f} free).")
             else:
                 parts.append(f"Postgres connections: {used:.0f} used / {max_conn:.0f} max.")
@@ -1387,13 +1387,23 @@ def structured_answer(
     only_workers = "worker" in q or "workers" in q
     role_filters = _detect_role_filters(q)
     only_ready: bool | None = None
-    if "not ready" in q or "unready" in q or "down" in q or "missing" in q:
+    if (
+        "not ready" in q
+        or "notready" in q
+        or "not-ready" in q
+        or "unready" in q
+        or "down" in q
+        or "missing" in q
+    ):
         only_ready = False
     elif "ready" in q:
         only_ready = True
 
     if entity == "node" and only_ready is not None and op != "count":
         op = "status"
+    if entity == "node" and only_ready is not None and op == "count":
+        if not any(term in q for term in ("how many", "count", "number")):
+            op = "status"
 
     if not op and entity == "node":
         op = "list" if (include_hw or exclude_hw or nodes_in_query) else "count"
@@ -2692,22 +2702,67 @@ def _candidate_note(candidate: dict[str, Any]) -> str:
 def _ensure_scores(answer: str) -> str:
     text = answer.strip()
     lines = [line.strip() for line in text.splitlines() if line.strip()]
+    score_map: dict[str, str] = {}
+    body_lines: list[str] = []
+
     def _score_key(line: str) -> str:
         cleaned = line.strip().lstrip("-•* ").strip()
         return cleaned.lower()
-    has_relevance = any(_score_key(line).startswith("relevance") for line in lines)
-    has_satisfaction = any(_score_key(line).startswith("satisfaction") for line in lines)
-    has_confidence = any(_score_key(line).startswith("confidence") for line in lines)
-    has_risk = any(_score_key(line).startswith("hallucinationrisk") for line in lines)
-    if not has_confidence:
-        lines.append("Confidence: medium")
-    if not has_relevance:
-        lines.append("Relevance: 70")
-    if not has_satisfaction:
-        lines.append("Satisfaction: 70")
-    if not has_risk:
-        lines.append("HallucinationRisk: low")
-    return "\n".join(lines)
+
+    def _extract_value(line: str) -> str:
+        cleaned = line.strip().lstrip("-•* ").strip()
+        if ":" in cleaned:
+            return cleaned.split(":", 1)[1].strip()
+        parts = cleaned.split()
+        return parts[1] if len(parts) > 1 else ""
+
+    def _record_score(key: str, value: str):
+        if not value:
+            return
+        score_map.setdefault(key, value)
+
+    for line in lines:
+        cleaned = line.strip().lstrip("-•* ").strip()
+        lowered = cleaned.lower()
+        if lowered.startswith("confidence,") or (
+            "confidence" in lowered and "relevance" in lowered and "satisfaction" in lowered
+        ):
+            for key in ("confidence", "relevance", "satisfaction"):
+                match = re.search(rf"{key}\\s*[:=]?\\s*(\\d{{1,3}}|high|medium|low)", lowered)
+                if match:
+                    _record_score(key, match.group(1))
+            risk_match = re.search(r"hallucination\\s*risk\\s*[:=]?\\s*(low|medium|high)", lowered)
+            if risk_match:
+                _record_score("hallucinationrisk", risk_match.group(1))
+            continue
+        if lowered.startswith("confidence"):
+            _record_score("confidence", _extract_value(cleaned))
+            continue
+        if lowered.startswith("relevance"):
+            _record_score("relevance", _extract_value(cleaned))
+            continue
+        if lowered.startswith("satisfaction"):
+            _record_score("satisfaction", _extract_value(cleaned))
+            continue
+        if lowered.replace(" ", "").startswith("hallucinationrisk") or lowered.startswith(
+            "hallucination risk"
+        ):
+            _record_score("hallucinationrisk", _extract_value(cleaned))
+            continue
+        body_lines.append(line)
+
+    confidence = score_map.get("confidence") or "medium"
+    relevance = score_map.get("relevance") or "70"
+    satisfaction = score_map.get("satisfaction") or "70"
+    risk = score_map.get("hallucinationrisk") or "low"
+
+    final_lines = body_lines + [
+        f"Confidence: {confidence}",
+        f"Relevance: {relevance}",
+        f"Satisfaction: {satisfaction}",
+        f"HallucinationRisk: {risk}",
+    ]
+    return "\n".join(final_lines)
 
 
 def _open_ended_plan(
@@ -2799,7 +2854,8 @@ def _open_ended_candidate(
         f"{focus}. "
         "Write 2-4 sentences in plain prose (not a list). "
         "If you infer, label it as inference. "
-        "Return JSON: {\"answer\":\"...\",\"confidence\":\"high|medium|low\","
+        "List which fact pack IDs you used. "
+        "Return JSON: {\"answer\":\"...\",\"facts_used\":[\"F1\"],\"confidence\":\"high|medium|low\","
         "\"relevance\":0-100,\"satisfaction\":0-100,\"risk\":\"low|medium|high\"}."
     )
     context = _append_history_context(fact_pack, history_lines)
@@ -2809,9 +2865,13 @@ def _open_ended_candidate(
     answer = str(result.get("answer") or "").strip()
     if not answer:
         answer = "I don't have enough data to answer that from the current snapshot."
+    facts_used = result.get("facts_used")
+    if not isinstance(facts_used, list):
+        facts_used = []
     candidate = {
         "focus": focus,
         "answer": answer,
+        "facts_used": facts_used,
         "confidence": result.get("confidence", "medium"),
         "relevance": _normalize_score(result.get("relevance"), default=60),
         "satisfaction": _normalize_score(result.get("satisfaction"), default=60),
@@ -2826,6 +2886,8 @@ def _candidate_score(candidate: dict[str, Any]) -> float:
     satisfaction = _normalize_score(candidate.get("satisfaction"), default=60)
     confidence = _confidence_score(candidate.get("confidence"))
     score = relevance * 0.45 + satisfaction * 0.35 + confidence * 0.2
+    if not candidate.get("facts_used"):
+        score -= 5
     return score - _risk_penalty(candidate.get("risk"))
 
 
@@ -2863,6 +2925,9 @@ def _open_ended_synthesize(
         "Select the best 1-2 candidates, blend them if helpful, and keep 2-4 sentences. "
         "Use only the fact pack as evidence. "
         "If you infer, label it as inference. "
+        "Do not claim nodes are missing or not ready unless the fact pack explicitly lists "
+        "nodes_not_ready or expected_workers_missing. "
+        "Keep the tone conversational and answer the user's intent directly. "
         "Avoid repeating the last response if possible. "
         "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), "
         "HallucinationRisk (low|medium|high).\n"

From 413b9eca5d4490b9cee0968410b4d0ed7aea69a4 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 22:55:24 -0300
Subject: [PATCH 374/416] chore: bump atlasbot checksum

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 01aebef8..a06e6283 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-77
+        checksum/atlasbot-configmap: manual-atlasbot-78
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From 4651133debeb0c5920206f245fb6c9513de966e1 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 23:01:51 -0300
Subject: [PATCH 375/416] atlasbot: fix word boundary detection

---
 services/comms/scripts/atlasbot/bot.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 29f53751..77868f1f 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -553,7 +553,7 @@ def _has_any(text: str, phrases: tuple[str, ...]) -> bool:
             if phrase in text:
                 return True
         else:
-            if re.search(rf"\\b{re.escape(phrase)}\\b", text):
+            if re.search(rf"\b{re.escape(phrase)}\b", text):
                 return True
     return False
 

From b16f841e9aac45bbe79edd7824042a29690a42d2 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 23:02:22 -0300
Subject: [PATCH 376/416] chore: bump atlasbot checksum

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index a06e6283..530fb407 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-78
+        checksum/atlasbot-configmap: manual-atlasbot-79
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From d9951083ee24475caab9ab9094f00002f76717a5 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 23:16:53 -0300
Subject: [PATCH 377/416] atlasbot: improve metric detection and counts

---
 services/comms/scripts/atlasbot/bot.py | 81 +++++++++++++++++++++-----
 1 file changed, 68 insertions(+), 13 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 77868f1f..eca5fef9 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -120,6 +120,7 @@ OPERATION_HINTS = {
     "count": ("how many", "count", "number", "total"),
     "list": ("list", "which", "what are", "show", "names"),
     "top": ("top", "hottest", "highest", "most", "largest", "max", "maximum", "busiest", "busy"),
+    "bottom": ("lowest", "least", "minimum", "min", "smallest"),
     "status": ("ready", "not ready", "unready", "down", "missing", "status"),
 }
 
@@ -568,6 +569,14 @@ def _detect_operation(q: str) -> str | None:
     return None
 
 def _detect_metric(q: str) -> str | None:
+    q = normalize_query(q)
+    if _has_any(q, ("disk", "storage")):
+        return "io"
+    if _has_any(q, ("io",)) and not _has_any(q, METRIC_HINTS["net"]):
+        return "io"
+    for metric, phrases in METRIC_HINTS.items():
+        if _has_any(q, phrases):
+            return metric
     tokens = set(_tokens(q))
     expanded: set[str] = set(tokens)
     for token in list(tokens):
@@ -1237,6 +1246,34 @@ def _node_usage_top(
     return None
 
 
+def _node_usage_bottom(
+    usage: list[dict[str, Any]],
+    *,
+    allowed_nodes: set[str] | None,
+) -> tuple[str, float] | None:
+    best_node: str | None = None
+    best_val: float | None = None
+    for item in usage:
+        if not isinstance(item, dict):
+            continue
+        node = item.get("node")
+        if not node or not isinstance(node, str):
+            continue
+        if allowed_nodes and node not in allowed_nodes:
+            continue
+        value = item.get("value")
+        try:
+            numeric = float(value)
+        except (TypeError, ValueError):
+            continue
+        if best_val is None or numeric < best_val:
+            best_val = numeric
+            best_node = node
+    if best_node and best_val is not None:
+        return best_node, best_val
+    return None
+
+
 def snapshot_metric_answer(
     prompt: str,
     *,
@@ -1267,18 +1304,20 @@ def snapshot_metric_answer(
     )
     allowed_nodes = {node["name"] for node in filtered} if filtered else None
 
-    if metric in {"cpu", "ram", "net", "io"} and op in {"top", "status", None}:
+    if metric in {"cpu", "ram", "net", "io"} and op in {"top", "bottom", "status", None}:
         usage = metrics.get("node_usage", {}).get(metric, [])
-        top = _node_usage_top(usage, allowed_nodes=allowed_nodes)
-        if top:
-            node, val = top
+        pick = _node_usage_bottom if op == "bottom" else _node_usage_top
+        chosen = pick(usage, allowed_nodes=allowed_nodes)
+        if chosen:
+            node, val = chosen
             percent = metric in {"cpu", "ram"}
             value = _format_metric_value(str(val), percent=percent, rate=metric in {"net", "io"})
             scope = ""
             if include_hw:
                 scope = f" among {' and '.join(sorted(include_hw))}"
-            answer = f"Hottest node{scope}: {node} ({value})."
-            if allowed_nodes and len(allowed_nodes) != len(inventory):
+            label = "Lowest" if op == "bottom" else "Hottest"
+            answer = f"{label} node{scope}: {node} ({value})."
+            if allowed_nodes and len(allowed_nodes) != len(inventory) and op != "bottom":
                 overall = _node_usage_top(usage, allowed_nodes=None)
                 if overall and overall[0] != node:
                     overall_val = _format_metric_value(
@@ -1314,6 +1353,10 @@ def snapshot_metric_answer(
         failed = metrics.get("pods_failed")
         succeeded = metrics.get("pods_succeeded")
         status_terms = ("running", "pending", "failed", "succeeded", "completed")
+        if "total" in q or "sum" in q:
+            values = [v for v in (running, pending, failed, succeeded) if isinstance(v, (int, float))]
+            if values:
+                return _format_confidence(f"Total pods: {sum(values):.0f}.", "high")
         if "not running" in q or "not in running" in q or "non running" in q:
             parts = [v for v in (pending, failed, succeeded) if isinstance(v, (int, float))]
             if parts:
@@ -1468,7 +1511,8 @@ def structured_answer(
                 node, val = _primary_series_metric(res)
                 if node and val is not None:
                     percent = _metric_expr_uses_percent(entry)
-                    value_fmt = _format_metric_value(val or "", percent=percent)
+                    rate = metric in {"net", "io"}
+                    value_fmt = _format_metric_value(val or "", percent=percent, rate=rate)
                     metric_label = (metric or "").upper()
                     label = f"{metric_label} node" if metric_label else "node"
                     answer = f"Hottest {label}: {node} ({value_fmt})."
@@ -1495,7 +1539,8 @@ def structured_answer(
                     scoped_node, scoped_val = _primary_series_metric(res)
                     if base_node and scoped_node and base_node != scoped_node:
                         percent = _metric_expr_uses_percent(entry)
-                        base_val_fmt = _format_metric_value(base_val or "", percent=percent)
+                        rate = metric in {"net", "io"}
+                        base_val_fmt = _format_metric_value(base_val or "", percent=percent, rate=rate)
                         overall_note = f" Overall hottest node: {base_node} ({base_val_fmt})."
                     return _format_confidence(f"Among {scope} nodes, {answer}{overall_note}", "high")
                 return _format_confidence(answer, "high")
@@ -1525,9 +1570,14 @@ def structured_answer(
     names = [node["name"] for node in filtered]
 
     if op == "status":
+        scope_label = "nodes"
+        if include_hw:
+            scope_label = f"{' and '.join(sorted(include_hw))} nodes"
+        elif only_workers:
+            scope_label = "worker nodes"
         if "missing" in q and ("ready" in q or "readiness" in q):
             return _format_confidence(
-                "Not ready nodes: " + (", ".join(names) if names else "none") + ".",
+                f"Not ready {scope_label}: " + (", ".join(names) if names else "none") + ".",
                 "high",
             )
         if "missing" in q and expected_workers:
@@ -1538,16 +1588,21 @@ def structured_answer(
             )
         if only_ready is False:
             return _format_confidence(
-                "Not ready nodes: " + (", ".join(names) if names else "none") + ".",
+                f"Not ready {scope_label}: " + (", ".join(names) if names else "none") + ".",
                 "high",
             )
         if only_ready is True:
             return _format_confidence(
-                f"Ready nodes ({len(names)}): " + (", ".join(names) if names else "none") + ".",
+                f"Ready {scope_label} ({len(names)}): " + (", ".join(names) if names else "none") + ".",
                 "high",
             )
 
     if op == "count":
+        scope_label = "nodes"
+        if include_hw:
+            scope_label = f"{' and '.join(sorted(include_hw))} nodes"
+        elif only_workers:
+            scope_label = "worker nodes"
         if only_workers and "ready" in q and ("total" in q or "vs" in q or "versus" in q):
             total_workers = _inventory_filter(
                 inventory,
@@ -1576,9 +1631,9 @@ def structured_answer(
                 msg += f" Missing: {', '.join(missing)}."
             return _format_confidence(msg, "high")
         if only_ready is True:
-            return _format_confidence(f"Ready nodes: {len(names)}.", "high")
+            return _format_confidence(f"Ready {scope_label}: {len(names)}.", "high")
         if only_ready is False:
-            return _format_confidence(f"Not ready nodes: {len(names)}.", "high")
+            return _format_confidence(f"Not ready {scope_label}: {len(names)}.", "high")
         if not (include_hw or exclude_hw or nodes_in_query or only_workers or role_filters):
             return _format_confidence(f"Atlas has {len(names)} nodes.", "high")
         return _format_confidence(f"Matching nodes: {len(names)}.", "high")

From 8a22e8e0d8d3dee24068dbff9f507f4d23da5ac6 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 23:17:23 -0300
Subject: [PATCH 378/416] chore: bump atlasbot checksum

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 530fb407..94eeea70 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-79
+        checksum/atlasbot-configmap: manual-atlasbot-80
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From 9a978c5e727b94ccbf81024ba8926f7b5c244353 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 23:23:42 -0300
Subject: [PATCH 379/416] monitoring: tune cpu and maintenance alerts

---
 services/monitoring/grafana-alerting-config.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml
index 8713d3db..d97db150 100644
--- a/services/monitoring/grafana-alerting-config.yaml
+++ b/services/monitoring/grafana-alerting-config.yaml
@@ -145,7 +145,7 @@ data:
                 model:
                   intervalMs: 60000
                   maxDataPoints: 43200
-                  expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m]
+                  expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")
                   legendFormat: '{{instance}}'
                   datasource:
                     type: prometheus
@@ -175,9 +175,9 @@ data:
                         type: last
                       type: query
             noDataState: NoData
-            execErrState: Error
+            execErrState: NoData
             annotations:
-              summary: "{{ $labels.instance }} CPU >90% for 10m"
+              summary: "{{ $labels.node }} CPU >90% for 10m"
             labels:
               severity: warning
       - orgId: 1
@@ -297,7 +297,7 @@ data:
                   to: 0
                 datasourceUid: atlas-vm
                 model:
-                  expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"})
+                  expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) and on(cronjob) (kube_cronjob_spec_suspend{namespace="maintenance",cronjob="image-sweeper"} == 0)
                   intervalMs: 60000
                   maxDataPoints: 43200
                   legendFormat: '{{cronjob}}'

From 35396d19ea338cb4ec88b6b8a22bcb8c5266b8d4 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 23:24:12 -0300
Subject: [PATCH 380/416] atlasbot: fix bottom ops and pod queries

---
 services/comms/scripts/atlasbot/bot.py | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index eca5fef9..7f22ad57 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -163,6 +163,8 @@ CLUSTER_HINT_WORDS = {
     "documentation",
     "docs",
     "playbook",
+    "utilization",
+    "usage",
     "grafana",
     "victoria",
     "prometheus",
@@ -561,8 +563,10 @@ def _has_any(text: str, phrases: tuple[str, ...]) -> bool:
 def _detect_operation(q: str) -> str | None:
     if _has_any(q, OPERATION_HINTS["top"]):
         return "top"
+    if _has_any(q, OPERATION_HINTS["bottom"]):
+        return "bottom"
     for op, phrases in OPERATION_HINTS.items():
-        if op == "top":
+        if op in ("top", "bottom"):
             continue
         if _has_any(q, phrases):
             return op
@@ -1353,6 +1357,11 @@ def snapshot_metric_answer(
         failed = metrics.get("pods_failed")
         succeeded = metrics.get("pods_succeeded")
         status_terms = ("running", "pending", "failed", "succeeded", "completed")
+        if ("most pods" in q or ("most" in q and "pod" in q and "node" in q)) and not nodes_in_query:
+            return _format_confidence(
+                "I don't have per-node pod counts in the snapshot.",
+                "medium",
+            )
         if "total" in q or "sum" in q:
             values = [v for v in (running, pending, failed, succeeded) if isinstance(v, (int, float))]
             if values:
@@ -1363,13 +1372,13 @@ def snapshot_metric_answer(
                 return _format_confidence(f"Pods not running: {sum(parts):.0f}.", "high")
         if sum(1 for term in status_terms if term in q) > 1:
             parts = []
-            if running is not None:
+            if "running" in q and running is not None:
                 parts.append(f"running {running:.0f}")
-            if pending is not None:
+            if "pending" in q and pending is not None:
                 parts.append(f"pending {pending:.0f}")
-            if failed is not None:
+            if "failed" in q and failed is not None:
                 parts.append(f"failed {failed:.0f}")
-            if succeeded is not None:
+            if ("succeeded" in q or "completed" in q) and succeeded is not None:
                 parts.append(f"succeeded {succeeded:.0f}")
             if parts:
                 return _format_confidence(f"Pods: {', '.join(parts)}.", "high")
@@ -1461,7 +1470,12 @@ def structured_answer(
         if hw_line:
             return _format_confidence(hw_line, "high")
 
-    if entity == "node" and op == "status" and metric is None:
+    if (
+        entity == "node"
+        and op == "status"
+        and metric is None
+        and not (include_hw or exclude_hw or nodes_in_query or only_workers or role_filters)
+    ):
         summary = _nodes_summary_line(inventory, snapshot)
         if summary:
             return _format_confidence(summary, "high")

From fa9184bc9107089ff7c949fb8b28558b3b0b9378 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 23:24:46 -0300
Subject: [PATCH 381/416] chore: bump atlasbot checksum

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 94eeea70..6761287b 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-80
+        checksum/atlasbot-configmap: manual-atlasbot-81
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From a49fa6dd33f9dcab3ac3e4522d6f73870c0609d8 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 23:29:46 -0300
Subject: [PATCH 382/416] monitoring: restart grafana for alerting reload

---
 services/monitoring/helmrelease.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index 78eaf3c4..66517389 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -286,6 +286,7 @@ spec:
     podAnnotations:
       vault.hashicorp.com/agent-inject: "true"
       vault.hashicorp.com/role: "monitoring"
+      monitoring.bstein.dev/restart-rev: "1"
       vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
       vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
         {{ with secret "kv/data/atlas/monitoring/grafana-admin" }}

From 35d5d5a1a30774c655f5390337a3b6445fd9e958 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 23:34:11 -0300
Subject: [PATCH 383/416] monitoring: fix grafana alert exec state

---
 services/monitoring/grafana-alerting-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml
index d97db150..33ac7396 100644
--- a/services/monitoring/grafana-alerting-config.yaml
+++ b/services/monitoring/grafana-alerting-config.yaml
@@ -175,7 +175,7 @@ data:
                         type: last
                       type: query
             noDataState: NoData
-            execErrState: NoData
+            execErrState: OK
             annotations:
               summary: "{{ $labels.node }} CPU >90% for 10m"
             labels:

From 3bd42c93d63959dd2e2569627cc2d2eb90744f54 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 23:45:08 -0300
Subject: [PATCH 384/416] atlasbot: overhaul reasoning pipeline

---
 services/comms/atlasbot-deployment.yaml |   6 +-
 services/comms/scripts/atlasbot/bot.py  | 405 +++++++++++++++++++-----
 2 files changed, 336 insertions(+), 75 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 6761287b..b08f20db 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-81
+        checksum/atlasbot-configmap: manual-atlasbot-82
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
@@ -83,6 +83,10 @@ spec:
               value: http://ollama.ai.svc.cluster.local:11434
             - name: OLLAMA_MODEL
               value: qwen2.5:14b-instruct
+            - name: ATLASBOT_MODEL_FAST
+              value: qwen2.5:14b-instruct
+            - name: ATLASBOT_MODEL_DEEP
+              value: qwen2.5:14b-instruct
             - name: OLLAMA_FALLBACK_MODEL
               value: qwen2.5:14b-instruct-q4_0
             - name: OLLAMA_TIMEOUT_SEC
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 7f22ad57..7e6341e6 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -17,6 +17,8 @@ ROOM_ALIAS = "#othrys:live.bstein.dev"
 
 OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/")
 MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b-instruct-q4_0")
+MODEL_FAST = os.environ.get("ATLASBOT_MODEL_FAST", "")
+MODEL_DEEP = os.environ.get("ATLASBOT_MODEL_DEEP", "")
 FALLBACK_MODEL = os.environ.get("OLLAMA_FALLBACK_MODEL", "")
 API_KEY = os.environ.get("CHAT_API_KEY", "")
 OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480"))
@@ -372,6 +374,14 @@ def _detect_mode_from_body(body: str, *, default: str = "deep") -> str:
     return default
 
 
+def _model_for_mode(mode: str) -> str:
+    if mode == "fast" and MODEL_FAST:
+        return MODEL_FAST
+    if mode == "deep" and MODEL_DEEP:
+        return MODEL_DEEP
+    return MODEL
+
+
 # Matrix HTTP helper.
 def req(method: str, path: str, token: str | None = None, body=None, timeout=60, base: str | None = None):
     url = (base or BASE) + path
@@ -2487,7 +2497,13 @@ class ThoughtState:
         return f"Still thinking ({detail})."
 
 
-def _ollama_json_call(prompt: str, *, context: str, retries: int = 2) -> dict[str, Any]:
+def _ollama_json_call(
+    prompt: str,
+    *,
+    context: str,
+    retries: int = 2,
+    model: str | None = None,
+) -> dict[str, Any]:
     system = (
         "System: You are Atlas, a reasoning assistant. "
         "Return strict JSON only (no code fences, no trailing commentary). "
@@ -2504,6 +2520,7 @@ def _ollama_json_call(prompt: str, *, context: str, retries: int = 2) -> dict[st
                 context=context,
                 use_history=False,
                 system_override=system,
+                model=model,
             )
             cleaned = _strip_code_fence(raw).strip()
             if cleaned.startswith("{") and cleaned.endswith("}"):
@@ -2547,6 +2564,19 @@ def _fact_pack_text(lines: list[str]) -> str:
     return "Fact pack:\n" + "\n".join(labeled)
 
 
+def _tool_fact_lines(prompt: str, *, allow_tools: bool) -> list[str]:
+    if not allow_tools:
+        return []
+    metrics_context, _ = metrics_query_context(prompt, allow_tools=True)
+    lines: list[str] = []
+    if metrics_context:
+        for line in metrics_context.splitlines():
+            trimmed = line.strip()
+            if trimmed:
+                lines.append(f"tool_metrics: {trimmed}")
+    return lines
+
+
 _ALLOWED_INSIGHT_TAGS = {
     "availability",
     "architecture",
@@ -2607,6 +2637,15 @@ def _history_tags(history_lines: list[str]) -> set[str]:
     return tags & _ALLOWED_INSIGHT_TAGS
 
 
+def _normalize_fraction(value: Any, *, default: float = 0.5) -> float:
+    if isinstance(value, (int, float)):
+        score = float(value)
+        if score > 1:
+            score = score / 100.0
+        return max(0.0, min(1.0, score))
+    return default
+
+
 def _seed_insights(
     lines: list[str],
     fact_meta: dict[str, dict[str, Any]],
@@ -2735,9 +2774,9 @@ def _open_ended_system() -> str:
         "Use ONLY the provided fact pack and recent chat as your evidence. "
         "You may draw light inferences if you label them as such. "
         "Write concise, human sentences with a helpful, calm tone (not a list). "
-        "If the question is subjective, share a light opinion grounded in facts. "
+        "If the question is subjective, share a light opinion grounded in facts and explain why it stands out. "
         "If the question is ambiguous, pick a reasonable interpretation and state it briefly. "
-        "Avoid repeating the exact same observation as the last response if possible. "
+        "Avoid repeating the exact same observation as the last response if possible; vary across metrics, workload, or hardware details. "
         "Do not invent numbers or facts. "
         "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), HallucinationRisk (low|medium|high)."
     )
@@ -2750,6 +2789,7 @@ def _ollama_call_safe(
     context: str,
     fallback: str,
     system_override: str | None = None,
+    model: str | None = None,
 ) -> str:
     try:
         return _ollama_call(
@@ -2758,6 +2798,7 @@ def _ollama_call_safe(
             context=context,
             use_history=False,
             system_override=system_override,
+            model=model,
         )
     except Exception:
         return fallback
@@ -2841,6 +2882,7 @@ def _open_ended_plan(
     history_lines: list[str],
     count: int,
     state: ThoughtState | None,
+    model: str | None,
 ) -> list[dict[str, Any]]:
     if state:
         state.update("planning", step=1, note="mapping angles")
@@ -2850,10 +2892,15 @@ def _open_ended_plan(
         f"{count} distinct answer angles that can be supported by the fact pack. "
         "Keep them diverse (e.g., metrics, hardware, workload placement, recent changes). "
         "If the question is subjective, propose at least one angle that surfaces a standout detail. "
+        "Avoid repeating the same angle as the most recent response if possible. "
         "Return JSON: {\"angles\":[{\"focus\":\"...\",\"reason\":\"...\",\"priority\":1-5}]}."
     )
     context = _append_history_context(fact_pack, history_lines)
-    result = _ollama_json_call(prompt_text + f" Question: {prompt}", context=context)
+    result = _ollama_json_call(
+        prompt_text + f" Question: {prompt}",
+        context=context,
+        model=model,
+    )
     angles = result.get("angles") if isinstance(result, dict) else None
     cleaned: list[dict[str, Any]] = []
     seen: set[str] = set()
@@ -2883,6 +2930,81 @@ def _open_ended_plan(
     return cleaned
 
 
+def _preferred_tags_for_prompt(prompt: str) -> set[str]:
+    q = normalize_query(prompt)
+    tags: set[str] = set()
+    if any(word in q for word in ("cpu", "ram", "memory", "net", "network", "io", "disk", "hottest", "busy", "usage", "utilization", "load")):
+        tags.add("utilization")
+    if any(word in q for word in ("postgres", "database", "db", "connections")):
+        tags.add("database")
+    if any(word in q for word in ("pod", "pods", "deployment", "job", "cronjob")):
+        tags.add("pods")
+    if any(word in q for word in ("workload", "service", "namespace")):
+        tags.add("workloads")
+    if any(word in q for word in ("ready", "not ready", "down", "unreachable", "availability")):
+        tags.add("availability")
+    if any(word in q for word in ("node", "nodes", "hardware", "arch", "architecture", "rpi", "jetson", "amd64", "arm64", "worker", "control-plane")):
+        tags.update({"hardware", "inventory", "architecture"})
+    return tags & _ALLOWED_INSIGHT_TAGS
+
+
+def _open_ended_insights(  # ask the model for fact-grounded insights and sanitize its JSON reply
+    prompt: str,
+    *,
+    fact_pack: str,
+    fact_meta: dict[str, dict[str, Any]],
+    history_lines: list[str],
+    count: int,
+    state: ThoughtState | None,
+    model: str | None,
+) -> list[dict[str, Any]]:
+    if state:
+        state.update("analyzing", note="scouting insights")
+    count = max(1, count)  # always request at least one insight
+    allowed_tags = ", ".join(sorted(_ALLOWED_INSIGHT_TAGS))
+    prompt_text = (
+        "Review the fact pack and propose up to "
+        f"{count} insights that could answer the question. "
+        "Each insight should be grounded in the facts. "
+        "Return JSON: {\"insights\":[{\"summary\":\"...\",\"fact_ids\":[\"F1\"],"
+        "\"relevance\":0-1,\"novelty\":0-1,\"tags\":[\"tag\"],\"rationale\":\"...\"}]}. "
+        f"Only use tags from: {allowed_tags}."
+    )
+    context = _append_history_context(fact_pack, history_lines)
+    result = _ollama_json_call(
+        prompt_text + f" Question: {prompt}",
+        context=context,
+        model=model,
+    )
+    insights = result.get("insights") if isinstance(result, dict) else None
+    cleaned: list[dict[str, Any]] = []
+    valid_ids = set(fact_meta.keys())  # only fact IDs present in the fact pack count as evidence
+    if isinstance(insights, list):
+        for item in insights:
+            if not isinstance(item, dict):
+                continue
+            summary = str(item.get("summary") or item.get("claim") or "").strip()
+            if not summary:
+                continue
+            raw_ids = item.get("fact_ids") if isinstance(item.get("fact_ids"), list) else []
+            fact_ids = [fid for fid in raw_ids if isinstance(fid, str) and fid in valid_ids]
+            if not fact_ids:  # drop insights that cite no known fact
+                continue
+            cleaned.append(
+                {
+                    "summary": summary,
+                    "fact_ids": fact_ids,
+                    "relevance": _normalize_fraction(item.get("relevance"), default=0.6),
+                    "novelty": _normalize_fraction(item.get("novelty"), default=0.5),
+                    "rationale": str(item.get("rationale") or ""),
+                    "tags": [t for t in (item.get("tags") if isinstance(item.get("tags"), list) else []) if isinstance(t, str)],  # same list guard as fact_ids: a bare string or number must not be iterated
+                }
+            )
+    if cleaned and state:
+        state.update("analyzing", note=_candidate_note(cleaned[0]))
+    return cleaned
+
+
 def _normalize_score(value: Any, *, default: int = 60) -> int:
     if isinstance(value, (int, float)):
         return int(max(0, min(100, value)))
@@ -2915,20 +3037,31 @@ def _open_ended_candidate(
     history_lines: list[str],
     state: ThoughtState | None,
     step: int,
+    fact_hints: list[str] | None = None,
+    model: str | None = None,
 ) -> dict[str, Any]:
     if state:
         state.update("drafting", step=step, note=focus)
+    hint_text = ""
+    if fact_hints:
+        hint_text = " Prioritize these fact IDs if relevant: " + ", ".join(fact_hints) + "."
     prompt_text = (
         "Using ONLY the fact pack, answer the question focusing on this angle: "
         f"{focus}. "
-        "Write 2-4 sentences in plain prose (not a list). "
+        "Write 2-4 sentences in plain prose (not a list)."
+        + hint_text
+        + " "
         "If you infer, label it as inference. "
         "List which fact pack IDs you used. "
         "Return JSON: {\"answer\":\"...\",\"facts_used\":[\"F1\"],\"confidence\":\"high|medium|low\","
         "\"relevance\":0-100,\"satisfaction\":0-100,\"risk\":\"low|medium|high\"}."
     )
     context = _append_history_context(fact_pack, history_lines)
-    result = _ollama_json_call(prompt_text + f" Question: {prompt}", context=context)
+    result = _ollama_json_call(
+        prompt_text + f" Question: {prompt}",
+        context=context,
+        model=model,
+    )
     if not isinstance(result, dict):
         result = {}
     answer = str(result.get("answer") or "").strip()
@@ -2986,9 +3119,12 @@ def _open_ended_synthesize(
     candidates: list[dict[str, Any]],
     state: ThoughtState | None,
     step: int,
+    model: str | None,
+    critique: str | None = None,
 ) -> str:
     if state:
         state.update("synthesizing", step=step, note="composing answer")
+    critique_block = f"\nCritique guidance: {critique}\n" if critique else "\n"
     synth_prompt = (
         "Compose the final answer to the question using the candidate answers below. "
         "Select the best 1-2 candidates, blend them if helpful, and keep 2-4 sentences. "
@@ -3001,6 +3137,7 @@ def _open_ended_synthesize(
         "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), "
         "HallucinationRisk (low|medium|high).\n"
         f"Question: {prompt}\n"
+        f"{critique_block}"
         f"Candidates: {json.dumps(candidates, ensure_ascii=False)}"
     )
     context = _append_history_context(fact_pack, history_lines)
@@ -3010,20 +3147,55 @@ def _open_ended_synthesize(
         context=context,
         fallback="I don't have enough data to answer that.",
         system_override=_open_ended_system(),
+        model=model,
     )
     return _ensure_scores(reply)
 
 
+def _open_ended_critique(
+    prompt: str,
+    *,
+    fact_pack: str,
+    history_lines: list[str],
+    candidates: list[dict[str, Any]],
+    state: ThoughtState | None,
+    step: int,
+    model: str | None,
+) -> str:
+    if state:
+        state.update("reviewing", step=step, note="quality check")
+    critique_prompt = (
+        "Review the candidate answers against the fact pack. "
+        "Identify any missing important detail or risky inference and give one sentence of guidance. "
+        "Return JSON: {\"guidance\":\"...\",\"risk\":\"low|medium|high\"}."
+    )
+    context = _append_history_context(fact_pack, history_lines)
+    result = _ollama_json_call(
+        critique_prompt + f" Question: {prompt} Candidates: {json.dumps(candidates, ensure_ascii=False)}",
+        context=context,
+        model=model,
+    )
+    if isinstance(result, dict):
+        guidance = str(result.get("guidance") or "").strip()
+        if guidance:
+            return guidance
+    return ""
+
+
 def _open_ended_multi(
     prompt: str,
     *,
     fact_pack: str,
+    fact_lines: list[str],
+    fact_meta: dict[str, dict[str, Any]],
     history_lines: list[str],
     mode: str,
     state: ThoughtState | None = None,
 ) -> str:
+    model = _model_for_mode(mode)
     angle_count = 2 if mode == "fast" else 4
-    total_steps = 1 + angle_count + 2
+    insight_count = 2 if mode == "fast" else 4
+    total_steps = 2 + angle_count + 2 + (1 if mode == "deep" else 0)
     if state:
         state.total_steps = total_steps
     angles = _open_ended_plan(
@@ -3032,10 +3204,57 @@ def _open_ended_multi(
         history_lines=history_lines,
         count=angle_count,
         state=state,
+        model=model,
     )
+    insights = _open_ended_insights(
+        prompt,
+        fact_pack=fact_pack,
+        fact_meta=fact_meta,
+        history_lines=history_lines,
+        count=insight_count,
+        state=state,
+        model=model,
+    )
+    seeds = _seed_insights(fact_lines, fact_meta, limit=max(4, insight_count))
+    insight_candidates = insights + seeds
+    subjective = _is_subjective_query(prompt)
+    prefer_tags = _preferred_tags_for_prompt(prompt)
+    history_tags = _history_tags(history_lines)
+    avoid_tags = history_tags if subjective else set()
+    preference = "novelty" if subjective else "relevance"
+    selected_insights = _select_diverse_insights(
+        insight_candidates,
+        preference=preference,
+        prefer_tags=prefer_tags,
+        avoid_tags=avoid_tags,
+        history_tags=history_tags,
+        fact_meta=fact_meta,
+        count=1 if mode == "fast" else 2,
+    )
+    if state and selected_insights:
+        state.update("analyzing", note=_candidate_note(selected_insights[0]))
+
+    angle_inputs: list[dict[str, Any]] = []
+    for insight in selected_insights:
+        angle_inputs.append(
+            {
+                "focus": str(insight.get("summary") or "Direct answer"),
+                "fact_ids": insight.get("fact_ids") or [],
+            }
+        )
+    for angle in angles:
+        if len(angle_inputs) >= angle_count:
+            break
+        angle_inputs.append(
+            {
+                "focus": str(angle.get("focus") or "Direct answer"),
+                "fact_ids": [],
+            }
+        )
+
     candidates: list[dict[str, Any]] = []
-    step = 2
-    for angle in angles[:angle_count]:
+    step = 3
+    for angle in angle_inputs[:angle_count]:
         candidates.append(
             _open_ended_candidate(
                 prompt,
@@ -3044,6 +3263,8 @@ def _open_ended_multi(
                 history_lines=history_lines,
                 state=state,
                 step=step,
+                fact_hints=angle.get("fact_ids") if isinstance(angle.get("fact_ids"), list) else None,
+                model=model,
             )
         )
         step += 1
@@ -3051,6 +3272,18 @@ def _open_ended_multi(
         state.update("evaluating", step=step, note="ranking candidates")
     selected = _select_candidates(candidates, count=1 if mode == "fast" else 2)
     step += 1
+    critique = ""
+    if mode == "deep":
+        critique = _open_ended_critique(
+            prompt,
+            fact_pack=fact_pack,
+            history_lines=history_lines,
+            candidates=selected or candidates,
+            state=state,
+            step=step,
+            model=model,
+        )
+        step += 1
     reply = _open_ended_synthesize(
         prompt,
         fact_pack=fact_pack,
@@ -3058,6 +3291,8 @@ def _open_ended_multi(
         candidates=selected or candidates,
         state=state,
         step=step,
+        model=model,
+        critique=critique,
     )
     if state:
         state.update("done", step=total_steps)
@@ -3066,19 +3301,23 @@ def _open_ended_multi(
 
 def _open_ended_total_steps(mode: str) -> int:
     angle_count = 2 if mode == "fast" else 4
-    return 1 + angle_count + 2
+    return 2 + angle_count + 2 + (1 if mode == "deep" else 0)
 
 
 def _open_ended_fast(
     prompt: str,
     *,
     fact_pack: str,
+    fact_lines: list[str],
+    fact_meta: dict[str, dict[str, Any]],
     history_lines: list[str],
     state: ThoughtState | None = None,
 ) -> str:
     return _open_ended_multi(
         prompt,
         fact_pack=fact_pack,
+        fact_lines=fact_lines,
+        fact_meta=fact_meta,
         history_lines=history_lines,
         mode="fast",
         state=state,
@@ -3089,12 +3328,16 @@ def _open_ended_deep(
     prompt: str,
     *,
     fact_pack: str,
+    fact_lines: list[str],
+    fact_meta: dict[str, dict[str, Any]],
     history_lines: list[str],
     state: ThoughtState | None = None,
 ) -> str:
     return _open_ended_multi(
         prompt,
         fact_pack=fact_pack,
+        fact_lines=fact_lines,
+        fact_meta=fact_meta,
         history_lines=history_lines,
         mode="deep",
         state=state,
@@ -3109,31 +3352,61 @@ def open_ended_answer(
     workloads: list[dict[str, Any]],
     history_lines: list[str],
     mode: str,
+    allow_tools: bool,
     state: ThoughtState | None = None,
 ) -> str:
     lines = _fact_pack_lines(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads)
+    if _knowledge_intent(prompt) or _doc_intent(prompt):
+        kb_detail = kb_retrieve(prompt)
+        if kb_detail:
+            for line in kb_detail.splitlines():
+                if line.strip():
+                    lines.append(line.strip())
+    tool_lines = _tool_fact_lines(prompt, allow_tools=allow_tools)
+    if tool_lines:
+        lines.extend(tool_lines)
     if not lines:
         return _ensure_scores("I don't have enough data to answer that.")
     fact_pack = _fact_pack_text(lines)
+    fact_meta = _fact_pack_meta(lines)
     if mode == "fast":
         return _open_ended_fast(
             prompt,
             fact_pack=fact_pack,
+            fact_lines=lines,
+            fact_meta=fact_meta,
             history_lines=history_lines,
             state=state,
         )
     return _open_ended_deep(
         prompt,
         fact_pack=fact_pack,
+        fact_lines=lines,
+        fact_meta=fact_meta,
         history_lines=history_lines,
         state=state,
     )
 
 
-def _non_cluster_reply(prompt: str) -> str:
-    return _ensure_scores(
-        "I focus on the Atlas/Othrys cluster and don't have enough data to answer that."
+def _non_cluster_reply(prompt: str, *, history_lines: list[str], mode: str) -> str:  # general-assistant reply for non-cluster prompts
+    system = (
+        "System: You are Atlas, a helpful general assistant. "
+        "Answer using common knowledge when possible, and say when you're unsure. "
+        "Be concise and avoid unnecessary caveats. "
+        "Respond in plain sentences (no lists unless asked). "
+        "End every response with a line: 'Confidence: high|medium|low'."
     )
+    model = _model_for_mode(mode)
+    context = _append_history_context("", history_lines) if history_lines else ""
+    reply = _ollama_call_safe(  # do not let an Ollama failure raise into callers such as the HTTP handler
+        ("general", "reply"),
+        prompt,
+        context=context,
+        fallback="I focus on the Atlas/Othrys cluster and don't have enough data to answer that.",  # previous canned reply, now the degraded path
+        system_override=system,
+        model=model,
+    )
+    return _ensure_scores(reply)
 
 
 # Internal HTTP endpoint for cluster answers (website uses this).
@@ -3183,7 +3456,11 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
             return
         cleaned = _strip_bot_mention(prompt)
         mode = str(payload.get("mode") or "deep").lower()
-        if mode not in ("fast", "deep"):
+        if mode in ("quick", "fast"):
+            mode = "fast"
+        elif mode in ("smart", "deep"):
+            mode = "deep"
+        else:
             mode = "deep"
         snapshot = _snapshot_state()
         inventory = _snapshot_inventory(snapshot) or node_inventory_live()
@@ -3212,37 +3489,19 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
                 snapshot=snapshot,
                 workloads=workloads,
             )
-        fallback = "I don't have enough data to answer that."
         if cluster_query:
-            open_ended = (
-                _is_subjective_query(cleaned)
-                or _knowledge_intent(cleaned)
-                or _is_overview_query(cleaned)
-                or _doc_intent(cleaned)
+            answer = open_ended_answer(
+                cleaned,
+                inventory=inventory,
+                snapshot=snapshot,
+                workloads=workloads,
+                history_lines=history_lines,
+                mode=mode,
+                allow_tools=False,
+                state=None,
             )
-            if open_ended:
-                answer = open_ended_answer(
-                    cleaned,
-                    inventory=inventory,
-                    snapshot=snapshot,
-                    workloads=workloads,
-                    history_lines=history_lines,
-                    mode=mode,
-                    state=None,
-                )
-            else:
-                answer = (
-                    cluster_answer(
-                        cleaned,
-                        inventory=inventory,
-                        snapshot=snapshot,
-                        workloads=workloads,
-                        history_lines=history_lines,
-                    )
-                    or fallback
-                )
         else:
-            answer = _non_cluster_reply(cleaned)
+            answer = _non_cluster_reply(cleaned, history_lines=history_lines, mode=mode)
         self._write_json(200, {"answer": answer})
 
 
@@ -3490,6 +3749,7 @@ def _ollama_call(
     context: str,
     use_history: bool = True,
     system_override: str | None = None,
+    model: str | None = None,
 ) -> str:
     system = system_override or (
         "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. "
@@ -3521,7 +3781,8 @@ def _ollama_call(
         messages.extend(_history_to_messages(history[hist_key][-24:]))
     messages.append({"role": "user", "content": prompt})
 
-    payload = {"model": MODEL, "messages": messages, "stream": False}
+    model_name = model or MODEL
+    payload = {"model": model_name, "messages": messages, "stream": False}
     headers = {"Content-Type": "application/json"}
     if API_KEY:
         headers["x-api-key"] = API_KEY
@@ -3561,11 +3822,18 @@ def ollama_reply(
     context: str,
     fallback: str = "",
     use_history: bool = True,
+    model: str | None = None,
 ) -> str:
     last_error = None
     for attempt in range(max(1, OLLAMA_RETRIES + 1)):
         try:
-            return _ollama_call(hist_key, prompt, context=context, use_history=use_history)
+            return _ollama_call(
+                hist_key,
+                prompt,
+                context=context,
+                use_history=use_history,
+                model=model,
+            )
         except Exception as exc:  # noqa: BLE001
             last_error = exc
             time.sleep(min(4, 2 ** attempt))
@@ -3584,6 +3852,7 @@ def ollama_reply_with_thinking(
     context: str,
     fallback: str,
     use_history: bool = True,
+    model: str | None = None,
 ) -> str:
     result: dict[str, str] = {"reply": ""}
     done = threading.Event()
@@ -3595,6 +3864,7 @@ def ollama_reply_with_thinking(
             context=context,
             fallback=fallback,
             use_history=use_history,
+            model=model,
         )
         done.set()
 
@@ -3627,6 +3897,7 @@ def open_ended_with_thinking(
     workloads: list[dict[str, Any]],
     history_lines: list[str],
     mode: str,
+    allow_tools: bool,
 ) -> str:
     result: dict[str, str] = {"reply": ""}
     done = threading.Event()
@@ -3641,6 +3912,7 @@ def open_ended_with_thinking(
             workloads=workloads,
             history_lines=history_lines,
             mode=mode,
+            allow_tools=allow_tools,
             state=state,
         )
         done.set()
@@ -3766,39 +4038,24 @@ def sync_loop(token: str, room_id: str):
                     extra = "VictoriaMetrics (PromQL result):\n" + rendered
                     send_msg(token, rid, extra)
                     continue
-                fallback = "I don't have enough data to answer that."
-
                 if cluster_query:
-                    open_ended = (
-                        _is_subjective_query(cleaned_body)
-                        or _knowledge_intent(cleaned_body)
-                        or _is_overview_query(cleaned_body)
-                        or _doc_intent(cleaned_body)
+                    reply = open_ended_with_thinking(
+                        token,
+                        rid,
+                        cleaned_body,
+                        inventory=inventory,
+                        snapshot=snapshot,
+                        workloads=workloads,
+                        history_lines=history[hist_key],
+                        mode=mode if mode in ("fast", "deep") else "deep",
+                        allow_tools=allow_tools,
                     )
-                    if open_ended:
-                        reply = open_ended_with_thinking(
-                            token,
-                            rid,
-                            cleaned_body,
-                            inventory=inventory,
-                            snapshot=snapshot,
-                            workloads=workloads,
-                            history_lines=history[hist_key],
-                            mode=mode if mode in ("fast", "deep") else "deep",
-                        )
-                    else:
-                        reply = (
-                            cluster_answer(
-                                cleaned_body,
-                                inventory=inventory,
-                                snapshot=snapshot,
-                                workloads=workloads,
-                                history_lines=history[hist_key],
-                            )
-                            or fallback
-                        )
                 else:
-                    reply = _non_cluster_reply(cleaned_body)
+                    reply = _non_cluster_reply(
+                        cleaned_body,
+                        history_lines=history[hist_key],
+                        mode=mode if mode in ("fast", "deep") else "deep",
+                    )
                 send_msg(token, rid, reply)
                 history[hist_key].append(f"Atlas: {reply}")
                 history[hist_key] = history[hist_key][-80:]

From 86623b4596016c1832a21eb63ba6366387db13ca Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 23:49:28 -0300
Subject: [PATCH 385/416] atlasbot: fix insight scoring

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 services/comms/scripts/atlasbot/bot.py  | 8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index b08f20db..26699b3c 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-82
+        checksum/atlasbot-configmap: manual-atlasbot-83
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 7e6341e6..dd6ea2ee 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -2724,6 +2724,14 @@ def _insight_score(
     return base
 
 
+def _score_insight(insight: dict[str, Any], preference: str) -> float:
+    relevance = _normalize_fraction(insight.get("relevance"), default=0.6)
+    novelty = _normalize_fraction(insight.get("novelty"), default=0.5)
+    if preference == "novelty":
+        return novelty * 0.6 + relevance * 0.4
+    return relevance * 0.6 + novelty * 0.4
+
+
 def _select_diverse_insights(
     candidates: list[dict[str, Any]],
     *,

From a8bebb39346e83a01ee10cdfca8904a7a88529a0 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 27 Jan 2026 23:57:36 -0300
Subject: [PATCH 386/416] atlasbot: speed up fast mode

---
 services/comms/atlasbot-deployment.yaml |  2 +-
 services/comms/scripts/atlasbot/bot.py  | 57 +++++++++++++++----------
 2 files changed, 35 insertions(+), 24 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 26699b3c..b9b8ea70 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-83
+        checksum/atlasbot-configmap: manual-atlasbot-84
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index dd6ea2ee..91084783 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -3201,28 +3201,37 @@ def _open_ended_multi(
     state: ThoughtState | None = None,
 ) -> str:
     model = _model_for_mode(mode)
-    angle_count = 2 if mode == "fast" else 4
-    insight_count = 2 if mode == "fast" else 4
-    total_steps = 2 + angle_count + 2 + (1 if mode == "deep" else 0)
+    if mode == "fast":
+        angle_count = 1
+        insight_count = 1
+        total_steps = 2
+    else:
+        angle_count = 4
+        insight_count = 4
+        total_steps = 2 + angle_count + 2 + 1
     if state:
         state.total_steps = total_steps
-    angles = _open_ended_plan(
-        prompt,
-        fact_pack=fact_pack,
-        history_lines=history_lines,
-        count=angle_count,
-        state=state,
-        model=model,
-    )
-    insights = _open_ended_insights(
-        prompt,
-        fact_pack=fact_pack,
-        fact_meta=fact_meta,
-        history_lines=history_lines,
-        count=insight_count,
-        state=state,
-        model=model,
-    )
+
+    angles: list[dict[str, Any]] = []
+    insights: list[dict[str, Any]] = []
+    if mode != "fast":
+        angles = _open_ended_plan(
+            prompt,
+            fact_pack=fact_pack,
+            history_lines=history_lines,
+            count=angle_count,
+            state=state,
+            model=model,
+        )
+        insights = _open_ended_insights(
+            prompt,
+            fact_pack=fact_pack,
+            fact_meta=fact_meta,
+            history_lines=history_lines,
+            count=insight_count,
+            state=state,
+            model=model,
+        )
     seeds = _seed_insights(fact_lines, fact_meta, limit=max(4, insight_count))
     insight_candidates = insights + seeds
     subjective = _is_subjective_query(prompt)
@@ -3261,7 +3270,7 @@ def _open_ended_multi(
         )
 
     candidates: list[dict[str, Any]] = []
-    step = 3
+    step = 1 if mode == "fast" else 3
     for angle in angle_inputs[:angle_count]:
         candidates.append(
             _open_ended_candidate(
@@ -3308,8 +3317,10 @@ def _open_ended_multi(
 
 
 def _open_ended_total_steps(mode: str) -> int:
-    angle_count = 2 if mode == "fast" else 4
-    return 2 + angle_count + 2 + (1 if mode == "deep" else 0)
+    if mode == "fast":
+        return 2
+    angle_count = 4
+    return 2 + angle_count + 2 + 1
 
 
 def _open_ended_fast(

From 683dad9e201d72389f46b3fe58036f121b74cb01 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 00:22:32 -0300
Subject: [PATCH 387/416] atlasbot: improve multi-pass synthesis

---
 services/comms/scripts/atlasbot/bot.py | 307 +++++++++++++++++++------
 1 file changed, 239 insertions(+), 68 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 91084783..df718e6e 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -2559,8 +2559,13 @@ def _fact_pack_lines(
     return lines
 
 
-def _fact_pack_text(lines: list[str]) -> str:
-    labeled = [f"F{idx + 1}: {line}" for idx, line in enumerate(lines)]
+def _fact_pack_text(lines: list[str], fact_meta: dict[str, dict[str, Any]]) -> str:
+    labeled: list[str] = []
+    for idx, line in enumerate(lines):
+        fid = f"F{idx + 1}"
+        tags = fact_meta.get(fid, {}).get("tags") or []
+        tag_text = f" [tags: {', '.join(tags)}]" if tags else ""
+        labeled.append(f"{fid}{tag_text}: {line}")
     return "Fact pack:\n" + "\n".join(labeled)
 
 
@@ -2782,7 +2787,8 @@ def _open_ended_system() -> str:
         "Use ONLY the provided fact pack and recent chat as your evidence. "
         "You may draw light inferences if you label them as such. "
         "Write concise, human sentences with a helpful, calm tone (not a list). "
-        "If the question is subjective, share a light opinion grounded in facts and explain why it stands out. "
+        "If the question is subjective (cool/interesting/unconventional), pick a standout fact and explain why it stands out. "
+        "If the question asks for a list, embed the list inline in a sentence (comma-separated). "
         "If the question is ambiguous, pick a reasonable interpretation and state it briefly. "
         "Avoid repeating the exact same observation as the last response if possible; vary across metrics, workload, or hardware details. "
         "Do not invent numbers or facts. "
@@ -2938,6 +2944,67 @@ def _open_ended_plan(
     return cleaned
 
 
+def _sanitize_focus_tags(raw_tags: list[Any]) -> list[str]:
+    tags: list[str] = []
+    for tag in raw_tags:
+        if not isinstance(tag, str):
+            continue
+        tag = tag.strip()
+        if tag in _ALLOWED_INSIGHT_TAGS and tag not in tags:
+            tags.append(tag)
+    return tags
+
+
+def _open_ended_interpret(
+    prompt: str,
+    *,
+    fact_pack: str,
+    history_lines: list[str],
+    state: ThoughtState | None,
+    model: str | None,
+) -> dict[str, Any]:
+    if state:
+        state.update("interpreting", step=1, note="reading question")
+    allowed_tags = ", ".join(sorted(_ALLOWED_INSIGHT_TAGS))
+    prompt_text = (
+        "Classify how to answer the question using only the fact pack. "
+        "Return JSON: {\"style\":\"objective|subjective\","
+        "\"tone\":\"neutral|curious|enthusiastic\","
+        "\"focus_tags\":[\"tag\"],"
+        "\"focus_label\":\"short phrase\","
+        "\"allow_list\":true|false}. "
+        "Use allow_list=true only if the question explicitly asks for names or lists. "
+        f"Only use tags from: {allowed_tags}."
+    )
+    context = _append_history_context(fact_pack, history_lines)
+    result = _ollama_json_call(
+        prompt_text + f" Question: {prompt}",
+        context=context,
+        model=model,
+    )
+    if not isinstance(result, dict):
+        result = {}
+    style = str(result.get("style") or "").strip().lower()
+    if style not in ("objective", "subjective"):
+        style = "subjective" if _is_subjective_query(prompt) else "objective"
+    tone = str(result.get("tone") or "neutral").strip().lower()
+    if tone not in ("neutral", "curious", "enthusiastic"):
+        tone = "neutral"
+    focus_tags = _sanitize_focus_tags(result.get("focus_tags") or [])
+    focus_label = str(result.get("focus_label") or "").strip()
+    allow_list = result.get("allow_list")
+    if not isinstance(allow_list, bool):
+        q = normalize_query(prompt)
+        allow_list = any(phrase in q for phrase in ("list", "which", "what are", "names"))
+    return {
+        "style": style,
+        "tone": tone,
+        "focus_tags": focus_tags,
+        "focus_label": focus_label,
+        "allow_list": allow_list,
+    }
+
+
 def _preferred_tags_for_prompt(prompt: str) -> set[str]:
     q = normalize_query(prompt)
     tags: set[str] = set()
@@ -3013,6 +3080,71 @@ def _open_ended_insights(
     return cleaned
 
 
+def _fallback_fact_ids(
+    fact_meta: dict[str, dict[str, Any]],
+    *,
+    focus_tags: set[str],
+    count: int,
+) -> list[str]:
+    if not fact_meta:
+        return []
+    if focus_tags:
+        tagged = [
+            fid
+            for fid, meta in fact_meta.items()
+            if focus_tags & set(meta.get("tags") or [])
+        ]
+        if tagged:
+            return tagged[:count]
+    return list(fact_meta.keys())[:count]
+
+
+def _open_ended_select_facts(
+    prompt: str,
+    *,
+    fact_pack: str,
+    fact_meta: dict[str, dict[str, Any]],
+    history_lines: list[str],
+    focus_tags: set[str],
+    avoid_fact_ids: list[str],
+    count: int,
+    subjective: bool,
+    state: ThoughtState | None,
+    step: int,
+    model: str | None,
+) -> list[str]:
+    if state:
+        state.update("selecting facts", step=step, note="picking evidence")
+    focus_hint = ", ".join(sorted(focus_tags)) if focus_tags else "any"
+    avoid_hint = ", ".join(avoid_fact_ids) if avoid_fact_ids else "none"
+    prompt_text = (
+        "Select the fact IDs that best answer the question. "
+        f"Pick up to {count} fact IDs. "
+        f"Focus tags: {focus_hint}. "
+        f"Avoid these fact IDs: {avoid_hint}. "
+        "If the question is subjective, pick standout or unusual facts; "
+        "if objective, pick the minimal facts needed. "
+        "Return JSON: {\"fact_ids\":[\"F1\"...],\"note\":\"...\"}."
+    )
+    context = _append_history_context(fact_pack, history_lines)
+    result = _ollama_json_call(
+        prompt_text + f" Question: {prompt}",
+        context=context,
+        model=model,
+    )
+    fact_ids = result.get("fact_ids") if isinstance(result, dict) else None
+    selected: list[str] = []
+    if isinstance(fact_ids, list):
+        for fid in fact_ids:
+            if isinstance(fid, str) and fid in fact_meta and fid not in selected:
+                selected.append(fid)
+            if len(selected) >= count:
+                break
+    if not selected:
+        selected = _fallback_fact_ids(fact_meta, focus_tags=focus_tags, count=count)
+    return selected
+
+
 def _normalize_score(value: Any, *, default: int = 60) -> int:
     if isinstance(value, (int, float)):
         return int(max(0, min(100, value)))
@@ -3043,6 +3175,9 @@ def _open_ended_candidate(
     focus: str,
     fact_pack: str,
     history_lines: list[str],
+    subjective: bool,
+    tone: str,
+    allow_list: bool,
     state: ThoughtState | None,
     step: int,
     fact_hints: list[str] | None = None,
@@ -3053,10 +3188,23 @@ def _open_ended_candidate(
     hint_text = ""
     if fact_hints:
         hint_text = " Prioritize these fact IDs if relevant: " + ", ".join(fact_hints) + "."
+    style_hint = (
+        "Offer a brief opinion grounded in facts and explain why it stands out. "
+        if subjective
+        else "Answer directly and succinctly. "
+    )
+    list_hint = (
+        "If a list is requested, embed it inline in a sentence (comma-separated). "
+        if allow_list
+        else "Avoid bullet lists. "
+    )
     prompt_text = (
         "Using ONLY the fact pack, answer the question focusing on this angle: "
         f"{focus}. "
-        "Write 2-4 sentences in plain prose (not a list)."
+        f"Tone: {tone}. "
+        + style_hint
+        + list_hint
+        + "Write 2-4 sentences in plain prose."
         + hint_text
         + " "
         "If you infer, label it as inference. "
@@ -3125,6 +3273,9 @@ def _open_ended_synthesize(
     fact_pack: str,
     history_lines: list[str],
     candidates: list[dict[str, Any]],
+    subjective: bool,
+    tone: str,
+    allow_list: bool,
     state: ThoughtState | None,
     step: int,
     model: str | None,
@@ -3133,6 +3284,16 @@ def _open_ended_synthesize(
     if state:
         state.update("synthesizing", step=step, note="composing answer")
     critique_block = f"\nCritique guidance: {critique}\n" if critique else "\n"
+    style_hint = (
+        "If the question is subjective, share a light opinion grounded in facts and explain why it stands out. "
+        if subjective
+        else "Answer directly without extra caveats. "
+    )
+    list_hint = (
+        "If a list is requested, embed it inline in a sentence (comma-separated). "
+        if allow_list
+        else "Avoid bullet lists. "
+    )
     synth_prompt = (
         "Compose the final answer to the question using the candidate answers below. "
         "Select the best 1-2 candidates, blend them if helpful, and keep 2-4 sentences. "
@@ -3140,7 +3301,10 @@ def _open_ended_synthesize(
         "If you infer, label it as inference. "
         "Do not claim nodes are missing or not ready unless the fact pack explicitly lists "
         "nodes_not_ready or expected_workers_missing. "
-        "Keep the tone conversational and answer the user's intent directly. "
+        f"Tone: {tone}. "
+        + style_hint
+        + list_hint
+        + "Keep the tone conversational and answer the user's intent directly. "
         "Avoid repeating the last response if possible. "
         "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), "
         "HallucinationRisk (low|medium|high).\n"
@@ -3202,85 +3366,90 @@ def _open_ended_multi(
 ) -> str:
     model = _model_for_mode(mode)
     if mode == "fast":
-        angle_count = 1
-        insight_count = 1
-        total_steps = 2
+        total_steps = 4
     else:
-        angle_count = 4
-        insight_count = 4
-        total_steps = 2 + angle_count + 2 + 1
+        total_steps = 7
     if state:
         state.total_steps = total_steps
 
-    angles: list[dict[str, Any]] = []
-    insights: list[dict[str, Any]] = []
-    if mode != "fast":
-        angles = _open_ended_plan(
-            prompt,
-            fact_pack=fact_pack,
-            history_lines=history_lines,
-            count=angle_count,
-            state=state,
-            model=model,
-        )
-        insights = _open_ended_insights(
+    interpretation = _open_ended_interpret(
+        prompt,
+        fact_pack=fact_pack,
+        history_lines=history_lines,
+        state=state,
+        model=model,
+    )
+    style = interpretation.get("style") or "objective"
+    subjective = style == "subjective" or _is_subjective_query(prompt)
+    tone = str(interpretation.get("tone") or "").strip().lower()
+    if tone not in ("neutral", "curious", "enthusiastic"):
+        tone = "curious" if subjective else "neutral"
+    allow_list = bool(interpretation.get("allow_list"))
+    focus_tags = set(interpretation.get("focus_tags") or []) or _preferred_tags_for_prompt(prompt)
+    if not focus_tags and subjective:
+        focus_tags = set(_ALLOWED_INSIGHT_TAGS)
+
+    primary_ids = _open_ended_select_facts(
+        prompt,
+        fact_pack=fact_pack,
+        fact_meta=fact_meta,
+        history_lines=history_lines,
+        focus_tags=focus_tags,
+        avoid_fact_ids=[],
+        count=4 if mode == "deep" else 3,
+        subjective=subjective,
+        state=state,
+        step=2,
+        model=model,
+    )
+    alternate_ids: list[str] = []
+    if mode == "deep":
+        alternate_ids = _open_ended_select_facts(
             prompt,
             fact_pack=fact_pack,
             fact_meta=fact_meta,
             history_lines=history_lines,
-            count=insight_count,
+            focus_tags=focus_tags,
+            avoid_fact_ids=primary_ids,
+            count=4,
+            subjective=subjective,
             state=state,
+            step=3,
             model=model,
         )
-    seeds = _seed_insights(fact_lines, fact_meta, limit=max(4, insight_count))
-    insight_candidates = insights + seeds
-    subjective = _is_subjective_query(prompt)
-    prefer_tags = _preferred_tags_for_prompt(prompt)
-    history_tags = _history_tags(history_lines)
-    avoid_tags = history_tags if subjective else set()
-    preference = "novelty" if subjective else "relevance"
-    selected_insights = _select_diverse_insights(
-        insight_candidates,
-        preference=preference,
-        prefer_tags=prefer_tags,
-        avoid_tags=avoid_tags,
-        history_tags=history_tags,
-        fact_meta=fact_meta,
-        count=1 if mode == "fast" else 2,
-    )
-    if state and selected_insights:
-        state.update("analyzing", note=_candidate_note(selected_insights[0]))
-
-    angle_inputs: list[dict[str, Any]] = []
-    for insight in selected_insights:
-        angle_inputs.append(
-            {
-                "focus": str(insight.get("summary") or "Direct answer"),
-                "fact_ids": insight.get("fact_ids") or [],
-            }
-        )
-    for angle in angles:
-        if len(angle_inputs) >= angle_count:
-            break
-        angle_inputs.append(
-            {
-                "focus": str(angle.get("focus") or "Direct answer"),
-                "fact_ids": [],
-            }
-        )
 
     candidates: list[dict[str, Any]] = []
-    step = 1 if mode == "fast" else 3
-    for angle in angle_inputs[:angle_count]:
+    focus_label = interpretation.get("focus_label") or "primary angle"
+    step = 3 if mode == "fast" else 4
+    candidates.append(
+        _open_ended_candidate(
+            prompt,
+            focus=str(focus_label),
+            fact_pack=fact_pack,
+            history_lines=history_lines,
+            subjective=subjective,
+            tone=str(tone),
+            allow_list=allow_list,
+            state=state,
+            step=step,
+            fact_hints=primary_ids,
+            model=model,
+        )
+    )
+    step += 1
+    if mode == "deep" and alternate_ids:
         candidates.append(
             _open_ended_candidate(
                 prompt,
-                focus=str(angle.get("focus") or "Direct answer"),
+                focus="alternate angle",
                 fact_pack=fact_pack,
                 history_lines=history_lines,
+                subjective=subjective,
+                tone=str(tone),
+                allow_list=allow_list,
                 state=state,
                 step=step,
-                fact_hints=angle.get("fact_ids") if isinstance(angle.get("fact_ids"), list) else None,
+                fact_hints=alternate_ids,
                 model=model,
             )
         )
@@ -3306,6 +3475,9 @@ def _open_ended_multi(
         fact_pack=fact_pack,
         history_lines=history_lines,
         candidates=selected or candidates,
+        subjective=subjective,
+        tone=str(tone),
+        allow_list=allow_list,
         state=state,
         step=step,
         model=model,
@@ -3318,9 +3490,8 @@ def _open_ended_multi(
 
 def _open_ended_total_steps(mode: str) -> int:
     if mode == "fast":
-        return 2
-    angle_count = 4
-    return 2 + angle_count + 2 + 1
+        return 4
+    return 7
 
 
 def _open_ended_fast(
@@ -3386,8 +3557,8 @@ def open_ended_answer(
         lines.extend(tool_lines)
     if not lines:
         return _ensure_scores("I don't have enough data to answer that.")
-    fact_pack = _fact_pack_text(lines)
     fact_meta = _fact_pack_meta(lines)
+    fact_pack = _fact_pack_text(lines, fact_meta)
     if mode == "fast":
         return _open_ended_fast(
             prompt,

From 2f6a64870b5b0009e3d064d514c8d6e6f29ac356 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 00:24:13 -0300
Subject: [PATCH 388/416] atlasbot: roll config

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index b9b8ea70..bc6790bd 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-84
+        checksum/atlasbot-configmap: manual-atlasbot-85
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From 659d07a7aa33597a5491a1a0cdfca65c900c4368 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 01:02:14 -0300
Subject: [PATCH 389/416] atlasbot: enrich fact pack and selection

---
 services/comms/scripts/atlasbot/bot.py | 104 +++++++++++++++++++++++--
 1 file changed, 96 insertions(+), 8 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index df718e6e..55c6da2b 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -936,6 +936,28 @@ def _node_usage_table(metrics: dict[str, Any]) -> list[dict[str, Any]]:
             per_node.setdefault(node, {})[metric_name] = entry.get("value")
     return [{"node": node, **vals} for node, vals in sorted(per_node.items())]
 
+def _usage_extremes(usage_table: list[dict[str, Any]]) -> dict[str, tuple[str, float]]:
+    extremes: dict[str, tuple[str, float]] = {}
+    for metric in ("cpu", "ram", "net", "io"):
+        values: list[tuple[str, float]] = []
+        for entry in usage_table:
+            node = entry.get("node")
+            raw = entry.get(metric)
+            if not node or raw is None:
+                continue
+            try:
+                value = float(raw)
+            except (TypeError, ValueError):
+                continue
+            values.append((node, value))
+        if not values:
+            continue
+        lowest = min(values, key=lambda item: item[1])
+        highest = max(values, key=lambda item: item[1])
+        extremes[f"min_{metric}"] = lowest
+        extremes[f"max_{metric}"] = highest
+    return extremes
+
 def _workloads_for_facts(workloads: list[dict[str, Any]], limit: int = 25) -> list[dict[str, Any]]:
     cleaned: list[dict[str, Any]] = []
     for entry in workloads:
@@ -1023,6 +1045,13 @@ def facts_context(
             lines.append(f"- arch {key}: {', '.join(nodes_list)}")
     if control_plane_nodes:
         lines.append(f"- control_plane_nodes: {', '.join(control_plane_nodes)}")
+        control_plane_by_hw: dict[str, list[str]] = collections.defaultdict(list)
+        for node in inv:
+            if node.get("name") in control_plane_nodes:
+                control_plane_by_hw[node.get("hardware") or "unknown"].append(node["name"])
+        parts = [f"{hw}={', '.join(sorted(nodes))}" for hw, nodes in sorted(control_plane_by_hw.items())]
+        if parts:
+            lines.append(f"- control_plane_by_hardware: {', '.join(parts)}")
     if worker_nodes:
         lines.append(f"- worker_nodes: {', '.join(worker_nodes)}")
     if ready_workers or not_ready_workers:
@@ -1068,6 +1097,22 @@ def facts_context(
         if value is not None:
             lines.append(f"- {key}: {value}")
 
+    top_restarts = metrics.get("top_restarts_1h") if isinstance(metrics.get("top_restarts_1h"), list) else []
+    if top_restarts:
+        items = []
+        for entry in top_restarts[:5]:
+            if not isinstance(entry, dict):
+                continue
+            metric = entry.get("metric") or {}
+            pod = metric.get("pod") or metric.get("name") or ""
+            ns = metric.get("namespace") or ""
+            value = entry.get("value")
+            label = f"{ns}/{pod}".strip("/")
+            if label and value is not None:
+                items.append(f"{label}={value}")
+        if items:
+            lines.append(f"- top_restarts_1h: {', '.join(items)}")
+
     usage_table = _node_usage_table(metrics)
     if usage_table:
         lines.append("- node_usage (cpu/ram/net/io):")
@@ -1088,6 +1133,18 @@ def facts_context(
                 else ""
             )
             lines.append(f"  - {node}: cpu={cpu}, ram={ram}, net={net}, io={io_val}")
+        extremes = _usage_extremes(usage_table)
+        for metric in ("cpu", "ram", "net", "io"):
+            min_key = f"min_{metric}"
+            if min_key not in extremes:
+                continue
+            node, value = extremes[min_key]
+            value_fmt = _format_metric_value(
+                str(value),
+                percent=metric in ("cpu", "ram"),
+                rate=metric in ("net", "io"),
+            )
+            lines.append(f"- lowest_{metric}: {node} ({value_fmt})")
 
     if nodes_in_query:
         lines.append("- node_details:")
@@ -1112,13 +1169,37 @@ def facts_context(
             wl = entry.get("workload") or ""
             primary = entry.get("primary_node") or ""
             pods_total = entry.get("pods_total")
+            pods_running = entry.get("pods_running")
             label = f"{ns}/{wl}" if ns and wl else (wl or ns)
             if not label:
                 continue
             if primary:
-                lines.append(f"  - {label}: primary_node={primary}, pods_total={pods_total}")
+                lines.append(
+                    f"  - {label}: primary_node={primary}, pods_total={pods_total}, pods_running={pods_running}"
+                )
             else:
-                lines.append(f"  - {label}: pods_total={pods_total}")
+                lines.append(f"  - {label}: pods_total={pods_total}, pods_running={pods_running}")
+        top = max(
+            (entry for entry in workload_entries if isinstance(entry.get("pods_total"), (int, float))),
+            key=lambda item: item.get("pods_total", 0),
+            default=None,
+        )
+        if isinstance(top, dict) and top.get("pods_total") is not None:
+            label = f"{top.get('namespace')}/{top.get('workload')}".strip("/")
+            lines.append(f"- workload_most_pods: {label} ({top.get('pods_total')})")
+        zero_running = [
+            entry
+            for entry in workload_entries
+            if isinstance(entry.get("pods_running"), (int, float)) and entry.get("pods_running") == 0
+        ]
+        if zero_running:
+            labels = []
+            for entry in zero_running:
+                label = f"{entry.get('namespace')}/{entry.get('workload')}".strip("/")
+                if label:
+                    labels.append(label)
+            if labels:
+                lines.append(f"- workloads_zero_running: {', '.join(labels)}")
 
     rendered = "\n".join(lines)
     return rendered[:MAX_FACTS_CHARS]
@@ -2609,15 +2690,15 @@ def _fact_line_tags(line: str) -> set[str]:
         tags.add("architecture")
     if any(key in text for key in ("rpi", "jetson", "amd64", "arm64", "non_raspberry_pi")):
         tags.update({"hardware", "inventory"})
-    if "control_plane_nodes" in text or "worker_nodes" in text:
+    if "control_plane_nodes" in text or "control_plane_by_hardware" in text or "worker_nodes" in text:
         tags.add("inventory")
-    if any(key in text for key in ("hottest_", "node_usage", "cpu=", "ram=", "net=", "io=")):
+    if any(key in text for key in ("hottest_", "lowest_", "node_usage", "cpu=", "ram=", "net=", "io=")):
         tags.add("utilization")
     if "postgres_" in text or "postgres connections" in text:
         tags.add("database")
-    if "pods_" in text or "pod phases" in text:
+    if "pods_" in text or "pod phases" in text or "restarts" in text:
         tags.add("pods")
-    if "workloads" in text or "primary_node" in text:
+    if "workloads" in text or "primary_node" in text or "workload_" in text:
         tags.add("workloads")
     if "node_details" in text:
         tags.add("node_detail")
@@ -3140,8 +3221,15 @@ def _open_ended_select_facts(
                 selected.append(fid)
             if len(selected) >= count:
                 break
-    if not selected:
-        selected = _fallback_fact_ids(fact_meta, focus_tags=focus_tags, count=count)
+    seed = _fallback_fact_ids(fact_meta, focus_tags=focus_tags, count=count)
+    if selected:
+        for fid in seed:
+            if fid not in selected:
+                selected.append(fid)
+            if len(selected) >= count:
+                break
+    else:
+        selected = seed
     return selected
 
 

From d151bcde6fa9e532893802ad197127bcf35e1805 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 01:02:32 -0300
Subject: [PATCH 390/416] atlasbot: roll config

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index bc6790bd..7ce144c6 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-85
+        checksum/atlasbot-configmap: manual-atlasbot-86
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From 62feb26afd1cb7719b299f73773a72f47e9971c7 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 01:07:13 -0300
Subject: [PATCH 391/416] atlasbot: add per-hardware extremes

---
 services/comms/scripts/atlasbot/bot.py | 27 ++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 55c6da2b..50fed4b6 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -1145,6 +1145,33 @@ def facts_context(
                 rate=metric in ("net", "io"),
             )
             lines.append(f"- lowest_{metric}: {node} ({value_fmt})")
+        for metric in ("cpu", "ram"):
+            hottest_parts: list[str] = []
+            lowest_parts: list[str] = []
+            for hw, nodes_list in sorted(by_hardware.items()):
+                entries = []
+                for entry in usage_table:
+                    node = entry.get("node")
+                    if node in nodes_list and entry.get(metric) is not None:
+                        try:
+                            value = float(entry.get(metric))
+                        except (TypeError, ValueError):
+                            continue
+                        entries.append((node, value))
+                if not entries:
+                    continue
+                max_node, max_val = max(entries, key=lambda item: item[1])
+                min_node, min_val = min(entries, key=lambda item: item[1])
+                hottest_parts.append(
+                    f"{hw}={max_node} ({_format_metric_value(str(max_val), percent=True)})"
+                )
+                lowest_parts.append(
+                    f"{hw}={min_node} ({_format_metric_value(str(min_val), percent=True)})"
+                )
+            if hottest_parts:
+                lines.append(f"- hottest_{metric}_by_hardware: {', '.join(hottest_parts)}")
+            if lowest_parts:
+                lines.append(f"- lowest_{metric}_by_hardware: {', '.join(lowest_parts)}")
 
     if nodes_in_query:
         lines.append("- node_details:")

From 12f9dbdb4e0e99dd59d37f0846c2915f00072779 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 01:07:26 -0300
Subject: [PATCH 392/416] comms: roll atlasbot config

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 7ce144c6..1d89335a 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-86
+        checksum/atlasbot-configmap: manual-atlasbot-87
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From 845be4efbccd7a5cea1286525ae5c20ee335f2d5 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 01:35:34 -0300
Subject: [PATCH 393/416] maintenance: suspend ariadne migrate job

---
 services/maintenance/ariadne-migrate-job.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/services/maintenance/ariadne-migrate-job.yaml b/services/maintenance/ariadne-migrate-job.yaml
index b9b1496f..367a1a09 100644
--- a/services/maintenance/ariadne-migrate-job.yaml
+++ b/services/maintenance/ariadne-migrate-job.yaml
@@ -7,6 +7,7 @@ metadata:
   annotations:
     kustomize.toolkit.fluxcd.io/force: "true"
 spec:
+  suspend: true
   backoffLimit: 1
   ttlSecondsAfterFinished: 3600
   template:
@@ -15,6 +16,7 @@ spec:
         app: ariadne-migrate
       annotations:
         vault.hashicorp.com/agent-inject: "true"
+        vault.hashicorp.com/agent-pre-populate-only: "true"
         vault.hashicorp.com/role: "maintenance"
         vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db"
         vault.hashicorp.com/agent-inject-template-ariadne-env.sh: |

From bc5927020285ef319d2d563c2787bb7b0b1ded01 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 01:48:32 -0300
Subject: [PATCH 394/416] chore: organize one-off jobs

---
 .../kustomization.yaml                        |  2 +-
 services/bstein-dev-home/kustomization.yaml   |  2 +-
 .../migrations/kustomization.yaml             |  2 +-
 .../migrations/portal-migrate-job.yaml        |  7 ++++-
 .../portal-onboarding-e2e-test-job.yaml       |  7 ++++-
 services/comms/kustomization.yaml             | 20 ++++++-------
 .../{ => oneoffs}/bstein-force-leave-job.yaml |  7 ++++-
 .../comms-secrets-ensure-job.yaml             |  7 ++++-
 .../mas-admin-client-secret-ensure-job.yaml   |  7 ++++-
 .../{ => oneoffs}/mas-db-ensure-job.yaml      |  7 ++++-
 .../mas-local-users-ensure-job.yaml           |  7 ++++-
 .../othrys-kick-numeric-job.yaml              |  7 ++++-
 .../synapse-admin-ensure-job.yaml             |  7 ++++-
 .../synapse-seeder-admin-ensure-job.yaml      |  7 ++++-
 .../synapse-signingkey-ensure-job.yaml        |  7 ++++-
 .../{ => oneoffs}/synapse-user-seed-job.yaml  |  7 ++++-
 services/finance/kustomization.yaml           |  2 +-
 .../finance-secrets-ensure-job.yaml           |  7 ++++-
 services/keycloak/kustomization.yaml          | 30 +++++++++----------
 .../actual-oidc-secret-ensure-job.yaml        |  7 ++++-
 .../harbor-oidc-secret-ensure-job.yaml        |  7 ++++-
 .../{ => oneoffs}/ldap-federation-job.yaml    |  7 ++++-
 .../logs-oidc-secret-ensure-job.yaml          |  7 ++++-
 .../{ => oneoffs}/mas-secrets-ensure-job.yaml |  7 ++++-
 ...portal-admin-client-secret-ensure-job.yaml |  7 ++++-
 .../{ => oneoffs}/portal-e2e-client-job.yaml  |  7 ++++-
 ...al-e2e-execute-actions-email-test-job.yaml |  7 ++++-
 .../portal-e2e-target-client-job.yaml         |  7 ++++-
 ...al-e2e-token-exchange-permissions-job.yaml |  7 ++++-
 .../portal-e2e-token-exchange-test-job.yaml   |  7 ++++-
 .../{ => oneoffs}/realm-settings-job.yaml     |  7 ++++-
 .../synapse-oidc-secret-ensure-job.yaml       |  7 ++++-
 .../{ => oneoffs}/user-overrides-job.yaml     |  7 ++++-
 .../vault-oidc-secret-ensure-job.yaml         |  7 ++++-
 services/logging/kustomization.yaml           |  6 ++--
 .../opensearch-dashboards-setup-job.yaml      |  7 ++++-
 .../{ => oneoffs}/opensearch-ism-job.yaml     |  7 ++++-
 .../opensearch-observability-setup-job.yaml   |  7 ++++-
 services/mailu/kustomization.yaml             |  2 +-
 .../mailu/{ => oneoffs}/mailu-sync-job.yaml   |  7 ++++-
 services/maintenance/kustomization.yaml       |  4 +--
 .../{ => oneoffs}/ariadne-migrate-job.yaml    |  6 +++-
 .../k3s-traefik-cleanup-job.yaml              |  7 ++++-
 services/monitoring/kustomization.yaml        |  4 +--
 .../{ => oneoffs}/grafana-org-bootstrap.yaml  |  7 ++++-
 .../grafana-user-dedupe-job.yaml              |  7 ++++-
 46 files changed, 252 insertions(+), 73 deletions(-)
 rename services/bstein-dev-home/{ => oneoffs}/migrations/kustomization.yaml (66%)
 rename services/bstein-dev-home/{ => oneoffs}/migrations/portal-migrate-job.yaml (78%)
 rename services/bstein-dev-home/{ => oneoffs}/portal-onboarding-e2e-test-job.yaml (89%)
 rename services/comms/{ => oneoffs}/bstein-force-leave-job.yaml (96%)
 rename services/comms/{ => oneoffs}/comms-secrets-ensure-job.yaml (92%)
 rename services/comms/{ => oneoffs}/mas-admin-client-secret-ensure-job.yaml (90%)
 rename services/comms/{ => oneoffs}/mas-db-ensure-job.yaml (91%)
 rename services/comms/{ => oneoffs}/mas-local-users-ensure-job.yaml (97%)
 rename services/comms/{ => oneoffs}/othrys-kick-numeric-job.yaml (96%)
 rename services/comms/{ => oneoffs}/synapse-admin-ensure-job.yaml (96%)
 rename services/comms/{ => oneoffs}/synapse-seeder-admin-ensure-job.yaml (93%)
 rename services/comms/{ => oneoffs}/synapse-signingkey-ensure-job.yaml (88%)
 rename services/comms/{ => oneoffs}/synapse-user-seed-job.yaml (96%)
 rename services/finance/{ => oneoffs}/finance-secrets-ensure-job.yaml (83%)
 rename services/keycloak/{ => oneoffs}/actual-oidc-secret-ensure-job.yaml (83%)
 rename services/keycloak/{ => oneoffs}/harbor-oidc-secret-ensure-job.yaml (83%)
 rename services/keycloak/{ => oneoffs}/ldap-federation-job.yaml (98%)
 rename services/keycloak/{ => oneoffs}/logs-oidc-secret-ensure-job.yaml (94%)
 rename services/keycloak/{ => oneoffs}/mas-secrets-ensure-job.yaml (95%)
 rename services/keycloak/{ => oneoffs}/portal-admin-client-secret-ensure-job.yaml (96%)
 rename services/keycloak/{ => oneoffs}/portal-e2e-client-job.yaml (97%)
 rename services/keycloak/{ => oneoffs}/portal-e2e-execute-actions-email-test-job.yaml (89%)
 rename services/keycloak/{ => oneoffs}/portal-e2e-target-client-job.yaml (95%)
 rename services/keycloak/{ => oneoffs}/portal-e2e-token-exchange-permissions-job.yaml (97%)
 rename services/keycloak/{ => oneoffs}/portal-e2e-token-exchange-test-job.yaml (89%)
 rename services/keycloak/{ => oneoffs}/realm-settings-job.yaml (98%)
 rename services/keycloak/{ => oneoffs}/synapse-oidc-secret-ensure-job.yaml (92%)
 rename services/keycloak/{ => oneoffs}/user-overrides-job.yaml (96%)
 rename services/keycloak/{ => oneoffs}/vault-oidc-secret-ensure-job.yaml (83%)
 rename services/logging/{ => oneoffs}/opensearch-dashboards-setup-job.yaml (88%)
 rename services/logging/{ => oneoffs}/opensearch-ism-job.yaml (91%)
 rename services/logging/{ => oneoffs}/opensearch-observability-setup-job.yaml (76%)
 rename services/mailu/{ => oneoffs}/mailu-sync-job.yaml (93%)
 rename services/maintenance/{ => oneoffs}/ariadne-migrate-job.yaml (82%)
 rename services/maintenance/{ => oneoffs}/k3s-traefik-cleanup-job.yaml (77%)
 rename services/monitoring/{ => oneoffs}/grafana-org-bootstrap.yaml (93%)
 rename services/monitoring/{ => oneoffs}/grafana-user-dedupe-job.yaml (94%)

diff --git a/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml b/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml
index da61b2d1..ff97f73b 100644
--- a/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml
+++ b/clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml
@@ -6,7 +6,7 @@ metadata:
   namespace: flux-system
 spec:
   interval: 10m
-  path: ./services/bstein-dev-home/migrations
+  path: ./services/bstein-dev-home/oneoffs/migrations
   prune: true
   force: true
   sourceRef:
diff --git a/services/bstein-dev-home/kustomization.yaml b/services/bstein-dev-home/kustomization.yaml
index a8132417..f62fb171 100644
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@@ -16,7 +16,7 @@ resources:
   - backend-deployment.yaml
   - backend-service.yaml
   - vaultwarden-cred-sync-cronjob.yaml
-  - portal-onboarding-e2e-test-job.yaml
+  - oneoffs/portal-onboarding-e2e-test-job.yaml
   - ingress.yaml
 images:
   - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
diff --git a/services/bstein-dev-home/migrations/kustomization.yaml b/services/bstein-dev-home/oneoffs/migrations/kustomization.yaml
similarity index 66%
rename from services/bstein-dev-home/migrations/kustomization.yaml
rename to services/bstein-dev-home/oneoffs/migrations/kustomization.yaml
index 067665bc..1d1dfc82 100644
--- a/services/bstein-dev-home/migrations/kustomization.yaml
+++ b/services/bstein-dev-home/oneoffs/migrations/kustomization.yaml
@@ -1,4 +1,4 @@
-# services/bstein-dev-home/migrations/kustomization.yaml
+# services/bstein-dev-home/oneoffs/migrations/kustomization.yaml
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 namespace: bstein-dev-home
diff --git a/services/bstein-dev-home/migrations/portal-migrate-job.yaml b/services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml
similarity index 78%
rename from services/bstein-dev-home/migrations/portal-migrate-job.yaml
rename to services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml
index 9d052546..1f7e092b 100644
--- a/services/bstein-dev-home/migrations/portal-migrate-job.yaml
+++ b/services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml
@@ -1,4 +1,8 @@
-# services/bstein-dev-home/migrations/portal-migrate-job.yaml
+# services/bstein-dev-home/oneoffs/migrations/portal-migrate-job.yaml
+# One-off job for bstein-dev-home/bstein-dev-home-portal-migrate-36.
+# Purpose: bstein dev home portal migrate 36 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
@@ -7,6 +11,7 @@ metadata:
   annotations:
     kustomize.toolkit.fluxcd.io/force: "true"
 spec:
+  suspend: true
   backoffLimit: 1
   ttlSecondsAfterFinished: 3600
   template:
diff --git a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml b/services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml
similarity index 89%
rename from services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
rename to services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml
index 681e89d2..9923499b 100644
--- a/services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
+++ b/services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml
@@ -1,10 +1,15 @@
-# services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
+# services/bstein-dev-home/oneoffs/portal-onboarding-e2e-test-job.yaml
+# One-off job for bstein-dev-home/portal-onboarding-e2e-test-27.
+# Purpose: portal onboarding e2e test 27 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: portal-onboarding-e2e-test-27
   namespace: bstein-dev-home
 spec:
+  suspend: true
   backoffLimit: 0
   template:
     metadata:
diff --git a/services/comms/kustomization.yaml b/services/comms/kustomization.yaml
index 01d7be5c..969ca586 100644
--- a/services/comms/kustomization.yaml
+++ b/services/comms/kustomization.yaml
@@ -22,24 +22,24 @@ resources:
   - mas-db-ensure-rbac.yaml
   - synapse-signingkey-ensure-rbac.yaml
   - vault-sync-deployment.yaml
-  - mas-admin-client-secret-ensure-job.yaml
-  - mas-db-ensure-job.yaml
-  - comms-secrets-ensure-job.yaml
-  - synapse-admin-ensure-job.yaml
-  - synapse-signingkey-ensure-job.yaml
-  - synapse-seeder-admin-ensure-job.yaml
-  - synapse-user-seed-job.yaml
-  - mas-local-users-ensure-job.yaml
+  - oneoffs/mas-admin-client-secret-ensure-job.yaml
+  - oneoffs/mas-db-ensure-job.yaml
+  - oneoffs/comms-secrets-ensure-job.yaml
+  - oneoffs/synapse-admin-ensure-job.yaml
+  - oneoffs/synapse-signingkey-ensure-job.yaml
+  - oneoffs/synapse-seeder-admin-ensure-job.yaml
+  - oneoffs/synapse-user-seed-job.yaml
+  - oneoffs/mas-local-users-ensure-job.yaml
   - mas-deployment.yaml
   - livekit-token-deployment.yaml
   - livekit.yaml
   - coturn.yaml
   - seed-othrys-room.yaml
   - guest-name-job.yaml
-  - othrys-kick-numeric-job.yaml
+  - oneoffs/othrys-kick-numeric-job.yaml
   - pin-othrys-job.yaml
   - reset-othrys-room-job.yaml
-  - bstein-force-leave-job.yaml
+  - oneoffs/bstein-force-leave-job.yaml
   - livekit-ingress.yaml
   - livekit-middlewares.yaml
   - matrix-ingress.yaml
diff --git a/services/comms/bstein-force-leave-job.yaml b/services/comms/oneoffs/bstein-force-leave-job.yaml
similarity index 96%
rename from services/comms/bstein-force-leave-job.yaml
rename to services/comms/oneoffs/bstein-force-leave-job.yaml
index 0286f8c8..7efe826e 100644
--- a/services/comms/bstein-force-leave-job.yaml
+++ b/services/comms/oneoffs/bstein-force-leave-job.yaml
@@ -1,10 +1,15 @@
-# services/comms/bstein-force-leave-job.yaml
+# services/comms/oneoffs/bstein-force-leave-job.yaml
+# One-off job for comms/bstein-leave-rooms-12.
+# Purpose: bstein leave rooms 12 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: bstein-leave-rooms-12
   namespace: comms
 spec:
+  suspend: true
   backoffLimit: 0
   template:
     metadata:
diff --git a/services/comms/comms-secrets-ensure-job.yaml b/services/comms/oneoffs/comms-secrets-ensure-job.yaml
similarity index 92%
rename from services/comms/comms-secrets-ensure-job.yaml
rename to services/comms/oneoffs/comms-secrets-ensure-job.yaml
index 52904cc9..35ca73c5 100644
--- a/services/comms/comms-secrets-ensure-job.yaml
+++ b/services/comms/oneoffs/comms-secrets-ensure-job.yaml
@@ -1,10 +1,15 @@
-# services/comms/comms-secrets-ensure-job.yaml
+# services/comms/oneoffs/comms-secrets-ensure-job.yaml
+# One-off job for comms/comms-secrets-ensure-7.
+# Purpose: comms secrets ensure 7 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: comms-secrets-ensure-7
   namespace: comms
 spec:
+  suspend: true
   backoffLimit: 1
   ttlSecondsAfterFinished: 3600
   template:
diff --git a/services/comms/mas-admin-client-secret-ensure-job.yaml b/services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml
similarity index 90%
rename from services/comms/mas-admin-client-secret-ensure-job.yaml
rename to services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml
index 7b05ccae..e1d54589 100644
--- a/services/comms/mas-admin-client-secret-ensure-job.yaml
+++ b/services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml
@@ -1,4 +1,8 @@
-# services/comms/mas-admin-client-secret-ensure-job.yaml
+# services/comms/oneoffs/mas-admin-client-secret-ensure-job.yaml
+# One-off job for comms/mas-admin-client-secret-writer.
+# Purpose: mas admin client secret writer (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: v1
 kind: ServiceAccount
 metadata:
@@ -41,6 +45,7 @@ metadata:
   name: mas-admin-client-secret-ensure-11
   namespace: comms
 spec:
+  suspend: true
   backoffLimit: 2
   template:
     spec:
diff --git a/services/comms/mas-db-ensure-job.yaml b/services/comms/oneoffs/mas-db-ensure-job.yaml
similarity index 91%
rename from services/comms/mas-db-ensure-job.yaml
rename to services/comms/oneoffs/mas-db-ensure-job.yaml
index 56707a9b..44137da8 100644
--- a/services/comms/mas-db-ensure-job.yaml
+++ b/services/comms/oneoffs/mas-db-ensure-job.yaml
@@ -1,10 +1,15 @@
-# services/comms/mas-db-ensure-job.yaml
+# services/comms/oneoffs/mas-db-ensure-job.yaml
+# One-off job for comms/mas-db-ensure-22.
+# Purpose: mas db ensure 22 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: mas-db-ensure-22
   namespace: comms
 spec:
+  suspend: true
   backoffLimit: 1
   ttlSecondsAfterFinished: 600
   template:
diff --git a/services/comms/mas-local-users-ensure-job.yaml b/services/comms/oneoffs/mas-local-users-ensure-job.yaml
similarity index 97%
rename from services/comms/mas-local-users-ensure-job.yaml
rename to services/comms/oneoffs/mas-local-users-ensure-job.yaml
index 636ee5bb..7b510727 100644
--- a/services/comms/mas-local-users-ensure-job.yaml
+++ b/services/comms/oneoffs/mas-local-users-ensure-job.yaml
@@ -1,10 +1,15 @@
-# services/comms/mas-local-users-ensure-job.yaml
+# services/comms/oneoffs/mas-local-users-ensure-job.yaml
+# One-off job for comms/mas-local-users-ensure-18.
+# Purpose: mas local users ensure 18 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: mas-local-users-ensure-18
   namespace: comms
 spec:
+  suspend: true
   backoffLimit: 1
   ttlSecondsAfterFinished: 3600
   template:
diff --git a/services/comms/othrys-kick-numeric-job.yaml b/services/comms/oneoffs/othrys-kick-numeric-job.yaml
similarity index 96%
rename from services/comms/othrys-kick-numeric-job.yaml
rename to services/comms/oneoffs/othrys-kick-numeric-job.yaml
index 0d3914a5..e38a6bb6 100644
--- a/services/comms/othrys-kick-numeric-job.yaml
+++ b/services/comms/oneoffs/othrys-kick-numeric-job.yaml
@@ -1,10 +1,15 @@
-# services/comms/othrys-kick-numeric-job.yaml
+# services/comms/oneoffs/othrys-kick-numeric-job.yaml
+# One-off job for comms/othrys-kick-numeric-8.
+# Purpose: othrys kick numeric 8 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: othrys-kick-numeric-8
   namespace: comms
 spec:
+  suspend: true
   backoffLimit: 0
   template:
     metadata:
diff --git a/services/comms/synapse-admin-ensure-job.yaml b/services/comms/oneoffs/synapse-admin-ensure-job.yaml
similarity index 96%
rename from services/comms/synapse-admin-ensure-job.yaml
rename to services/comms/oneoffs/synapse-admin-ensure-job.yaml
index 5ddf60c4..95bc9f2a 100644
--- a/services/comms/synapse-admin-ensure-job.yaml
+++ b/services/comms/oneoffs/synapse-admin-ensure-job.yaml
@@ -1,10 +1,15 @@
-# services/comms/synapse-admin-ensure-job.yaml
+# services/comms/oneoffs/synapse-admin-ensure-job.yaml
+# One-off job for comms/synapse-admin-ensure-3.
+# Purpose: synapse admin ensure 3 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: synapse-admin-ensure-3
   namespace: comms
 spec:
+  suspend: true
   backoffLimit: 0
   ttlSecondsAfterFinished: 3600
   template:
diff --git a/services/comms/synapse-seeder-admin-ensure-job.yaml b/services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml
similarity index 93%
rename from services/comms/synapse-seeder-admin-ensure-job.yaml
rename to services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml
index 5d2d4225..1d8972e8 100644
--- a/services/comms/synapse-seeder-admin-ensure-job.yaml
+++ b/services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml
@@ -1,10 +1,15 @@
-# services/comms/synapse-seeder-admin-ensure-job.yaml
+# services/comms/oneoffs/synapse-seeder-admin-ensure-job.yaml
+# One-off job for comms/synapse-seeder-admin-ensure-9.
+# Purpose: synapse seeder admin ensure 9 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: synapse-seeder-admin-ensure-9
   namespace: comms
 spec:
+  suspend: true
   backoffLimit: 2
   template:
     metadata:
diff --git a/services/comms/synapse-signingkey-ensure-job.yaml b/services/comms/oneoffs/synapse-signingkey-ensure-job.yaml
similarity index 88%
rename from services/comms/synapse-signingkey-ensure-job.yaml
rename to services/comms/oneoffs/synapse-signingkey-ensure-job.yaml
index 402a820a..bbc4595b 100644
--- a/services/comms/synapse-signingkey-ensure-job.yaml
+++ b/services/comms/oneoffs/synapse-signingkey-ensure-job.yaml
@@ -1,10 +1,15 @@
-# services/comms/synapse-signingkey-ensure-job.yaml
+# services/comms/oneoffs/synapse-signingkey-ensure-job.yaml
+# One-off job for comms/othrys-synapse-signingkey-ensure-7.
+# Purpose: othrys synapse signingkey ensure 7 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: othrys-synapse-signingkey-ensure-7
   namespace: comms
 spec:
+  suspend: true
   backoffLimit: 2
   template:
     spec:
diff --git a/services/comms/synapse-user-seed-job.yaml b/services/comms/oneoffs/synapse-user-seed-job.yaml
similarity index 96%
rename from services/comms/synapse-user-seed-job.yaml
rename to services/comms/oneoffs/synapse-user-seed-job.yaml
index aab88c3b..a732739a 100644
--- a/services/comms/synapse-user-seed-job.yaml
+++ b/services/comms/oneoffs/synapse-user-seed-job.yaml
@@ -1,10 +1,15 @@
-# services/comms/synapse-user-seed-job.yaml
+# services/comms/oneoffs/synapse-user-seed-job.yaml
+# One-off job for comms/synapse-user-seed-8.
+# Purpose: synapse user seed 8 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: synapse-user-seed-8
   namespace: comms
 spec:
+  suspend: true
   backoffLimit: 1
   ttlSecondsAfterFinished: 3600
   template:
diff --git a/services/finance/kustomization.yaml b/services/finance/kustomization.yaml
index e4c414f5..1559f5c8 100644
--- a/services/finance/kustomization.yaml
+++ b/services/finance/kustomization.yaml
@@ -9,7 +9,7 @@ resources:
   - finance-secrets-ensure-rbac.yaml
   - actual-budget-data-pvc.yaml
   - firefly-storage-pvc.yaml
-  - finance-secrets-ensure-job.yaml
+  - oneoffs/finance-secrets-ensure-job.yaml
   - actual-budget-deployment.yaml
   - firefly-deployment.yaml
   - firefly-user-sync-cronjob.yaml
diff --git a/services/finance/finance-secrets-ensure-job.yaml b/services/finance/oneoffs/finance-secrets-ensure-job.yaml
similarity index 83%
rename from services/finance/finance-secrets-ensure-job.yaml
rename to services/finance/oneoffs/finance-secrets-ensure-job.yaml
index 67f06cb5..e8c8f588 100644
--- a/services/finance/finance-secrets-ensure-job.yaml
+++ b/services/finance/oneoffs/finance-secrets-ensure-job.yaml
@@ -1,10 +1,15 @@
-# services/finance/finance-secrets-ensure-job.yaml
+# services/finance/oneoffs/finance-secrets-ensure-job.yaml
+# One-off job for finance/finance-secrets-ensure-5.
+# Purpose: finance secrets ensure 5 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: finance-secrets-ensure-5
   namespace: finance
 spec:
+  suspend: true
   backoffLimit: 1
   ttlSecondsAfterFinished: 3600
   template:
diff --git a/services/keycloak/kustomization.yaml b/services/keycloak/kustomization.yaml
index 6030a821..60278910 100644
--- a/services/keycloak/kustomization.yaml
+++ b/services/keycloak/kustomization.yaml
@@ -10,21 +10,21 @@ resources:
   - secretproviderclass.yaml
   - vault-sync-deployment.yaml
   - deployment.yaml
-  - realm-settings-job.yaml
-  - portal-admin-client-secret-ensure-job.yaml
-  - portal-e2e-client-job.yaml
-  - portal-e2e-target-client-job.yaml
-  - portal-e2e-token-exchange-permissions-job.yaml
-  - portal-e2e-token-exchange-test-job.yaml
-  - portal-e2e-execute-actions-email-test-job.yaml
-  - ldap-federation-job.yaml
-  - user-overrides-job.yaml
-  - mas-secrets-ensure-job.yaml
-  - synapse-oidc-secret-ensure-job.yaml
-  - logs-oidc-secret-ensure-job.yaml
-  - harbor-oidc-secret-ensure-job.yaml
-  - vault-oidc-secret-ensure-job.yaml
-  - actual-oidc-secret-ensure-job.yaml
+  - oneoffs/realm-settings-job.yaml
+  - oneoffs/portal-admin-client-secret-ensure-job.yaml
+  - oneoffs/portal-e2e-client-job.yaml
+  - oneoffs/portal-e2e-target-client-job.yaml
+  - oneoffs/portal-e2e-token-exchange-permissions-job.yaml
+  - oneoffs/portal-e2e-token-exchange-test-job.yaml
+  - oneoffs/portal-e2e-execute-actions-email-test-job.yaml
+  - oneoffs/ldap-federation-job.yaml
+  - oneoffs/user-overrides-job.yaml
+  - oneoffs/mas-secrets-ensure-job.yaml
+  - oneoffs/synapse-oidc-secret-ensure-job.yaml
+  - oneoffs/logs-oidc-secret-ensure-job.yaml
+  - oneoffs/harbor-oidc-secret-ensure-job.yaml
+  - oneoffs/vault-oidc-secret-ensure-job.yaml
+  - oneoffs/actual-oidc-secret-ensure-job.yaml
   - service.yaml
   - ingress.yaml
 generatorOptions:
diff --git a/services/keycloak/actual-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml
similarity index 83%
rename from services/keycloak/actual-oidc-secret-ensure-job.yaml
rename to services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml
index 3dadb520..d4da1f1f 100644
--- a/services/keycloak/actual-oidc-secret-ensure-job.yaml
+++ b/services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml
@@ -1,10 +1,15 @@
-# services/keycloak/actual-oidc-secret-ensure-job.yaml
+# services/keycloak/oneoffs/actual-oidc-secret-ensure-job.yaml
+# One-off job for sso/actual-oidc-secret-ensure-3.
+# Purpose: actual oidc secret ensure 3 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: actual-oidc-secret-ensure-3
   namespace: sso
 spec:
+  suspend: true
   backoffLimit: 0
   ttlSecondsAfterFinished: 3600
   template:
diff --git a/services/keycloak/harbor-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml
similarity index 83%
rename from services/keycloak/harbor-oidc-secret-ensure-job.yaml
rename to services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml
index 87de4632..c368241b 100644
--- a/services/keycloak/harbor-oidc-secret-ensure-job.yaml
+++ b/services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml
@@ -1,10 +1,15 @@
-# services/keycloak/harbor-oidc-secret-ensure-job.yaml
+# services/keycloak/oneoffs/harbor-oidc-secret-ensure-job.yaml
+# One-off job for sso/harbor-oidc-secret-ensure-10.
+# Purpose: harbor oidc secret ensure 10 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: harbor-oidc-secret-ensure-10
   namespace: sso
 spec:
+  suspend: true
   backoffLimit: 0
   ttlSecondsAfterFinished: 3600
   template:
diff --git a/services/keycloak/ldap-federation-job.yaml b/services/keycloak/oneoffs/ldap-federation-job.yaml
similarity index 98%
rename from services/keycloak/ldap-federation-job.yaml
rename to services/keycloak/oneoffs/ldap-federation-job.yaml
index 3c3f1c19..9e9a5f9b 100644
--- a/services/keycloak/ldap-federation-job.yaml
+++ b/services/keycloak/oneoffs/ldap-federation-job.yaml
@@ -1,10 +1,15 @@
-# services/keycloak/ldap-federation-job.yaml
+# services/keycloak/oneoffs/ldap-federation-job.yaml
+# One-off job for sso/keycloak-ldap-federation-12.
+# Purpose: keycloak ldap federation 12 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: keycloak-ldap-federation-12
   namespace: sso
 spec:
+  suspend: true
   backoffLimit: 2
   template:
     metadata:
diff --git a/services/keycloak/logs-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml
similarity index 94%
rename from services/keycloak/logs-oidc-secret-ensure-job.yaml
rename to services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml
index 14e80df5..bce9e5b4 100644
--- a/services/keycloak/logs-oidc-secret-ensure-job.yaml
+++ b/services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml
@@ -1,10 +1,15 @@
-# services/keycloak/logs-oidc-secret-ensure-job.yaml
+# services/keycloak/oneoffs/logs-oidc-secret-ensure-job.yaml
+# One-off job for sso/logs-oidc-secret-ensure-10.
+# Purpose: logs oidc secret ensure 10 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: logs-oidc-secret-ensure-10
   namespace: sso
 spec:
+  suspend: true
   backoffLimit: 0
   ttlSecondsAfterFinished: 3600
   template:
diff --git a/services/keycloak/mas-secrets-ensure-job.yaml b/services/keycloak/oneoffs/mas-secrets-ensure-job.yaml
similarity index 95%
rename from services/keycloak/mas-secrets-ensure-job.yaml
rename to services/keycloak/oneoffs/mas-secrets-ensure-job.yaml
index 24c9e048..c3bd1be0 100644
--- a/services/keycloak/mas-secrets-ensure-job.yaml
+++ b/services/keycloak/oneoffs/mas-secrets-ensure-job.yaml
@@ -1,4 +1,8 @@
-# services/keycloak/mas-secrets-ensure-job.yaml
+# services/keycloak/oneoffs/mas-secrets-ensure-job.yaml
+# One-off job for sso/mas-secrets-ensure-21.
+# Purpose: mas secrets ensure 21 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: v1
 kind: ServiceAccount
 metadata:
@@ -13,6 +17,7 @@ metadata:
   name: mas-secrets-ensure-21
   namespace: sso
 spec:
+  suspend: true
   backoffLimit: 0
   ttlSecondsAfterFinished: 3600
   template:
diff --git a/services/keycloak/portal-admin-client-secret-ensure-job.yaml b/services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml
similarity index 96%
rename from services/keycloak/portal-admin-client-secret-ensure-job.yaml
rename to services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml
index 90dd4b71..1d3e7f37 100644
--- a/services/keycloak/portal-admin-client-secret-ensure-job.yaml
+++ b/services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml
@@ -1,10 +1,15 @@
-# services/keycloak/portal-admin-client-secret-ensure-job.yaml
+# services/keycloak/oneoffs/portal-admin-client-secret-ensure-job.yaml
+# One-off job for sso/keycloak-portal-admin-secret-ensure-4.
+# Purpose: keycloak portal admin secret ensure 4 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: keycloak-portal-admin-secret-ensure-4
   namespace: sso
 spec:
+  suspend: true
   backoffLimit: 0
   template:
     metadata:
diff --git a/services/keycloak/portal-e2e-client-job.yaml b/services/keycloak/oneoffs/portal-e2e-client-job.yaml
similarity index 97%
rename from services/keycloak/portal-e2e-client-job.yaml
rename to services/keycloak/oneoffs/portal-e2e-client-job.yaml
index 4e0c0062..274dd27b 100644
--- a/services/keycloak/portal-e2e-client-job.yaml
+++ b/services/keycloak/oneoffs/portal-e2e-client-job.yaml
@@ -1,10 +1,15 @@
-# services/keycloak/portal-e2e-client-job.yaml
+# services/keycloak/oneoffs/portal-e2e-client-job.yaml
+# One-off job for sso/keycloak-portal-e2e-client-8.
+# Purpose: keycloak portal e2e client 8 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: keycloak-portal-e2e-client-8
   namespace: sso
 spec:
+  suspend: true
   backoffLimit: 0
   template:
     metadata:
diff --git a/services/keycloak/portal-e2e-execute-actions-email-test-job.yaml b/services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml
similarity index 89%
rename from services/keycloak/portal-e2e-execute-actions-email-test-job.yaml
rename to services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml
index 35f79a6b..518d839c 100644
--- a/services/keycloak/portal-e2e-execute-actions-email-test-job.yaml
+++ b/services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml
@@ -1,10 +1,15 @@
-# services/keycloak/portal-e2e-execute-actions-email-test-job.yaml
+# services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml
+# One-off job for sso/keycloak-portal-e2e-execute-actions-email-14.
+# Purpose: keycloak portal e2e execute actions email 14 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: keycloak-portal-e2e-execute-actions-email-14
   namespace: sso
 spec:
+  suspend: true
   backoffLimit: 3
   template:
     metadata:
diff --git a/services/keycloak/portal-e2e-target-client-job.yaml b/services/keycloak/oneoffs/portal-e2e-target-client-job.yaml
similarity index 95%
rename from services/keycloak/portal-e2e-target-client-job.yaml
rename to services/keycloak/oneoffs/portal-e2e-target-client-job.yaml
index 196b48bd..900d0290 100644
--- a/services/keycloak/portal-e2e-target-client-job.yaml
+++ b/services/keycloak/oneoffs/portal-e2e-target-client-job.yaml
@@ -1,10 +1,15 @@
-# services/keycloak/portal-e2e-target-client-job.yaml
+# services/keycloak/oneoffs/portal-e2e-target-client-job.yaml
+# One-off job for sso/keycloak-portal-e2e-target-7.
+# Purpose: keycloak portal e2e target 7 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: keycloak-portal-e2e-target-7
   namespace: sso
 spec:
+  suspend: true
   backoffLimit: 0
   template:
     metadata:
diff --git a/services/keycloak/portal-e2e-token-exchange-permissions-job.yaml b/services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml
similarity index 97%
rename from services/keycloak/portal-e2e-token-exchange-permissions-job.yaml
rename to services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml
index 647b8f9b..0d41b476 100644
--- a/services/keycloak/portal-e2e-token-exchange-permissions-job.yaml
+++ b/services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml
@@ -1,10 +1,15 @@
-# services/keycloak/portal-e2e-token-exchange-permissions-job.yaml
+# services/keycloak/oneoffs/portal-e2e-token-exchange-permissions-job.yaml
+# One-off job for sso/keycloak-portal-e2e-token-exchange-permissions-11.
+# Purpose: keycloak portal e2e token exchange permissions 11 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: keycloak-portal-e2e-token-exchange-permissions-11
   namespace: sso
 spec:
+  suspend: true
   backoffLimit: 6
   template:
     metadata:
diff --git a/services/keycloak/portal-e2e-token-exchange-test-job.yaml b/services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml
similarity index 89%
rename from services/keycloak/portal-e2e-token-exchange-test-job.yaml
rename to services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml
index edd7555e..eb05e09c 100644
--- a/services/keycloak/portal-e2e-token-exchange-test-job.yaml
+++ b/services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml
@@ -1,10 +1,15 @@
-# services/keycloak/portal-e2e-token-exchange-test-job.yaml
+# services/keycloak/oneoffs/portal-e2e-token-exchange-test-job.yaml
+# One-off job for sso/keycloak-portal-e2e-token-exchange-test-7.
+# Purpose: keycloak portal e2e token exchange test 7 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: keycloak-portal-e2e-token-exchange-test-7
   namespace: sso
 spec:
+  suspend: true
   backoffLimit: 6
   ttlSecondsAfterFinished: 3600
   template:
diff --git a/services/keycloak/realm-settings-job.yaml b/services/keycloak/oneoffs/realm-settings-job.yaml
similarity index 98%
rename from services/keycloak/realm-settings-job.yaml
rename to services/keycloak/oneoffs/realm-settings-job.yaml
index 9265ca3e..ea88d83f 100644
--- a/services/keycloak/realm-settings-job.yaml
+++ b/services/keycloak/oneoffs/realm-settings-job.yaml
@@ -1,10 +1,15 @@
-# services/keycloak/realm-settings-job.yaml
+# services/keycloak/oneoffs/realm-settings-job.yaml
+# One-off job for sso/keycloak-realm-settings-36.
+# Purpose: keycloak realm settings 36 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: keycloak-realm-settings-36
   namespace: sso
 spec:
+  suspend: true
   backoffLimit: 0
   template:
     metadata:
diff --git a/services/keycloak/synapse-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml
similarity index 92%
rename from services/keycloak/synapse-oidc-secret-ensure-job.yaml
rename to services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml
index e808e7e0..15b7a312 100644
--- a/services/keycloak/synapse-oidc-secret-ensure-job.yaml
+++ b/services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml
@@ -1,10 +1,15 @@
-# services/keycloak/synapse-oidc-secret-ensure-job.yaml
+# services/keycloak/oneoffs/synapse-oidc-secret-ensure-job.yaml
+# One-off job for sso/synapse-oidc-secret-ensure-10.
+# Purpose: synapse oidc secret ensure 10 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: synapse-oidc-secret-ensure-10
   namespace: sso
 spec:
+  suspend: true
   backoffLimit: 0
   ttlSecondsAfterFinished: 3600
   template:
diff --git a/services/keycloak/user-overrides-job.yaml b/services/keycloak/oneoffs/user-overrides-job.yaml
similarity index 96%
rename from services/keycloak/user-overrides-job.yaml
rename to services/keycloak/oneoffs/user-overrides-job.yaml
index 7623c843..0d52d6d3 100644
--- a/services/keycloak/user-overrides-job.yaml
+++ b/services/keycloak/oneoffs/user-overrides-job.yaml
@@ -1,10 +1,15 @@
-# services/keycloak/user-overrides-job.yaml
+# services/keycloak/oneoffs/user-overrides-job.yaml
+# One-off job for sso/keycloak-user-overrides-9.
+# Purpose: keycloak user overrides 9 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: keycloak-user-overrides-9
   namespace: sso
 spec:
+  suspend: true
   backoffLimit: 0
   template:
     metadata:
diff --git a/services/keycloak/vault-oidc-secret-ensure-job.yaml b/services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml
similarity index 83%
rename from services/keycloak/vault-oidc-secret-ensure-job.yaml
rename to services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml
index 3aa3ca55..a76c52e9 100644
--- a/services/keycloak/vault-oidc-secret-ensure-job.yaml
+++ b/services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml
@@ -1,10 +1,15 @@
-# services/keycloak/vault-oidc-secret-ensure-job.yaml
+# services/keycloak/oneoffs/vault-oidc-secret-ensure-job.yaml
+# One-off job for sso/vault-oidc-secret-ensure-8.
+# Purpose: vault oidc secret ensure 8 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: vault-oidc-secret-ensure-8
   namespace: sso
 spec:
+  suspend: true
   backoffLimit: 0
   ttlSecondsAfterFinished: 3600
   template:
diff --git a/services/logging/kustomization.yaml b/services/logging/kustomization.yaml
index 08c73a8d..dc487155 100644
--- a/services/logging/kustomization.yaml
+++ b/services/logging/kustomization.yaml
@@ -15,9 +15,9 @@ resources:
   - opensearch-dashboards-helmrelease.yaml
   - data-prepper-helmrelease.yaml
   - otel-collector-helmrelease.yaml
-  - opensearch-ism-job.yaml
-  - opensearch-dashboards-setup-job.yaml
-  - opensearch-observability-setup-job.yaml
+  - oneoffs/opensearch-ism-job.yaml
+  - oneoffs/opensearch-dashboards-setup-job.yaml
+  - oneoffs/opensearch-observability-setup-job.yaml
   - opensearch-prune-cronjob.yaml
   - fluent-bit-helmrelease.yaml
   - node-log-rotation-daemonset.yaml
diff --git a/services/logging/opensearch-dashboards-setup-job.yaml b/services/logging/oneoffs/opensearch-dashboards-setup-job.yaml
similarity index 88%
rename from services/logging/opensearch-dashboards-setup-job.yaml
rename to services/logging/oneoffs/opensearch-dashboards-setup-job.yaml
index 06149d79..1d1a9b68 100644
--- a/services/logging/opensearch-dashboards-setup-job.yaml
+++ b/services/logging/oneoffs/opensearch-dashboards-setup-job.yaml
@@ -1,10 +1,15 @@
-# services/logging/opensearch-dashboards-setup-job.yaml
+# services/logging/oneoffs/opensearch-dashboards-setup-job.yaml
+# One-off job for logging/opensearch-dashboards-setup-4.
+# Purpose: opensearch dashboards setup 4 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: opensearch-dashboards-setup-4
   namespace: logging
 spec:
+  suspend: true
   backoffLimit: 3
   ttlSecondsAfterFinished: 3600
   template:
diff --git a/services/logging/opensearch-ism-job.yaml b/services/logging/oneoffs/opensearch-ism-job.yaml
similarity index 91%
rename from services/logging/opensearch-ism-job.yaml
rename to services/logging/oneoffs/opensearch-ism-job.yaml
index 3313571b..476bca7a 100644
--- a/services/logging/opensearch-ism-job.yaml
+++ b/services/logging/oneoffs/opensearch-ism-job.yaml
@@ -1,10 +1,15 @@
-# services/logging/opensearch-ism-job.yaml
+# services/logging/oneoffs/opensearch-ism-job.yaml
+# One-off job for logging/opensearch-ism-setup-5.
+# Purpose: opensearch ism setup 5 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: opensearch-ism-setup-5
   namespace: logging
 spec:
+  suspend: true
   backoffLimit: 3
   ttlSecondsAfterFinished: 3600
   template:
diff --git a/services/logging/opensearch-observability-setup-job.yaml b/services/logging/oneoffs/opensearch-observability-setup-job.yaml
similarity index 76%
rename from services/logging/opensearch-observability-setup-job.yaml
rename to services/logging/oneoffs/opensearch-observability-setup-job.yaml
index e4590fb5..6caa0765 100644
--- a/services/logging/opensearch-observability-setup-job.yaml
+++ b/services/logging/oneoffs/opensearch-observability-setup-job.yaml
@@ -1,10 +1,15 @@
-# services/logging/opensearch-observability-setup-job.yaml
+# services/logging/oneoffs/opensearch-observability-setup-job.yaml
+# One-off job for logging/opensearch-observability-setup-2.
+# Purpose: opensearch observability setup 2 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: opensearch-observability-setup-2
   namespace: logging
 spec:
+  suspend: true
   backoffLimit: 3
   ttlSecondsAfterFinished: 3600
   template:
diff --git a/services/mailu/kustomization.yaml b/services/mailu/kustomization.yaml
index 7447f24a..3e0494ee 100644
--- a/services/mailu/kustomization.yaml
+++ b/services/mailu/kustomization.yaml
@@ -13,7 +13,7 @@ resources:
   - unbound-configmap.yaml
   - serverstransport.yaml
   - ingressroute.yaml
-  - mailu-sync-job.yaml
+  - oneoffs/mailu-sync-job.yaml
   - mailu-sync-cronjob.yaml
   - front-lb.yaml
 
diff --git a/services/mailu/mailu-sync-job.yaml b/services/mailu/oneoffs/mailu-sync-job.yaml
similarity index 93%
rename from services/mailu/mailu-sync-job.yaml
rename to services/mailu/oneoffs/mailu-sync-job.yaml
index 8589e9ee..38648acc 100644
--- a/services/mailu/mailu-sync-job.yaml
+++ b/services/mailu/oneoffs/mailu-sync-job.yaml
@@ -1,10 +1,15 @@
-# services/mailu/mailu-sync-job.yaml
+# services/mailu/oneoffs/mailu-sync-job.yaml
+# One-off job for mailu-mailserver/mailu-sync-9.
+# Purpose: mailu sync 9 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: mailu-sync-9
   namespace: mailu-mailserver
 spec:
+  suspend: true
   template:
     metadata:
       annotations:
diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml
index a1ca5831..19b2ba98 100644
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@@ -14,10 +14,10 @@ resources:
   - node-nofile-serviceaccount.yaml
   - pod-cleaner-rbac.yaml
   - ariadne-deployment.yaml
-  - ariadne-migrate-job.yaml
+  - oneoffs/ariadne-migrate-job.yaml
   - ariadne-service.yaml
   - disable-k3s-traefik-daemonset.yaml
-  - k3s-traefik-cleanup-job.yaml
+  - oneoffs/k3s-traefik-cleanup-job.yaml
   - node-nofile-daemonset.yaml
   - k3s-agent-restart-daemonset.yaml
   - pod-cleaner-cronjob.yaml
diff --git a/services/maintenance/ariadne-migrate-job.yaml b/services/maintenance/oneoffs/ariadne-migrate-job.yaml
similarity index 82%
rename from services/maintenance/ariadne-migrate-job.yaml
rename to services/maintenance/oneoffs/ariadne-migrate-job.yaml
index 367a1a09..ecac68d4 100644
--- a/services/maintenance/ariadne-migrate-job.yaml
+++ b/services/maintenance/oneoffs/ariadne-migrate-job.yaml
@@ -1,4 +1,8 @@
-# services/maintenance/ariadne-migrate-job.yaml
+# services/maintenance/oneoffs/ariadne-migrate-job.yaml
+# One-off job for maintenance/ariadne-migrate-2.
+# Purpose: ariadne migrate 2 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
diff --git a/services/maintenance/k3s-traefik-cleanup-job.yaml b/services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml
similarity index 77%
rename from services/maintenance/k3s-traefik-cleanup-job.yaml
rename to services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml
index d5d12a65..2c365a95 100644
--- a/services/maintenance/k3s-traefik-cleanup-job.yaml
+++ b/services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml
@@ -1,10 +1,15 @@
-# services/maintenance/k3s-traefik-cleanup-job.yaml
+# services/maintenance/oneoffs/k3s-traefik-cleanup-job.yaml
+# One-off job for maintenance/k3s-traefik-cleanup-2.
+# Purpose: k3s traefik cleanup 2 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: k3s-traefik-cleanup-2
   namespace: maintenance
 spec:
+  suspend: true
   backoffLimit: 1
   template:
     spec:
diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml
index 59530390..23c1595a 100644
--- a/services/monitoring/kustomization.yaml
+++ b/services/monitoring/kustomization.yaml
@@ -23,8 +23,8 @@ resources:
   - grafana-alerting-config.yaml
   - grafana-folders.yaml
   - helmrelease.yaml
-  - grafana-org-bootstrap.yaml
-  - grafana-user-dedupe-job.yaml
+  - oneoffs/grafana-org-bootstrap.yaml
+  - oneoffs/grafana-user-dedupe-job.yaml
 
 configMapGenerator:
   - name: postmark-exporter-script
diff --git a/services/monitoring/grafana-org-bootstrap.yaml b/services/monitoring/oneoffs/grafana-org-bootstrap.yaml
similarity index 93%
rename from services/monitoring/grafana-org-bootstrap.yaml
rename to services/monitoring/oneoffs/grafana-org-bootstrap.yaml
index f1d40755..6f824cc5 100644
--- a/services/monitoring/grafana-org-bootstrap.yaml
+++ b/services/monitoring/oneoffs/grafana-org-bootstrap.yaml
@@ -1,10 +1,15 @@
-# services/monitoring/grafana-org-bootstrap.yaml
+# services/monitoring/oneoffs/grafana-org-bootstrap.yaml
+# One-off job for monitoring/grafana-org-bootstrap-3.
+# Purpose: grafana org bootstrap 3 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: grafana-org-bootstrap-3
   namespace: monitoring
 spec:
+  suspend: true
   backoffLimit: 2
   template:
     metadata:
diff --git a/services/monitoring/grafana-user-dedupe-job.yaml b/services/monitoring/oneoffs/grafana-user-dedupe-job.yaml
similarity index 94%
rename from services/monitoring/grafana-user-dedupe-job.yaml
rename to services/monitoring/oneoffs/grafana-user-dedupe-job.yaml
index 8ab1a665..8194f186 100644
--- a/services/monitoring/grafana-user-dedupe-job.yaml
+++ b/services/monitoring/oneoffs/grafana-user-dedupe-job.yaml
@@ -1,10 +1,15 @@
-# services/monitoring/grafana-user-dedupe-job.yaml
+# services/monitoring/oneoffs/grafana-user-dedupe-job.yaml
+# One-off job for monitoring/grafana-user-dedupe-api-v7.
+# Purpose: grafana user dedupe api v7 (see container args/env in this file).
+# Run by setting spec.suspend to false, reconcile, then set it back to true.
+# Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
   name: grafana-user-dedupe-api-v7
   namespace: monitoring
 spec:
+  suspend: true
   backoffLimit: 1
   template:
     metadata:

From c8a9761ed3e9f6af9cb93cb5cf43d43738ec147e Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 01:52:23 -0300
Subject: [PATCH 395/416] atlasbot: simplify fast path

---
 services/comms/scripts/atlasbot/bot.py | 32 ++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 50fed4b6..d0d46efe 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -3605,10 +3605,33 @@ def _open_ended_multi(
 
 def _open_ended_total_steps(mode: str) -> int:
     if mode == "fast":
-        return 4
+        return 2
     return 7
 
 
+def _open_ended_fast_single(
+    prompt: str,
+    *,
+    fact_pack: str,
+    history_lines: list[str],
+    state: ThoughtState | None = None,
+    model: str,
+) -> str:
+    if state:
+        state.update("drafting", step=2, note="summarizing")
+    context = fact_pack
+    reply = _ollama_call(
+        ("atlasbot_fast", "atlasbot_fast"),
+        prompt,
+        context=context,
+        use_history=False,
+        model=model,
+    )
+    if state:
+        state.update("done", step=_open_ended_total_steps("fast"))
+    return _ensure_scores(reply)
+
+
 def _open_ended_fast(
     prompt: str,
     *,
@@ -3618,14 +3641,13 @@ def _open_ended_fast(
     history_lines: list[str],
     state: ThoughtState | None = None,
 ) -> str:
-    return _open_ended_multi(
+    model = _model_for_mode("fast")
+    return _open_ended_fast_single(
         prompt,
         fact_pack=fact_pack,
-        fact_lines=fact_lines,
-        fact_meta=fact_meta,
         history_lines=history_lines,
-        mode="fast",
         state=state,
+        model=model,
     )
 
 

From fdfc1f5857892e8b1e51316b38f7e46179c532db Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 01:52:40 -0300
Subject: [PATCH 396/416] comms: roll atlasbot config

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 1d89335a..8607858a 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-87
+        checksum/atlasbot-configmap: manual-atlasbot-88
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From a64bec005350ec3d09c3445e6cb8120965debbdb Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 01:58:07 -0300
Subject: [PATCH 397/416] atlasbot: tighten fast facts

---
 services/comms/scripts/atlasbot/bot.py | 39 +++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index d0d46efe..c44c7da3 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -3605,10 +3605,25 @@ def _open_ended_multi(
 
 def _open_ended_total_steps(mode: str) -> int:
     if mode == "fast":
-        return 2
+        return 3
     return 7
 
 
+def _fast_fact_lines(
+    fact_lines: list[str],
+    fact_meta: dict[str, dict[str, Any]],
+    fact_ids: list[str],
+) -> list[str]:
+    if not fact_ids:
+        return fact_lines
+    selected = [
+        line
+        for line in fact_lines
+        if fact_meta.get(line, {}).get("id") in set(fact_ids)
+    ]
+    return selected or fact_lines
+
+
 def _open_ended_fast_single(
     prompt: str,
     *,
@@ -3642,6 +3657,27 @@ def _open_ended_fast(
     state: ThoughtState | None = None,
 ) -> str:
     model = _model_for_mode("fast")
+    if state:
+        state.update("selecting", step=2, note="picking key facts")
+    subjective = _is_subjective_query(prompt)
+    focus_tags = _preferred_tags_for_prompt(prompt)
+    if not focus_tags and subjective:
+        focus_tags = set(_ALLOWED_INSIGHT_TAGS)
+    primary_ids = _open_ended_select_facts(
+        prompt,
+        fact_pack=fact_pack,
+        fact_meta=fact_meta,
+        history_lines=history_lines,
+        focus_tags=focus_tags,
+        avoid_fact_ids=[],
+        count=3,
+        subjective=subjective,
+        state=state,
+        step=2,
+        model=model,
+    )
+    selected_lines = _fast_fact_lines(fact_lines, fact_meta, primary_ids)
+    fact_pack = _fact_pack_text(selected_lines, fact_meta)
     return _open_ended_fast_single(
         prompt,
         fact_pack=fact_pack,
@@ -4092,6 +4128,7 @@ def _ollama_call(
         "Do not suggest commands unless explicitly asked. "
         "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. "
         "Translate metrics into natural language instead of echoing raw label/value pairs. "
+        "When providing counts or totals, use the exact numbers from the context; do not invent or truncate. "
         "Avoid bare lists unless the user asked for a list; weave numbers into sentences. "
         "Do not answer by only listing runbooks; if the question is about Atlas/Othrys, summarize the cluster first and mention docs only if useful. "
         "If the question is not about Atlas/Othrys and no cluster context is provided, answer using general knowledge and say when you are unsure. "

From 4effe8d7129aa81e818638e447512b68f2d18712 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 01:58:23 -0300
Subject: [PATCH 398/416] comms: roll atlasbot config

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 8607858a..a7fbea9c 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-88
+        checksum/atlasbot-configmap: manual-atlasbot-89
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

From 00e4b9e9e93eb1bc307e4fdae0d9591a43156bd7 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 02:21:42 -0300
Subject: [PATCH 399/416] atlasbot: rework reasoning pipeline

---
 services/comms/atlasbot-deployment.yaml |   2 +-
 services/comms/scripts/atlasbot/bot.py  | 286 +++++++++++++++++-------
 2 files changed, 210 insertions(+), 78 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index a7fbea9c..c9602c32 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-89
+        checksum/atlasbot-configmap: manual-atlasbot-90
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index c44c7da3..ffc8a5c8 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -16,7 +16,7 @@ PASSWORD = os.environ["BOT_PASS"]
 ROOM_ALIAS = "#othrys:live.bstein.dev"
 
 OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/")
-MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b-instruct-q4_0")
+MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5:14b-instruct")
 MODEL_FAST = os.environ.get("ATLASBOT_MODEL_FAST", "")
 MODEL_DEEP = os.environ.get("ATLASBOT_MODEL_DEEP", "")
 FALLBACK_MODEL = os.environ.get("OLLAMA_FALLBACK_MODEL", "")
@@ -2895,6 +2895,7 @@ def _open_ended_system() -> str:
         "Use ONLY the provided fact pack and recent chat as your evidence. "
         "You may draw light inferences if you label them as such. "
         "Write concise, human sentences with a helpful, calm tone (not a list). "
+        "Be willing to take a light stance; do not over-hedge. "
         "If the question is subjective (cool/interesting/unconventional), pick a standout fact and explain why it stands out. "
         "If the question asks for a list, embed the list inline in a sentence (comma-separated). "
         "If the question is ambiguous, pick a reasonable interpretation and state it briefly. "
@@ -3002,20 +3003,27 @@ def _open_ended_plan(
     *,
     fact_pack: str,
     history_lines: list[str],
+    focus_tags: set[str],
+    avoid_tags: set[str],
     count: int,
     state: ThoughtState | None,
+    step: int,
     model: str | None,
 ) -> list[dict[str, Any]]:
     if state:
-        state.update("planning", step=1, note="mapping angles")
+        state.update("planning", step=step, note="mapping angles")
     count = max(1, count)
+    focus_hint = ", ".join(sorted(focus_tags)) if focus_tags else "any"
+    avoid_hint = ", ".join(sorted(avoid_tags)) if avoid_tags else "none"
     prompt_text = (
         "Analyze the question and propose up to "
         f"{count} distinct answer angles that can be supported by the fact pack. "
         "Keep them diverse (e.g., metrics, hardware, workload placement, recent changes). "
         "If the question is subjective, propose at least one angle that surfaces a standout detail. "
+        f"Prefer angles that align with these tags: {focus_hint}. "
+        f"Avoid angles that overlap these tags if possible: {avoid_hint}. "
         "Avoid repeating the same angle as the most recent response if possible. "
-        "Return JSON: {\"angles\":[{\"focus\":\"...\",\"reason\":\"...\",\"priority\":1-5}]}."
+        "Return JSON: {\"angles\":[{\"focus\":\"...\",\"reason\":\"...\",\"tags\":[\"tag\"],\"priority\":1-5}]}."
     )
     context = _append_history_context(fact_pack, history_lines)
     result = _ollama_json_call(
@@ -3037,10 +3045,12 @@ def _open_ended_plan(
             priority = item.get("priority")
             if not isinstance(priority, (int, float)):
                 priority = 3
+            tags = _sanitize_focus_tags(item.get("tags") or [])
             cleaned.append(
                 {
                     "focus": focus,
                     "reason": str(item.get("reason") or ""),
+                    "tags": tags,
                     "priority": int(max(1, min(5, priority))),
                 }
             )
@@ -3131,6 +3141,35 @@ def _preferred_tags_for_prompt(prompt: str) -> set[str]:
     return tags & _ALLOWED_INSIGHT_TAGS
 
 
+_TAG_KEYWORDS: dict[str, tuple[str, ...]] = {
+    "utilization": ("cpu", "ram", "memory", "net", "network", "io", "disk", "usage", "utilization", "hottest", "busy"),
+    "database": ("postgres", "db", "database", "connections"),
+    "pods": ("pod", "pods", "deployment", "daemonset", "job", "cron", "workload"),
+    "hardware": ("hardware", "architecture", "arch", "rpi", "raspberry", "jetson", "amd64", "arm64", "node", "nodes"),
+    "availability": ("ready", "not ready", "unready", "down", "missing"),
+    "workloads": ("workload", "service", "namespace", "app"),
+    "os": ("os", "kernel", "kubelet", "containerd", "runtime"),
+}
+
+
+def _tags_from_text(text: str) -> set[str]:
+    """Return the allowed insight tags that have a keyword starting a word in *text*."""
+    import re  # function-scope import keeps the helper self-contained
+    q = normalize_query(text)
+    if not q:
+        return set()
+    # Anchor at a word start: bare substring tests matched "os" in "postgres", "io" in "question".
+    hits = {tag for tag, kws in _TAG_KEYWORDS.items() if re.search(rf"\b(?:{'|'.join(map(re.escape, kws))})", q)}
+    return hits & _ALLOWED_INSIGHT_TAGS
+
+
+def _history_focus_tags(history_lines: list[str]) -> set[str]:
+    """Derive insight tags from the text of the last six history entries (non-strings skipped)."""
+    if history_lines:
+        return _tags_from_text(" ".join(entry for entry in history_lines[-6:] if isinstance(entry, str)))
+    return set()
+
+
 def _open_ended_insights(
     prompt: str,
     *,
@@ -3139,10 +3178,11 @@ def _open_ended_insights(
     history_lines: list[str],
     count: int,
     state: ThoughtState | None,
+    step: int,
     model: str | None,
 ) -> list[dict[str, Any]]:
     if state:
-        state.update("analyzing", note="scouting insights")
+        state.update("analyzing", step=step, note="scouting insights")
     count = max(1, count)
     allowed_tags = ", ".join(sorted(_ALLOWED_INSIGHT_TAGS))
     prompt_text = (
@@ -3188,10 +3228,35 @@ def _open_ended_insights(
     return cleaned
 
 
+def _rank_insights(
+    insights: list[dict[str, Any]],
+    *,
+    focus_tags: set[str],
+    avoid_tags: set[str],
+    count: int,
+) -> list[dict[str, Any]]:
+    """Return the *count* best insights: 65% relevance + 35% novelty, +0.1 focus hit, -0.2 avoid hit."""
+    def _score(entry: dict[str, Any]) -> float:
+        entry_tags = set(entry.get("tags") or [])
+        total = (
+            _normalize_fraction(entry.get("relevance"), default=0.6) * 0.65
+            + _normalize_fraction(entry.get("novelty"), default=0.5) * 0.35
+        )
+        if focus_tags and entry_tags & focus_tags:
+            total += 0.1
+        if avoid_tags and entry_tags & avoid_tags:
+            total -= 0.2
+        return total
+
+    # sorted() is stable, so insights with equal scores keep their input order.
+    return sorted(insights or [], key=_score, reverse=True)[:count]
+
+
 def _fallback_fact_ids(
     fact_meta: dict[str, dict[str, Any]],
     *,
     focus_tags: set[str],
+    avoid_tags: set[str],
     count: int,
 ) -> list[str]:
     if not fact_meta:
@@ -3202,9 +3267,16 @@ def _fallback_fact_ids(
             for fid, meta in fact_meta.items()
             if focus_tags & set(meta.get("tags") or [])
         ]
+        if avoid_tags:
+            tagged = [fid for fid in tagged if not (avoid_tags & set(fact_meta.get(fid, {}).get("tags") or []))]
         if tagged:
             return tagged[:count]
-    return list(fact_meta.keys())[:count]
+    all_ids = list(fact_meta.keys())
+    if avoid_tags:
+        filtered = [fid for fid in all_ids if not (avoid_tags & set(fact_meta.get(fid, {}).get("tags") or []))]
+        if filtered:
+            return filtered[:count]
+    return all_ids[:count]
 
 
 def _open_ended_select_facts(
@@ -3214,6 +3286,7 @@ def _open_ended_select_facts(
     fact_meta: dict[str, dict[str, Any]],
     history_lines: list[str],
     focus_tags: set[str],
+    avoid_tags: set[str],
     avoid_fact_ids: list[str],
     count: int,
     subjective: bool,
@@ -3224,11 +3297,13 @@ def _open_ended_select_facts(
     if state:
         state.update("selecting facts", step=step, note="picking evidence")
     focus_hint = ", ".join(sorted(focus_tags)) if focus_tags else "any"
+    avoid_tag_hint = ", ".join(sorted(avoid_tags)) if avoid_tags else "none"
     avoid_hint = ", ".join(avoid_fact_ids) if avoid_fact_ids else "none"
     prompt_text = (
         "Select the fact IDs that best answer the question. "
         f"Pick up to {count} fact IDs. "
         f"Focus tags: {focus_hint}. "
+        f"Avoid these tags if possible: {avoid_tag_hint}. "
         f"Avoid these fact IDs: {avoid_hint}. "
         "If the question is subjective, pick standout or unusual facts; "
         "if objective, pick the minimal facts needed. "
@@ -3248,7 +3323,18 @@ def _open_ended_select_facts(
                 selected.append(fid)
             if len(selected) >= count:
                 break
-    seed = _fallback_fact_ids(fact_meta, focus_tags=focus_tags, count=count)
+    if avoid_tags:
+        selected = [
+            fid
+            for fid in selected
+            if not (avoid_tags & set(fact_meta.get(fid, {}).get("tags") or []))
+        ] or selected
+    seed = _fallback_fact_ids(
+        fact_meta,
+        focus_tags=focus_tags,
+        avoid_tags=avoid_tags,
+        count=count,
+    )
     if selected:
         for fid in seed:
             if fid not in selected:
@@ -3483,7 +3569,7 @@ def _open_ended_multi(
     if mode == "fast":
         total_steps = 4
     else:
-        total_steps = 7
+        total_steps = 9
     if state:
         state.total_steps = total_steps
 
@@ -3503,41 +3589,25 @@ def _open_ended_multi(
     focus_tags = set(interpretation.get("focus_tags") or []) or _preferred_tags_for_prompt(prompt)
     if not focus_tags and subjective:
         focus_tags = set(_ALLOWED_INSIGHT_TAGS)
+    avoid_tags = _history_focus_tags(history_lines) if (subjective or _is_followup_query(prompt)) else set()
 
-    primary_ids = _open_ended_select_facts(
-        prompt,
-        fact_pack=fact_pack,
-        fact_meta=fact_meta,
-        history_lines=history_lines,
-        focus_tags=focus_tags,
-        avoid_fact_ids=[],
-        count=4 if mode == "deep" else 3,
-        subjective=subjective,
-        state=state,
-        step=2,
-        model=model,
-    )
-    alternate_ids: list[str] = []
-    if mode == "deep":
-        alternate_ids = _open_ended_select_facts(
+    if mode == "fast":
+        primary_ids = _open_ended_select_facts(
             prompt,
             fact_pack=fact_pack,
             fact_meta=fact_meta,
             history_lines=history_lines,
             focus_tags=focus_tags,
-            avoid_fact_ids=primary_ids,
-            count=4,
+            avoid_tags=avoid_tags,
+            avoid_fact_ids=[],
+            count=3,
             subjective=subjective,
             state=state,
-            step=3,
+            step=2,
             model=model,
         )
-
-    candidates: list[dict[str, Any]] = []
-    focus_label = interpretation.get("focus_label") or "primary angle"
-    step = 3 if mode == "fast" else 4
-    candidates.append(
-        _open_ended_candidate(
+        focus_label = interpretation.get("focus_label") or "primary angle"
+        candidate = _open_ended_candidate(
             prompt,
             focus=str(focus_label),
             fact_pack=fact_pack,
@@ -3546,17 +3616,65 @@ def _open_ended_multi(
             tone=str(tone),
             allow_list=allow_list,
             state=state,
-            step=step,
+            step=3,
             fact_hints=primary_ids,
             model=model,
         )
+        reply = _open_ended_synthesize(
+            prompt,
+            fact_pack=fact_pack,
+            history_lines=history_lines,
+            candidates=[candidate],
+            subjective=subjective,
+            tone=str(tone),
+            allow_list=allow_list,
+            state=state,
+            step=4,
+            model=model,
+            critique=None,
+        )
+        if state:
+            state.update("done", step=total_steps)
+        return reply
+
+    angles = _open_ended_plan(
+        prompt,
+        fact_pack=fact_pack,
+        history_lines=history_lines,
+        focus_tags=focus_tags,
+        avoid_tags=avoid_tags,
+        count=5,
+        state=state,
+        step=2,
+        model=model,
     )
-    step += 1
-    if mode == "deep" and alternate_ids:
+    if state and avoid_tags:
+        state.update("planning", step=2, note=f"avoiding {', '.join(sorted(avoid_tags))}")
+
+    insights = _open_ended_insights(
+        prompt,
+        fact_pack=fact_pack,
+        fact_meta=fact_meta,
+        history_lines=history_lines,
+        count=7,
+        state=state,
+        step=3,
+        model=model,
+    )
+    ranked_insights = _rank_insights(
+        insights,
+        focus_tags=focus_tags,
+        avoid_tags=avoid_tags,
+        count=3,
+    )
+
+    candidates: list[dict[str, Any]] = []
+    step = 4
+    for insight in ranked_insights:
         candidates.append(
             _open_ended_candidate(
                 prompt,
-                focus="alternate angle",
+                focus=insight.get("summary") or "insight",
                 fact_pack=fact_pack,
                 history_lines=history_lines,
                 subjective=subjective,
@@ -3564,27 +3682,61 @@ def _open_ended_multi(
                 allow_list=allow_list,
                 state=state,
                 step=step,
-                fact_hints=alternate_ids,
+                fact_hints=insight.get("fact_ids") or [],
                 model=model,
             )
         )
         step += 1
+    if not candidates and angles:
+        for angle in angles[:2]:
+            angle_tags = set(angle.get("tags") or []) or _tags_from_text(angle.get("focus") or "")
+            fact_ids = _open_ended_select_facts(
+                prompt,
+                fact_pack=fact_pack,
+                fact_meta=fact_meta,
+                history_lines=history_lines,
+                focus_tags=angle_tags or focus_tags,
+                avoid_tags=avoid_tags,
+                avoid_fact_ids=[],
+                count=4,
+                subjective=subjective,
+                state=state,
+                step=step,
+                model=model,
+            )
+            candidates.append(
+                _open_ended_candidate(
+                    prompt,
+                    focus=angle.get("focus") or "alternate angle",
+                    fact_pack=fact_pack,
+                    history_lines=history_lines,
+                    subjective=subjective,
+                    tone=str(tone),
+                    allow_list=allow_list,
+                    state=state,
+                    step=step,
+                    fact_hints=fact_ids,
+                    model=model,
+                )
+            )
+            step += 1
+            if len(candidates) >= 2:
+                break
+
     if state:
         state.update("evaluating", step=step, note="ranking candidates")
-    selected = _select_candidates(candidates, count=1 if mode == "fast" else 2)
+    selected = _select_candidates(candidates, count=2)
+    step += 1
+    critique = _open_ended_critique(
+        prompt,
+        fact_pack=fact_pack,
+        history_lines=history_lines,
+        candidates=selected or candidates,
+        state=state,
+        step=step,
+        model=model,
+    )
     step += 1
-    critique = ""
-    if mode == "deep":
-        critique = _open_ended_critique(
-            prompt,
-            fact_pack=fact_pack,
-            history_lines=history_lines,
-            candidates=selected or candidates,
-            state=state,
-            step=step,
-            model=model,
-        )
-        step += 1
     reply = _open_ended_synthesize(
         prompt,
         fact_pack=fact_pack,
@@ -3605,8 +3757,8 @@ def _open_ended_multi(
 
 def _open_ended_total_steps(mode: str) -> int:
     if mode == "fast":
-        return 3
-    return 7
+        return 4
+    return 9
 
 
 def _fast_fact_lines(
@@ -3656,34 +3808,14 @@ def _open_ended_fast(
     history_lines: list[str],
     state: ThoughtState | None = None,
 ) -> str:
-    model = _model_for_mode("fast")
-    if state:
-        state.update("selecting", step=2, note="picking key facts")
-    subjective = _is_subjective_query(prompt)
-    focus_tags = _preferred_tags_for_prompt(prompt)
-    if not focus_tags and subjective:
-        focus_tags = set(_ALLOWED_INSIGHT_TAGS)
-    primary_ids = _open_ended_select_facts(
+    return _open_ended_multi(
         prompt,
         fact_pack=fact_pack,
+        fact_lines=fact_lines,
         fact_meta=fact_meta,
         history_lines=history_lines,
-        focus_tags=focus_tags,
-        avoid_fact_ids=[],
-        count=3,
-        subjective=subjective,
+        mode="fast",
         state=state,
-        step=2,
-        model=model,
-    )
-    selected_lines = _fast_fact_lines(fact_lines, fact_meta, primary_ids)
-    fact_pack = _fact_pack_text(selected_lines, fact_meta)
-    return _open_ended_fast_single(
-        prompt,
-        fact_pack=fact_pack,
-        history_lines=history_lines,
-        state=state,
-        model=model,
     )
 
 
@@ -3846,7 +3978,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
         if cluster_query:
             context = build_context(
                 cleaned,
-                allow_tools=False,
+                allow_tools=True,
                 targets=[],
                 inventory=inventory,
                 snapshot=snapshot,
@@ -3860,7 +3992,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
                 workloads=workloads,
                 history_lines=history_lines,
                 mode=mode,
-                allow_tools=False,
+                allow_tools=True,
                 state=None,
             )
         else:

From 7a7d96ba2127d10d1c9dfccd10d50793218a10d9 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 02:43:24 -0300
Subject: [PATCH 400/416] comms: tune atlasbot quick model

---
 services/comms/atlasbot-deployment.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index c9602c32..d570fd9a 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-90
+        checksum/atlasbot-configmap: manual-atlasbot-91
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
@@ -84,7 +84,7 @@ spec:
             - name: OLLAMA_MODEL
               value: qwen2.5:14b-instruct
             - name: ATLASBOT_MODEL_FAST
-              value: qwen2.5:14b-instruct
+              value: qwen2.5:7b-instruct-q4_0
             - name: ATLASBOT_MODEL_DEEP
               value: qwen2.5:14b-instruct
             - name: OLLAMA_FALLBACK_MODEL

From e1505873d39b33a187311ed962ded802276d0c9a Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 02:53:43 -0300
Subject: [PATCH 401/416] atlasbot: streamline quick answers

---
 services/comms/scripts/atlasbot/bot.py | 120 ++++++++++---------------
 1 file changed, 47 insertions(+), 73 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index ffc8a5c8..6f18b9ea 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -3562,14 +3562,10 @@ def _open_ended_multi(
     fact_lines: list[str],
     fact_meta: dict[str, dict[str, Any]],
     history_lines: list[str],
-    mode: str,
     state: ThoughtState | None = None,
 ) -> str:
-    model = _model_for_mode(mode)
-    if mode == "fast":
-        total_steps = 4
-    else:
-        total_steps = 9
+    model = _model_for_mode("deep")
+    total_steps = _open_ended_total_steps("deep")
     if state:
         state.total_steps = total_steps
 
@@ -3591,52 +3587,6 @@ def _open_ended_multi(
         focus_tags = set(_ALLOWED_INSIGHT_TAGS)
     avoid_tags = _history_focus_tags(history_lines) if (subjective or _is_followup_query(prompt)) else set()
 
-    if mode == "fast":
-        primary_ids = _open_ended_select_facts(
-            prompt,
-            fact_pack=fact_pack,
-            fact_meta=fact_meta,
-            history_lines=history_lines,
-            focus_tags=focus_tags,
-            avoid_tags=avoid_tags,
-            avoid_fact_ids=[],
-            count=3,
-            subjective=subjective,
-            state=state,
-            step=2,
-            model=model,
-        )
-        focus_label = interpretation.get("focus_label") or "primary angle"
-        candidate = _open_ended_candidate(
-            prompt,
-            focus=str(focus_label),
-            fact_pack=fact_pack,
-            history_lines=history_lines,
-            subjective=subjective,
-            tone=str(tone),
-            allow_list=allow_list,
-            state=state,
-            step=3,
-            fact_hints=primary_ids,
-            model=model,
-        )
-        reply = _open_ended_synthesize(
-            prompt,
-            fact_pack=fact_pack,
-            history_lines=history_lines,
-            candidates=[candidate],
-            subjective=subjective,
-            tone=str(tone),
-            allow_list=allow_list,
-            state=state,
-            step=4,
-            model=model,
-            critique=None,
-        )
-        if state:
-            state.update("done", step=total_steps)
-        return reply
-
     angles = _open_ended_plan(
         prompt,
         fact_pack=fact_pack,
@@ -3757,41 +3707,52 @@ def _open_ended_multi(
 
 def _open_ended_total_steps(mode: str) -> int:
     if mode == "fast":
-        return 4
+        return 2
     return 9
 
 
 def _fast_fact_lines(
     fact_lines: list[str],
     fact_meta: dict[str, dict[str, Any]],
-    fact_ids: list[str],
+    *,
+    focus_tags: set[str],
+    avoid_tags: set[str],
+    limit: int = 10,
 ) -> list[str]:
-    if not fact_ids:
-        return fact_lines
-    selected = [
-        line
-        for line in fact_lines
-        if fact_meta.get(line, {}).get("id") in set(fact_ids)
-    ]
-    return selected or fact_lines
+    if not fact_lines:
+        return []
+    selected: list[str] = []
+    for idx, line in enumerate(fact_lines):
+        fid = f"F{idx + 1}"
+        tags = set(fact_meta.get(fid, {}).get("tags") or [])
+        if focus_tags and not (focus_tags & tags):
+            continue
+        if avoid_tags and (avoid_tags & tags):
+            continue
+        selected.append(line)
+        if len(selected) >= limit:
+            break
+    if selected:
+        return selected
+    trimmed = fact_lines[:limit]
+    return trimmed or fact_lines
 
 
 def _open_ended_fast_single(
     prompt: str,
     *,
-    fact_pack: str,
-    history_lines: list[str],
+    context: str,
     state: ThoughtState | None = None,
     model: str,
 ) -> str:
     if state:
-        state.update("drafting", step=2, note="summarizing")
-    context = fact_pack
+        state.update("drafting", step=1, note="summarizing")
     reply = _ollama_call(
         ("atlasbot_fast", "atlasbot_fast"),
         prompt,
         context=context,
         use_history=False,
+        system_override=_open_ended_system(),
         model=model,
     )
     if state:
@@ -3808,14 +3769,28 @@ def _open_ended_fast(
     history_lines: list[str],
     state: ThoughtState | None = None,
 ) -> str:
-    return _open_ended_multi(
+    model = _model_for_mode("fast")
+    subjective = _is_subjective_query(prompt)
+    focus_tags = _preferred_tags_for_prompt(prompt)
+    if not focus_tags and subjective:
+        focus_tags = set(_ALLOWED_INSIGHT_TAGS)
+    avoid_tags = _history_focus_tags(history_lines) if (subjective or _is_followup_query(prompt)) else set()
+    selected_lines = _fast_fact_lines(
+        fact_lines,
+        fact_meta,
+        focus_tags=focus_tags,
+        avoid_tags=avoid_tags,
+    )
+    selected_meta = _fact_pack_meta(selected_lines)
+    selected_pack = _fact_pack_text(selected_lines, selected_meta)
+    context = _append_history_context(selected_pack, history_lines)
+    if state:
+        state.total_steps = _open_ended_total_steps("fast")
+    return _open_ended_fast_single(
         prompt,
-        fact_pack=fact_pack,
-        fact_lines=fact_lines,
-        fact_meta=fact_meta,
-        history_lines=history_lines,
-        mode="fast",
+        context=context,
         state=state,
+        model=model,
     )
 
 
@@ -3834,7 +3809,6 @@ def _open_ended_deep(
         fact_lines=fact_lines,
         fact_meta=fact_meta,
         history_lines=history_lines,
-        mode="deep",
         state=state,
     )
 

From b257e4fc10f79affe10ce6547f6b2e02f2c465c0 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 03:09:34 -0300
Subject: [PATCH 402/416] atlasbot: enrich fact pack summaries

---
 services/comms/scripts/atlasbot/bot.py | 28 ++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 6f18b9ea..96765b13 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -1037,6 +1037,11 @@ def facts_context(
         nodes_list = by_hardware.get(key) or []
         if nodes_list:
             lines.append(f"- {key}: {', '.join(nodes_list)}")
+    if by_hardware:
+        counts = {key: len(nodes_list) for key, nodes_list in by_hardware.items() if nodes_list}
+        if counts:
+            parts = [f"{key}={count}" for key, count in sorted(counts.items())]
+            lines.append(f"- nodes_by_hardware_count: {', '.join(parts)}")
     non_rpi = sorted(set(by_hardware.get("jetson", [])) | set(by_hardware.get("amd64", [])))
     if non_rpi:
         lines.append(f"- non_raspberry_pi: {', '.join(non_rpi)}")
@@ -1096,6 +1101,25 @@ def facts_context(
         value = metrics.get(key)
         if value is not None:
             lines.append(f"- {key}: {value}")
+    if workloads:
+        ns_counts: dict[str, int] = collections.defaultdict(int)
+        for entry in workloads:
+            if not isinstance(entry, dict):
+                continue
+            ns = entry.get("namespace") or ""
+            pods = entry.get("pods_running")
+            if pods is None:
+                pods = entry.get("pods_total")
+            try:
+                pods_val = int(pods)
+            except (TypeError, ValueError):
+                pods_val = 0
+            if ns:
+                ns_counts[ns] += pods_val
+        if ns_counts:
+            top_ns = sorted(ns_counts.items(), key=lambda item: item[1], reverse=True)[:5]
+            parts = [f"{ns}={count}" for ns, count in top_ns]
+            lines.append(f"- pods_by_namespace: {', '.join(parts)}")
 
     top_restarts = metrics.get("top_restarts_1h") if isinstance(metrics.get("top_restarts_1h"), list) else []
     if top_restarts:
@@ -2725,6 +2749,8 @@ def _fact_line_tags(line: str) -> set[str]:
         tags.add("database")
     if "pods_" in text or "pod phases" in text or "restarts" in text:
         tags.add("pods")
+    if "namespace" in text:
+        tags.add("workloads")
     if "workloads" in text or "primary_node" in text or "workload_" in text:
         tags.add("workloads")
     if "node_details" in text:
@@ -2900,6 +2926,8 @@ def _open_ended_system() -> str:
         "If the question asks for a list, embed the list inline in a sentence (comma-separated). "
         "If the question is ambiguous, pick a reasonable interpretation and state it briefly. "
         "Avoid repeating the exact same observation as the last response if possible; vary across metrics, workload, or hardware details. "
+        "When the fact pack includes hottest_cpu/ram/net/io lines, use them to answer hottest/busiest node questions. "
+        "When the fact pack includes postgres_hottest_db, use it for questions about the busiest database. "
         "Do not invent numbers or facts. "
         "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), HallucinationRisk (low|medium|high)."
     )

From c2916e60c15cf90f3128cf7ce00cc4cf83314d77 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 03:14:12 -0300
Subject: [PATCH 403/416] atlasbot: prioritize fact selection for quick answers

---
 services/comms/scripts/atlasbot/bot.py | 56 +++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 10 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 96765b13..43f578b0 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -3169,6 +3169,23 @@ def _preferred_tags_for_prompt(prompt: str) -> set[str]:
     return tags & _ALLOWED_INSIGHT_TAGS
 
 
+def _primary_tags_for_prompt(prompt: str) -> set[str]:
+    """Return the first tag group, in priority order, with a keyword starting a word in *prompt*."""
+    import re  # function-scope import keeps the helper self-contained
+    q = normalize_query(prompt)
+    for tags, keywords in (
+        ({"utilization"}, "cpu|ram|memory|net|network|io|disk|hottest|busy|usage|utilization|load"),
+        ({"database"}, "postgres|database|db|connections"),
+        ({"pods"}, "pod|pods|deployment|job|cronjob"),
+        ({"workloads"}, "workload|service|namespace"),
+        ({"availability"}, "ready|not ready|unready|down|unreachable|availability"),
+        ({"hardware", "inventory", "architecture"}, "node|nodes|hardware|arch|architecture|rpi|jetson|amd64|arm64|worker|control-plane"),
+    ):
+        if re.search(rf"\b(?:{keywords})", q):  # word-start anchor: "load" must not match "workload"
+            return tags
+    return set()
+
+
 _TAG_KEYWORDS: dict[str, tuple[str, ...]] = {
     "utilization": ("cpu", "ram", "memory", "net", "network", "io", "disk", "usage", "utilization", "hottest", "busy"),
     "database": ("postgres", "db", "database", "connections"),
@@ -3745,25 +3762,43 @@ def _fast_fact_lines(
     *,
     focus_tags: set[str],
     avoid_tags: set[str],
+    primary_tags: set[str] | None = None,
     limit: int = 10,
 ) -> list[str]:
     if not fact_lines:
         return []
-    selected: list[str] = []
+    primary_tags = primary_tags or set()
+    scored: list[tuple[int, int, str]] = []
     for idx, line in enumerate(fact_lines):
         fid = f"F{idx + 1}"
         tags = set(fact_meta.get(fid, {}).get("tags") or [])
-        if focus_tags and not (focus_tags & tags):
-            continue
         if avoid_tags and (avoid_tags & tags):
             continue
-        selected.append(line)
+        score = 0
+        if primary_tags:
+            score += 4 * len(tags & primary_tags)
+        if focus_tags:
+            score += 2 * len(tags & focus_tags)
+        scored.append((score, idx, line))
+    scored.sort(key=lambda item: (-item[0], item[1]))
+    selected: list[str] = []
+    for score, _, line in scored:
+        if score <= 0 and selected:
+            break
+        if score > 0:
+            selected.append(line)
         if len(selected) >= limit:
             break
-    if selected:
-        return selected
-    trimmed = fact_lines[:limit]
-    return trimmed or fact_lines
+    if not selected:
+        selected = [line for _, _, line in scored[:limit]]
+    elif len(selected) < limit:
+        for _, _, line in scored:
+            if line in selected:
+                continue
+            selected.append(line)
+            if len(selected) >= limit:
+                break
+    return selected
 
 
 def _open_ended_fast_single(
@@ -3799,6 +3834,7 @@ def _open_ended_fast(
 ) -> str:
     model = _model_for_mode("fast")
     subjective = _is_subjective_query(prompt)
+    primary_tags = _primary_tags_for_prompt(prompt)
     focus_tags = _preferred_tags_for_prompt(prompt)
     if not focus_tags and subjective:
         focus_tags = set(_ALLOWED_INSIGHT_TAGS)
@@ -3808,15 +3844,15 @@ def _open_ended_fast(
         fact_meta,
         focus_tags=focus_tags,
         avoid_tags=avoid_tags,
+        primary_tags=primary_tags,
     )
     selected_meta = _fact_pack_meta(selected_lines)
     selected_pack = _fact_pack_text(selected_lines, selected_meta)
-    context = _append_history_context(selected_pack, history_lines)
     if state:
         state.total_steps = _open_ended_total_steps("fast")
     return _open_ended_fast_single(
         prompt,
-        context=context,
+        context=selected_pack,
         state=state,
         model=model,
     )

From 483c3566f4d1497bbc87bfa4bb67718c7d9e4d15 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 03:17:46 -0300
Subject: [PATCH 404/416] atlasbot: enforce fast answer body

---
 services/comms/scripts/atlasbot/bot.py | 28 ++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 43f578b0..7d47423f 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -2926,6 +2926,7 @@ def _open_ended_system() -> str:
         "If the question asks for a list, embed the list inline in a sentence (comma-separated). "
         "If the question is ambiguous, pick a reasonable interpretation and state it briefly. "
         "Avoid repeating the exact same observation as the last response if possible; vary across metrics, workload, or hardware details. "
+        "Always include at least one substantive answer sentence before the score lines. "
         "When the fact pack includes hottest_cpu/ram/net/io lines, use them to answer hottest/busiest node questions. "
         "When the fact pack includes postgres_hottest_db, use it for questions about the busiest database. "
         "Do not invent numbers or facts. "
@@ -3801,6 +3802,24 @@ def _fast_fact_lines(
     return selected
 
 
+def _has_body_lines(answer: str) -> bool:
+    lines = [line.strip() for line in (answer or "").splitlines() if line.strip()]
+    for line in lines:
+        lowered = line.lower()
+        if lowered.startswith("confidence"):
+            continue
+        if lowered.startswith("relevance"):
+            continue
+        if lowered.startswith("satisfaction"):
+            continue
+        if lowered.startswith("hallucinationrisk"):
+            continue
+        if lowered.startswith("hallucination risk"):
+            continue
+        return True
+    return False
+
+
 def _open_ended_fast_single(
     prompt: str,
     *,
@@ -3818,6 +3837,15 @@ def _open_ended_fast_single(
         system_override=_open_ended_system(),
         model=model,
     )
+    if not _has_body_lines(reply):
+        reply = _ollama_call(
+            ("atlasbot_fast", "atlasbot_fast"),
+            prompt + " Provide one clear sentence before the score lines.",
+            context=context,
+            use_history=False,
+            system_override=_open_ended_system(),
+            model=model,
+        )
     if state:
         state.update("done", step=_open_ended_total_steps("fast"))
     return _ensure_scores(reply)

From c8630ddae8231bfb8436802e972bfa7168983eb1 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 03:20:28 -0300
Subject: [PATCH 405/416] atlasbot: fix tag detection for workload queries

---
 services/comms/scripts/atlasbot/bot.py | 29 ++++++++++++++------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 7d47423f..b73d3f3d 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -3154,35 +3154,37 @@ def _open_ended_interpret(
 
 def _preferred_tags_for_prompt(prompt: str) -> set[str]:
     q = normalize_query(prompt)
+    tokens = set(_tokens(prompt))
     tags: set[str] = set()
-    if any(word in q for word in ("cpu", "ram", "memory", "net", "network", "io", "disk", "hottest", "busy", "usage", "utilization", "load")):
+    if tokens & {"cpu", "ram", "memory", "net", "network", "io", "disk", "hottest", "busy", "usage", "utilization", "load"}:
         tags.add("utilization")
-    if any(word in q for word in ("postgres", "database", "db", "connections")):
+    if tokens & {"postgres", "database", "db", "connections"}:
         tags.add("database")
-    if any(word in q for word in ("pod", "pods", "deployment", "job", "cronjob")):
+    if tokens & {"pod", "pods", "deployment", "job", "cronjob"}:
         tags.add("pods")
-    if any(word in q for word in ("workload", "service", "namespace")):
+    if tokens & {"workload", "service", "namespace"}:
         tags.add("workloads")
-    if any(word in q for word in ("ready", "not ready", "down", "unreachable", "availability")):
+    if tokens & {"ready", "down", "unreachable", "availability"} or "not ready" in q:
         tags.add("availability")
-    if any(word in q for word in ("node", "nodes", "hardware", "arch", "architecture", "rpi", "jetson", "amd64", "arm64", "worker", "control-plane")):
+    if tokens & {"node", "nodes", "hardware", "arch", "architecture", "rpi", "jetson", "amd64", "arm64", "worker", "control-plane"}:
         tags.update({"hardware", "inventory", "architecture"})
     return tags & _ALLOWED_INSIGHT_TAGS
 
 
 def _primary_tags_for_prompt(prompt: str) -> set[str]:
     q = normalize_query(prompt)
-    if any(word in q for word in ("cpu", "ram", "memory", "net", "network", "io", "disk", "hottest", "busy", "usage", "utilization", "load")):
+    tokens = set(_tokens(prompt))
+    if tokens & {"cpu", "ram", "memory", "net", "network", "io", "disk", "hottest", "busy", "usage", "utilization", "load"}:
         return {"utilization"}
-    if any(word in q for word in ("postgres", "database", "db", "connections")):
+    if tokens & {"postgres", "database", "db", "connections"}:
         return {"database"}
-    if any(word in q for word in ("pod", "pods", "deployment", "job", "cronjob")):
+    if tokens & {"pod", "pods", "deployment", "job", "cronjob"}:
         return {"pods"}
-    if any(word in q for word in ("workload", "service", "namespace")):
+    if tokens & {"workload", "service", "namespace"}:
         return {"workloads"}
-    if any(word in q for word in ("ready", "not ready", "down", "unreachable", "availability")):
+    if tokens & {"ready", "down", "unreachable", "availability"} or "not ready" in q:
         return {"availability"}
-    if any(word in q for word in ("node", "nodes", "hardware", "arch", "architecture", "rpi", "jetson", "amd64", "arm64", "worker", "control-plane")):
+    if tokens & {"node", "nodes", "hardware", "arch", "architecture", "rpi", "jetson", "amd64", "arm64", "worker", "control-plane"}:
         return {"hardware", "inventory", "architecture"}
     return set()
 
@@ -3202,9 +3204,10 @@ def _tags_from_text(text: str) -> set[str]:
     q = normalize_query(text)
     if not q:
         return set()
+    tokens = set(_tokens(text))
     tags: set[str] = set()
     for tag, keywords in _TAG_KEYWORDS.items():
-        if any(word in q for word in keywords):
+        if any(word in tokens for word in keywords):
             tags.add(tag)
     return tags & _ALLOWED_INSIGHT_TAGS
 

From 5bc90929be23c42e8f045bc177ec28c16230b7a4 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 03:23:54 -0300
Subject: [PATCH 406/416] comms: use 14b model for atlasbot quick

---
 services/comms/atlasbot-deployment.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index d570fd9a..6fbd3271 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-91
+        checksum/atlasbot-configmap: manual-atlasbot-92
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
@@ -84,7 +84,7 @@ spec:
             - name: OLLAMA_MODEL
               value: qwen2.5:14b-instruct
             - name: ATLASBOT_MODEL_FAST
-              value: qwen2.5:7b-instruct-q4_0
+              value: qwen2.5:14b-instruct-q4_0
             - name: ATLASBOT_MODEL_DEEP
               value: qwen2.5:14b-instruct
             - name: OLLAMA_FALLBACK_MODEL

From 48b46972abe1b185d912c9101c99d359d9ef046b Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 03:29:21 -0300
Subject: [PATCH 407/416] atlasbot: add fact-pack fallback for fast

---
 services/comms/scripts/atlasbot/bot.py | 35 ++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index b73d3f3d..4fa67d40 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -3823,6 +3823,37 @@ def _has_body_lines(answer: str) -> bool:
     return False
 
 
+def _fallback_fact_answer(prompt: str, context: str) -> str:
+    facts: list[str] = []
+    for line in (context or "").splitlines():
+        trimmed = line.strip()
+        if not trimmed.startswith("F"):
+            continue
+        if ":" not in trimmed:
+            continue
+        fact = trimmed.split(":", 1)[1].strip()
+        if fact.startswith("-"):
+            fact = fact.lstrip("-").strip()
+        if fact:
+            facts.append(fact)
+    if not facts:
+        return ""
+    tokens = set(_tokens(prompt))
+    best_fact = ""
+    best_score = -1
+    for fact in facts:
+        score = len(tokens & set(_tokens(fact)))
+        if score > best_score:
+            best_score = score
+            best_fact = fact
+    if best_score <= 0:
+        return ""
+    sentence = f"Based on the snapshot, {best_fact}"
+    if not sentence.endswith((".", "!", "?")):
+        sentence += "."
+    return sentence
+
+
 def _open_ended_fast_single(
     prompt: str,
     *,
@@ -3849,6 +3880,10 @@ def _open_ended_fast_single(
             system_override=_open_ended_system(),
             model=model,
         )
+    if not _has_body_lines(reply):
+        fallback = _fallback_fact_answer(prompt, context)
+        if fallback:
+            reply = fallback
     if state:
         state.update("done", step=_open_ended_total_steps("fast"))
     return _ensure_scores(reply)

From 9be79f07cd6d47e3661b0205d0e46894a807b868 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 03:32:17 -0300
Subject: [PATCH 408/416] atlasbot: prefer fact fallback for quantitative
 prompts

---
 services/comms/scripts/atlasbot/bot.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 4fa67d40..8806d2aa 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -3854,6 +3854,18 @@ def _fallback_fact_answer(prompt: str, context: str) -> str:
     return sentence
 
 
+def _is_quantitative_prompt(prompt: str) -> bool:
+    q = normalize_query(prompt)
+    if not q:
+        return False
+    tokens = set(_tokens(prompt))
+    if "how many" in q or "count" in tokens or "total" in tokens:
+        return True
+    if tokens & {"highest", "lowest", "hottest", "most", "least"}:
+        return True
+    return False
+
+
 def _open_ended_fast_single(
     prompt: str,
     *,
@@ -3880,10 +3892,9 @@ def _open_ended_fast_single(
             system_override=_open_ended_system(),
             model=model,
         )
-    if not _has_body_lines(reply):
-        fallback = _fallback_fact_answer(prompt, context)
-        if fallback:
-            reply = fallback
+    fallback = _fallback_fact_answer(prompt, context)
+    if fallback and (_is_quantitative_prompt(prompt) or not _has_body_lines(reply)):
+        reply = fallback
     if state:
         state.update("done", step=_open_ended_total_steps("fast"))
     return _ensure_scores(reply)

From 98dc7284e7a7c8b7574451916be2adb1638435c8 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 03:35:02 -0300
Subject: [PATCH 409/416] atlasbot: fix fallback fact parsing

---
 services/comms/scripts/atlasbot/bot.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 8806d2aa..e0f84175 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -3829,9 +3829,12 @@ def _fallback_fact_answer(prompt: str, context: str) -> str:
         trimmed = line.strip()
         if not trimmed.startswith("F"):
             continue
-        if ":" not in trimmed:
+        match = re.match(r"^F\\d+.*?\\]:\\s*(.*)$", trimmed)
+        if not match:
+            match = re.match(r"^F\\d+:\\s*(.*)$", trimmed)
+        if not match:
             continue
-        fact = trimmed.split(":", 1)[1].strip()
+        fact = match.group(1).strip()
         if fact.startswith("-"):
             fact = fact.lstrip("-").strip()
         if fact:

From 55a05c757f94964cda6a07cda210ecbd27aaeeb6 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 03:46:06 -0300
Subject: [PATCH 410/416] atlasbot: refine fast fact selection and prompts

---
 services/comms/atlasbot-deployment.yaml |  2 +-
 services/comms/scripts/atlasbot/bot.py  | 56 +++++++++++++++++++++----
 2 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 6fbd3271..f007942d 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-92
+        checksum/atlasbot-configmap: manual-atlasbot-93
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index e0f84175..5ce19845 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -253,11 +253,13 @@ def normalize_query(text: str) -> str:
     cleaned = (text or "").lower()
     for ch in _DASH_CHARS:
         cleaned = cleaned.replace(ch, "-")
+    cleaned = cleaned.replace("_", " ")
     cleaned = re.sub(r"\s+", " ", cleaned).strip()
     return cleaned
 
 def _tokens(text: str) -> list[str]:
-    toks = [t.lower() for t in TOKEN_RE.findall(text or "")]
+    cleaned = re.sub(r"[\\_/]", " ", text or "")
+    toks = [t.lower() for t in TOKEN_RE.findall(cleaned)]
     return [t for t in toks if t not in STOPWORDS and len(t) >= 2]
 
 
@@ -2730,6 +2732,18 @@ _ALLOWED_INSIGHT_TAGS = {
 
 _DYNAMIC_TAGS = {"availability", "database", "pods", "utilization", "workloads"}
 _INVENTORY_TAGS = {"hardware", "architecture", "inventory", "workers", "node_detail", "os"}
+_SUBJECTIVE_TAG_PRIORITY = (
+    "utilization",
+    "database",
+    "pods",
+    "workloads",
+    "availability",
+    "hardware",
+    "inventory",
+    "architecture",
+    "node_detail",
+    "os",
+)
 
 
 def _fact_line_tags(line: str) -> set[str]:
@@ -2922,7 +2936,8 @@ def _open_ended_system() -> str:
         "You may draw light inferences if you label them as such. "
         "Write concise, human sentences with a helpful, calm tone (not a list). "
         "Be willing to take a light stance; do not over-hedge. "
-        "If the question is subjective (cool/interesting/unconventional), pick a standout fact and explain why it stands out. "
+        "If the question is subjective (cool/interesting/unconventional), pick a standout fact, explain why it stands out, "
+        "and use 2-3 sentences. "
         "If the question asks for a list, embed the list inline in a sentence (comma-separated). "
         "If the question is ambiguous, pick a reasonable interpretation and state it briefly. "
         "Avoid repeating the exact same observation as the last response if possible; vary across metrics, workload, or hardware details. "
@@ -3773,6 +3788,8 @@ def _fast_fact_lines(
         return []
     primary_tags = primary_tags or set()
     scored: list[tuple[int, int, str]] = []
+    priority_map = {tag: idx for idx, tag in enumerate(_SUBJECTIVE_TAG_PRIORITY)}
+    use_priority = not primary_tags and focus_tags == _ALLOWED_INSIGHT_TAGS
     for idx, line in enumerate(fact_lines):
         fid = f"F{idx + 1}"
         tags = set(fact_meta.get(fid, {}).get("tags") or [])
@@ -3783,6 +3800,12 @@ def _fast_fact_lines(
             score += 4 * len(tags & primary_tags)
         if focus_tags:
             score += 2 * len(tags & focus_tags)
+        if use_priority and tags:
+            bonus = 0
+            for tag in tags:
+                if tag in priority_map:
+                    bonus = max(bonus, len(priority_map) - priority_map[tag])
+            score += bonus
         scored.append((score, idx, line))
     scored.sort(key=lambda item: (-item[0], item[1]))
     selected: list[str] = []
@@ -3845,13 +3868,27 @@ def _fallback_fact_answer(prompt: str, context: str) -> str:
     best_fact = ""
     best_score = -1
     for fact in facts:
-        score = len(tokens & set(_tokens(fact)))
+        key_match = re.match(r"^([A-Za-z0-9_\\-/ ]+):\\s*(.+)$", fact)
+        if not key_match:
+            key_match = re.match(r"^([A-Za-z0-9_\\-/ ]+)=\\s*(.+)$", fact)
+        key_tokens: set[str] = set()
+        if key_match:
+            key_tokens = set(_tokens(key_match.group(1)))
+        score = len(tokens & set(_tokens(fact))) + 2 * len(tokens & key_tokens)
         if score > best_score:
             best_score = score
             best_fact = fact
     if best_score <= 0:
         return ""
-    sentence = f"Based on the snapshot, {best_fact}"
+    key_match = re.match(r"^([A-Za-z0-9_\\-/ ]+):\\s*(.+)$", best_fact)
+    if not key_match:
+        key_match = re.match(r"^([A-Za-z0-9_\\-/ ]+)=\\s*(.+)$", best_fact)
+    if key_match:
+        key = key_match.group(1).strip().replace("_", " ")
+        val = key_match.group(2).strip()
+        sentence = f"{key.capitalize()} is {val}"
+    else:
+        sentence = f"Based on the snapshot, {best_fact}"
     if not sentence.endswith((".", "!", "?")):
         sentence += "."
     return sentence
@@ -3873,15 +3910,17 @@ def _open_ended_fast_single(
     prompt: str,
     *,
     context: str,
+    history_lines: list[str] | None = None,
     state: ThoughtState | None = None,
     model: str,
 ) -> str:
     if state:
         state.update("drafting", step=1, note="summarizing")
+    working_context = _append_history_context(context, history_lines or []) if history_lines else context
     reply = _ollama_call(
         ("atlasbot_fast", "atlasbot_fast"),
         prompt,
-        context=context,
+        context=working_context,
         use_history=False,
         system_override=_open_ended_system(),
         model=model,
@@ -3890,7 +3929,7 @@ def _open_ended_fast_single(
         reply = _ollama_call(
             ("atlasbot_fast", "atlasbot_fast"),
             prompt + " Provide one clear sentence before the score lines.",
-            context=context,
+            context=working_context,
             use_history=False,
             system_override=_open_ended_system(),
             model=model,
@@ -3933,6 +3972,7 @@ def _open_ended_fast(
     return _open_ended_fast_single(
         prompt,
         context=selected_pack,
+        history_lines=history_lines,
         state=state,
         model=model,
     )
@@ -4089,7 +4129,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
         cleaned_q = normalize_query(cleaned)
         cluster_affinity = _is_cluster_query(cleaned, inventory=inventory, workloads=workloads)
         subjective = _is_subjective_query(cleaned)
-        followup_affinity = subjective or any(word in cleaned_q for word in METRIC_HINT_WORDS)
+        followup_affinity = any(word in cleaned_q for word in METRIC_HINT_WORDS)
         contextual = history_cluster and (followup or followup_affinity)
         cluster_query = cluster_affinity or contextual
         context = ""
@@ -4633,7 +4673,7 @@ def sync_loop(token: str, room_id: str):
                 cleaned_q = normalize_query(cleaned_body)
                 cluster_affinity = _is_cluster_query(cleaned_body, inventory=inventory, workloads=workloads)
                 subjective = _is_subjective_query(cleaned_body)
-                followup_affinity = subjective or any(word in cleaned_q for word in METRIC_HINT_WORDS)
+                followup_affinity = any(word in cleaned_q for word in METRIC_HINT_WORDS)
                 contextual = history_cluster and (followup or followup_affinity)
                 cluster_query = cluster_affinity or contextual
                 context = ""

From d68252d51bfa6ce72d7c0ac3f723cf2139b94c81 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 03:51:37 -0300
Subject: [PATCH 411/416] atlasbot: expand fast context for quantitative
 prompts

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 services/comms/scripts/atlasbot/bot.py  | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index f007942d..7856eed4 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-93
+        checksum/atlasbot-configmap: manual-atlasbot-94
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 5ce19845..81212ff3 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -3967,6 +3967,8 @@ def _open_ended_fast(
     )
     selected_meta = _fact_pack_meta(selected_lines)
     selected_pack = _fact_pack_text(selected_lines, selected_meta)
+    if _is_quantitative_prompt(prompt) or not selected_lines:
+        selected_pack = fact_pack
     if state:
         state.total_steps = _open_ended_total_steps("fast")
     return _open_ended_fast_single(

From 9144953519c5f2538976a8706e15ff9a0ac7d0fe Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 03:56:26 -0300
Subject: [PATCH 412/416] atlasbot: improve fast fallback and usage filtering

---
 services/comms/atlasbot-deployment.yaml |  2 +-
 services/comms/scripts/atlasbot/bot.py  | 36 ++++++++++++++++++++++---
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 7856eed4..79946181 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-94
+        checksum/atlasbot-configmap: manual-atlasbot-95
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 81212ff3..357941bd 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -923,7 +923,7 @@ def _nodes_by_arch(inventory: list[dict[str, Any]]) -> dict[str, list[str]]:
         grouped[(node.get("arch") or "unknown")].append(node["name"])
     return {k: sorted(v) for k, v in grouped.items()}
 
-def _node_usage_table(metrics: dict[str, Any]) -> list[dict[str, Any]]:
+def _node_usage_table(metrics: dict[str, Any], *, allowed_nodes: set[str] | None = None) -> list[dict[str, Any]]:
     usage = metrics.get("node_usage") if isinstance(metrics.get("node_usage"), dict) else {}
     per_node: dict[str, dict[str, Any]] = {}
     for metric_name, entries in usage.items() if isinstance(usage, dict) else []:
@@ -935,6 +935,8 @@ def _node_usage_table(metrics: dict[str, Any]) -> list[dict[str, Any]]:
             node = entry.get("node")
             if not isinstance(node, str) or not node:
                 continue
+            if allowed_nodes and node not in allowed_nodes:
+                continue
             per_node.setdefault(node, {})[metric_name] = entry.get("value")
     return [{"node": node, **vals} for node, vals in sorted(per_node.items())]
 
@@ -1139,7 +1141,8 @@ def facts_context(
         if items:
             lines.append(f"- top_restarts_1h: {', '.join(items)}")
 
-    usage_table = _node_usage_table(metrics)
+    allowed_nodes = {node.get("name") for node in inv if isinstance(node, dict) and node.get("name")}
+    usage_table = _node_usage_table(metrics, allowed_nodes=allowed_nodes or None)
     if usage_table:
         lines.append("- node_usage (cpu/ram/net/io):")
         for entry in usage_table:
@@ -3906,6 +3909,31 @@ def _is_quantitative_prompt(prompt: str) -> bool:
     return False
 
 
+def _is_list_prompt(prompt: str) -> bool:
+    q = normalize_query(prompt)
+    if not q:
+        return False
+    if any(phrase in q for phrase in ("list", "names", "name", "show")):
+        return True
+    if any(phrase in q for phrase in ("which nodes", "what nodes", "what are the nodes")):
+        return True
+    return False
+
+
+def _needs_full_fact_pack(prompt: str) -> bool:
+    q = normalize_query(prompt)
+    tokens = set(_tokens(prompt))
+    if _is_quantitative_prompt(prompt) or _is_list_prompt(prompt):
+        return True
+    if tokens & {"workload", "pods", "namespace"}:
+        return True
+    if _NAME_INDEX and tokens & _NAME_INDEX:
+        return True
+    if any(phrase in q for phrase in ("where does", "where is", "where are", "running", "run on", "hosted on", "primary node")):
+        return True
+    return False
+
+
 def _open_ended_fast_single(
     prompt: str,
     *,
@@ -3937,6 +3965,8 @@ def _open_ended_fast_single(
     fallback = _fallback_fact_answer(prompt, context)
     if fallback and (_is_quantitative_prompt(prompt) or not _has_body_lines(reply)):
         reply = fallback
+    if not _has_body_lines(reply):
+        reply = "I don't have enough data in the current snapshot to answer that."
     if state:
         state.update("done", step=_open_ended_total_steps("fast"))
     return _ensure_scores(reply)
@@ -3967,7 +3997,7 @@ def _open_ended_fast(
     )
     selected_meta = _fact_pack_meta(selected_lines)
     selected_pack = _fact_pack_text(selected_lines, selected_meta)
-    if _is_quantitative_prompt(prompt) or not selected_lines:
+    if _needs_full_fact_pack(prompt) or not selected_lines:
         selected_pack = fact_pack
     if state:
         state.total_steps = _open_ended_total_steps("fast")

From 17d144dcb6e84bda9fa3e23d68c1ad05c46dbb6b Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 04:00:13 -0300
Subject: [PATCH 413/416] atlasbot: clean fact labels and non-cluster
 confidence

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 services/comms/scripts/atlasbot/bot.py  | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 79946181..58a55641 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-95
+        checksum/atlasbot-configmap: manual-atlasbot-96
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 357941bd..59a8c2d4 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -2945,6 +2945,7 @@ def _open_ended_system() -> str:
         "If the question is ambiguous, pick a reasonable interpretation and state it briefly. "
         "Avoid repeating the exact same observation as the last response if possible; vary across metrics, workload, or hardware details. "
         "Always include at least one substantive answer sentence before the score lines. "
+        "Do not mention fact IDs or internal labels (e.g., F1/F2) in your response. "
         "When the fact pack includes hottest_cpu/ram/net/io lines, use them to answer hottest/busiest node questions. "
         "When the fact pack includes postgres_hottest_db, use it for questions about the busiest database. "
         "Do not invent numbers or facts. "
@@ -4091,6 +4092,7 @@ def _non_cluster_reply(prompt: str, *, history_lines: list[str], mode: str) -> s
         system_override=system,
         model=model,
     )
+    reply = re.sub(r"\\bconfidence\\s*:\\s*(high|medium|low)\\b\\.?\\s*", "", reply, flags=re.IGNORECASE).strip()
     return _ensure_scores(reply)
 
 

From 9d8b48fbf571052d0a9cd596aa030d1c77904910 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 04:06:24 -0300
Subject: [PATCH 414/416] atlasbot: expand full-pack triggers and strip inline
 confidence

---
 services/comms/atlasbot-deployment.yaml |  2 +-
 services/comms/scripts/atlasbot/bot.py  | 11 +++++++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 58a55641..7001190a 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-96
+        checksum/atlasbot-configmap: manual-atlasbot-97
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 59a8c2d4..6f3581f9 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -3030,7 +3030,14 @@ def _ensure_scores(answer: str) -> str:
         ):
             _record_score("hallucinationrisk", _extract_value(cleaned))
             continue
-        body_lines.append(line)
+        cleaned_body = re.sub(
+            r"\\bconfidence\\s*:\\s*(high|medium|low)\\b\\.?\\s*",
+            "",
+            line,
+            flags=re.IGNORECASE,
+        ).strip()
+        if cleaned_body:
+            body_lines.append(cleaned_body)
 
     confidence = score_map.get("confidence") or "medium"
     relevance = score_map.get("relevance") or "70"
@@ -3926,7 +3933,7 @@ def _needs_full_fact_pack(prompt: str) -> bool:
     tokens = set(_tokens(prompt))
     if _is_quantitative_prompt(prompt) or _is_list_prompt(prompt):
         return True
-    if tokens & {"workload", "pods", "namespace"}:
+    if tokens & {"workload", "pods", "namespace", "worker", "workers"}:
         return True
     if _NAME_INDEX and tokens & _NAME_INDEX:
         return True

From eb074d98583d322446adeb5b6539dac7d76e4c91 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 04:10:31 -0300
Subject: [PATCH 415/416] atlasbot: favor factual fallback in fast mode

---
 services/comms/atlasbot-deployment.yaml | 2 +-
 services/comms/scripts/atlasbot/bot.py  | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 7001190a..187cd6c1 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-97
+        checksum/atlasbot-configmap: manual-atlasbot-98
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 6f3581f9..7fcc066f 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -2948,6 +2948,7 @@ def _open_ended_system() -> str:
         "Do not mention fact IDs or internal labels (e.g., F1/F2) in your response. "
         "When the fact pack includes hottest_cpu/ram/net/io lines, use them to answer hottest/busiest node questions. "
         "When the fact pack includes postgres_hottest_db, use it for questions about the busiest database. "
+        "Do not convert counts into percentages or claim 100% unless a fact explicitly states a percentage. "
         "Do not invent numbers or facts. "
         "End with lines: Confidence, Relevance (0-100), Satisfaction (0-100), HallucinationRisk (low|medium|high)."
     )
@@ -4007,6 +4008,10 @@ def _open_ended_fast(
     selected_pack = _fact_pack_text(selected_lines, selected_meta)
     if _needs_full_fact_pack(prompt) or not selected_lines:
         selected_pack = fact_pack
+    if not subjective and _needs_full_fact_pack(prompt):
+        fallback = _fallback_fact_answer(prompt, fact_pack)
+        if fallback:
+            return _ensure_scores(fallback)
     if state:
         state.total_steps = _open_ended_total_steps("fast")
     return _open_ended_fast_single(

From f75040bacacf2c79c3d938d4db08b23c65ae2fe1 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Wed, 28 Jan 2026 11:02:10 -0300
Subject: [PATCH 416/416] atlasbot: improve fact parsing and fallback answers

---
 services/comms/atlasbot-deployment.yaml |   2 +-
 services/comms/scripts/atlasbot/bot.py  | 227 ++++++++++++++++++++++--
 2 files changed, 210 insertions(+), 19 deletions(-)

diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index 187cd6c1..b65aef08 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-98
+        checksum/atlasbot-configmap: manual-atlasbot-101
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index 7fcc066f..be256c0e 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -260,7 +260,24 @@ def normalize_query(text: str) -> str:
 def _tokens(text: str) -> list[str]:
     cleaned = re.sub(r"[\\_/]", " ", text or "")
     toks = [t.lower() for t in TOKEN_RE.findall(cleaned)]
-    return [t for t in toks if t not in STOPWORDS and len(t) >= 2]
+    expanded: list[str] = []  # base tokens plus hyphen parts, synonyms and singular forms
+    synonyms = {  # alias appended right after any token that equals a key
+        "network": "net",
+        "net": "network",
+        "memory": "ram",
+        "ram": "memory",
+        "i/o": "io",  # NOTE(review): unreachable; "/" becomes a space before tokenizing
+    }
+    for token in toks:
+        expanded.append(token)
+        if "-" in token:
+            expanded.extend(part for part in token.split("-") if part)
+    for token in list(expanded):  # iterate a snapshot: the loop appends to expanded
+        if token in synonyms:
+            expanded.append(synonyms[token])
+        if token.endswith("s") and not token.endswith("ss") and len(token) > 3:
+            expanded.append(token[:-1])  # drop one plural "s"; rstrip("s") ate "ss" too ("class" -> "cla")
+    return [t for t in expanded if t not in STOPWORDS and len(t) >= 2]
 
 
 def _ensure_confidence(text: str) -> str:
@@ -1077,10 +1094,16 @@ def facts_context(
             lines.append(f"- expected_workers_missing: {', '.join(missing)}")
 
     hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {}
+    usage_metrics = metrics.get("node_usage") if isinstance(metrics.get("node_usage"), dict) else {}
     for key in ("cpu", "ram", "net", "io"):
         entry = hottest.get(key) if isinstance(hottest.get(key), dict) else {}
         node = entry.get("node")
         value = entry.get("value")
+        if not node or value is None:
+            usage = usage_metrics.get(key) if isinstance(usage_metrics.get(key), list) else []
+            pick = _node_usage_top(usage, allowed_nodes=None)
+            if pick:
+                node, value = pick
         if node and value is not None:
             value_fmt = _format_metric_value(
                 str(value),
@@ -3001,6 +3024,7 @@ def _ensure_scores(answer: str) -> str:
     def _record_score(key: str, value: str):
         if not value:
             return
+        value = value.strip().rstrip("%")
         score_map.setdefault(key, value)
 
     for line in lines:
@@ -3010,10 +3034,10 @@ def _ensure_scores(answer: str) -> str:
             "confidence" in lowered and "relevance" in lowered and "satisfaction" in lowered
         ):
             for key in ("confidence", "relevance", "satisfaction"):
-                match = re.search(rf"{key}\\s*[:=]?\\s*(\\d{{1,3}}|high|medium|low)", lowered)
+                match = re.search(rf"{key}\s*[:=]?\s*(\d{{1,3}}|high|medium|low)", lowered)
                 if match:
                     _record_score(key, match.group(1))
-            risk_match = re.search(r"hallucination\\s*risk\\s*[:=]?\\s*(low|medium|high)", lowered)
+            risk_match = re.search(r"hallucination\s*risk\s*[:=]?\s*(low|medium|high)", lowered)
             if risk_match:
                 _record_score("hallucinationrisk", risk_match.group(1))
             continue
@@ -3032,11 +3056,18 @@ def _ensure_scores(answer: str) -> str:
             _record_score("hallucinationrisk", _extract_value(cleaned))
             continue
         cleaned_body = re.sub(
-            r"\\bconfidence\\s*:\\s*(high|medium|low)\\b\\.?\\s*",
+            r"\bconfidence\s*:\s*(high|medium|low)\b\.?\s*",
             "",
             line,
             flags=re.IGNORECASE,
         ).strip()
+        cleaned_body = re.sub(
+            r"\bconfident\s*level\s*:\s*(high|medium|low)\b\.?\s*",
+            "",
+            cleaned_body,
+            flags=re.IGNORECASE,
+        ).strip()
+        cleaned_body = re.sub(r"\bF\d+\b", "", cleaned_body).strip()
         if cleaned_body:
             body_lines.append(cleaned_body)
 
@@ -3860,41 +3891,195 @@ def _has_body_lines(answer: str) -> bool:
 
 def _fallback_fact_answer(prompt: str, context: str) -> str:
     facts: list[str] = []
+    parsed_facts: list[tuple[str, str | None, str | None]] = []
+    q = normalize_query(prompt)
+    tokens = set(_tokens(prompt))
     for line in (context or "").splitlines():
         trimmed = line.strip()
-        if not trimmed.startswith("F"):
+        if not trimmed:
             continue
-        match = re.match(r"^F\\d+.*?\\]:\\s*(.*)$", trimmed)
-        if not match:
-            match = re.match(r"^F\\d+:\\s*(.*)$", trimmed)
-        if not match:
-            continue
-        fact = match.group(1).strip()
+        if trimmed.startswith("F"):
+            match = re.match(r"^F\d+.*?\]:\s*(.*)$", trimmed)
+            if not match:
+                match = re.match(r"^F\d+:\s*(.*)$", trimmed)
+            if not match:
+                continue
+            fact = match.group(1).strip()
+        else:
+            if trimmed.lower().startswith("fact pack") or trimmed.lower().startswith("facts"):
+                continue
+            if trimmed.startswith("-"):
+                fact = trimmed.lstrip("-").strip()
+            else:
+                fact = trimmed
         if fact.startswith("-"):
             fact = fact.lstrip("-").strip()
-        if fact:
+        if fact and (":" in fact or "=" in fact):
             facts.append(fact)
+            key_match = re.match(r"^([\w\s/.-]+):\s*(.+)$", fact)
+            if not key_match:
+                key_match = re.match(r"^([\w\s/.-]+)=\s*(.+)$", fact)
+            if key_match:
+                parsed_facts.append((fact, key_match.group(1).strip(), key_match.group(2).strip()))
+            else:
+                parsed_facts.append((fact, None, None))
     if not facts:
         return ""
-    tokens = set(_tokens(prompt))
+
+    def _norm_key(text: str) -> str:
+        return normalize_query(text).replace(" ", "_")
+
+    def _find_value(target: str) -> str | None:
+        for _fact, key, val in parsed_facts:
+            if key and _norm_key(key) == target:
+                return val
+        return None
+
+    def _parse_counts(text: str) -> dict[str, int]:
+        counts: dict[str, int] = {}
+        for part in (text or "").split(","):
+            if "=" not in part:
+                continue
+            k, v = part.split("=", 1)
+            k = k.strip()
+            v = v.strip()
+            if not k or not v:
+                continue
+            try:
+                counts[k] = int(float(v))
+            except ValueError:
+                continue
+        return counts
+
+    def _parse_map(text: str) -> dict[str, str]:
+        mapping: dict[str, str] = {}
+        pattern = re.compile(r"(\w+)\s*=\s*([^=]+?)(?=(?:\s*,\s*\w+\s*=)|$)")
+        for match in pattern.finditer(text or ""):
+            mapping[match.group(1).strip()] = match.group(2).strip().strip(",")
+        return mapping
+
+    list_intent = _is_list_prompt(prompt) or "name" in tokens
+    count_intent = _is_quantitative_prompt(prompt) and ("how many" in q or "count" in tokens or "number" in tokens)
+    hottest_intent = any(word in q for word in ("hottest", "highest", "most", "top", "busiest"))
+    metric = _detect_metric(q)
+    include_hw, _exclude_hw = _detect_hardware_filters(q)
+
+    if hottest_intent and metric in {"cpu", "ram", "net", "io"}:
+        hottest_val = _find_value(f"hottest_{metric}")
+        if hottest_val:
+            return f"Hottest {metric} is {hottest_val}."
+    if hottest_intent and tokens & {"postgres", "database", "db", "connections"}:
+        hottest_db = _find_value("postgres_hottest_db")
+        if hottest_db:
+            return f"Hottest database is {hottest_db}."
+
+    if count_intent and tokens & {"pods", "pod"}:
+        pending = _find_value("pods_pending")
+        failed = _find_value("pods_failed")
+        running = _find_value("pods_running")
+        succeeded = _find_value("pods_succeeded")
+        if "pending" in q and "failed" in q:
+            try:
+                total = float(pending or 0) + float(failed or 0)
+                return f"Pods pending or failed: {total:.0f}."
+            except ValueError:
+                pass
+        if "pending" in q and pending is not None:
+            return f"Pods pending is {pending}."
+        if "failed" in q and failed is not None:
+            return f"Pods failed is {failed}."
+        if "succeeded" in q and succeeded is not None:
+            return f"Pods succeeded is {succeeded}."
+        if "running" in q and running is not None:
+            return f"Pods running is {running}."
+
+    if count_intent and tokens & {"nodes", "node"} and "not ready" in q:
+        nodes_total = _find_value("nodes_total")
+        if nodes_total and "not_ready" in nodes_total:
+            match = re.search(r"not_ready=([0-9.]+)", nodes_total)
+            if match:
+                return f"Not ready nodes: {match.group(1)}."
+
+    if count_intent and include_hw:
+        counts_line = _find_value("nodes_by_hardware_count")
+        if counts_line:
+            counts = _parse_counts(counts_line)
+            for hw in include_hw:
+                if hw in counts:
+                    return f"{hw} nodes: {counts[hw]}."
+        for hw in include_hw:
+            hw_line = _find_value(hw)
+            if hw_line:
+                items = [item.strip() for item in hw_line.split(",") if item.strip()]
+                return f"{hw} nodes: {len(items)}."
+
+    if list_intent and include_hw:
+        if "control" in q:
+            cp_by_hw = _find_value("control_plane_by_hardware")
+            if cp_by_hw:
+                mapping = _parse_map(cp_by_hw)
+                for hw in include_hw:
+                    if hw in mapping:
+                        return f"{hw} control-plane nodes: {mapping[hw]}."
+            cp_nodes = _find_value("control_plane_nodes")
+            if cp_nodes:
+                return f"Control-plane nodes: {cp_nodes}."
+        for hw in include_hw:
+            hw_line = _find_value(hw)
+            if hw_line:
+                return f"{hw} nodes: {hw_line}."
+
+    if list_intent and "control" in q:
+        cp_nodes = _find_value("control_plane_nodes")
+        if cp_nodes:
+            return f"Control-plane nodes: {cp_nodes}."
+
+    preferred = tokens & {
+        "node",
+        "nodes",
+        "pod",
+        "pods",
+        "postgres",
+        "db",
+        "database",
+        "namespace",
+        "workload",
+        "worker",
+        "workers",
+        "cpu",
+        "ram",
+        "memory",
+        "net",
+        "network",
+        "io",
+        "disk",
+        "connection",
+        "connections",
+    }
     best_fact = ""
     best_score = -1
     for fact in facts:
-        key_match = re.match(r"^([A-Za-z0-9_\\-/ ]+):\\s*(.+)$", fact)
+        key_match = re.match(r"^([\w\s/.-]+):\s*(.+)$", fact)
         if not key_match:
-            key_match = re.match(r"^([A-Za-z0-9_\\-/ ]+)=\\s*(.+)$", fact)
+            key_match = re.match(r"^([\w\s/.-]+)=\s*(.+)$", fact)
         key_tokens: set[str] = set()
         if key_match:
             key_tokens = set(_tokens(key_match.group(1)))
         score = len(tokens & set(_tokens(fact))) + 2 * len(tokens & key_tokens)
+        if preferred:
+            score += 3 * len(preferred & key_tokens)
+            if not (preferred & key_tokens):
+                score -= 1
+        if list_intent and key_match and "count" in key_tokens:
+            score -= 3
         if score > best_score:
             best_score = score
             best_fact = fact
     if best_score <= 0:
         return ""
-    key_match = re.match(r"^([A-Za-z0-9_\\-/ ]+):\\s*(.+)$", best_fact)
+    key_match = re.match(r"^([\w\s/.-]+):\s*(.+)$", best_fact)
     if not key_match:
-        key_match = re.match(r"^([A-Za-z0-9_\\-/ ]+)=\\s*(.+)$", best_fact)
+        key_match = re.match(r"^([\w\s/.-]+)=\s*(.+)$", best_fact)
     if key_match:
         key = key_match.group(1).strip().replace("_", " ")
         val = key_match.group(2).strip()
@@ -3936,6 +4121,10 @@ def _needs_full_fact_pack(prompt: str) -> bool:
         return True
     if tokens & {"workload", "pods", "namespace", "worker", "workers"}:
         return True
+    if tokens & {"arch", "architecture", "hardware"}:
+        return True
+    if tokens & METRIC_HINT_WORDS:
+        return True
     if _NAME_INDEX and tokens & _NAME_INDEX:
         return True
     if any(phrase in q for phrase in ("where does", "where is", "where are", "running", "run on", "hosted on", "primary node")):
@@ -4104,7 +4293,7 @@ def _non_cluster_reply(prompt: str, *, history_lines: list[str], mode: str) -> s
         system_override=system,
         model=model,
     )
-    reply = re.sub(r"\\bconfidence\\s*:\\s*(high|medium|low)\\b\\.?\\s*", "", reply, flags=re.IGNORECASE).strip()
+    reply = re.sub(r"\bconfidence\s*:\s*(high|medium|low)\b\.?\s*", "", reply, flags=re.IGNORECASE).strip()
     return _ensure_scores(reply)
 
 
@@ -4405,6 +4594,8 @@ def _is_cluster_query(
         return True
     if any(word in q for word in CLUSTER_HINT_WORDS):
         return True
+    if any(word in q for word in METRIC_HINT_WORDS):
+        return True
     for host_match in HOST_RE.finditer(q):
         host = host_match.group(1).lower()
         if host.endswith("bstein.dev"):