feature/sso-hardening #9
6
.gitignore
vendored
6
.gitignore
vendored
@ -1,2 +1,8 @@
|
||||
*.md
|
||||
!README.md
|
||||
!knowledge/**/*.md
|
||||
!services/comms/knowledge/**/*.md
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
.pytest_cache
|
||||
.venv
|
||||
|
||||
@ -5,8 +5,9 @@ resources:
|
||||
- ../../services/crypto
|
||||
- ../../services/gitea
|
||||
- ../../services/jellyfin
|
||||
- ../../services/jitsi
|
||||
- ../../services/comms
|
||||
- ../../services/monitoring
|
||||
- ../../services/logging
|
||||
- ../../services/pegasus
|
||||
- ../../services/vault
|
||||
- ../../services/bstein-dev-home
|
||||
|
||||
@ -0,0 +1,23 @@
|
||||
# clusters/atlas/flux-system/applications/ai-llm/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: ai-llm
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
path: ./services/ai-llm
|
||||
targetNamespace: ai
|
||||
prune: true
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
namespace: flux-system
|
||||
wait: true
|
||||
healthChecks:
|
||||
- apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: ollama
|
||||
namespace: ai
|
||||
dependsOn:
|
||||
- name: core
|
||||
@ -1,26 +0,0 @@
|
||||
# clusters/atlas/flux-system/applications/ci-demo/image-automation.yaml
|
||||
apiVersion: image.toolkit.fluxcd.io/v1
|
||||
kind: ImageUpdateAutomation
|
||||
metadata:
|
||||
name: ci-demo
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 1m0s
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
namespace: flux-system
|
||||
git:
|
||||
checkout:
|
||||
ref:
|
||||
branch: feature/ci-gitops
|
||||
commit:
|
||||
author:
|
||||
email: ops@bstein.dev
|
||||
name: flux-bot
|
||||
messageTemplate: "chore(ci-demo): apply image updates"
|
||||
push:
|
||||
branch: feature/ci-gitops
|
||||
update:
|
||||
strategy: Setters
|
||||
path: services/ci-demo
|
||||
@ -0,0 +1,17 @@
|
||||
# clusters/atlas/flux-system/applications/communication/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: comms
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
prune: true
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
path: ./services/comms
|
||||
targetNamespace: comms
|
||||
timeout: 2m
|
||||
dependsOn:
|
||||
- name: traefik
|
||||
@ -15,5 +15,6 @@ spec:
|
||||
namespace: flux-system
|
||||
dependsOn:
|
||||
- name: core
|
||||
- name: openldap
|
||||
wait: true
|
||||
timeout: 5m
|
||||
|
||||
@ -16,8 +16,12 @@ spec:
|
||||
- name: helm
|
||||
- name: traefik
|
||||
healthChecks:
|
||||
- apiVersion: helm.toolkit.fluxcd.io/v2
|
||||
kind: HelmRelease
|
||||
- apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: jenkins
|
||||
namespace: jenkins
|
||||
- apiVersion: v1
|
||||
kind: Service
|
||||
name: jenkins
|
||||
namespace: jenkins
|
||||
wait: false
|
||||
|
||||
@ -4,7 +4,8 @@ kind: Kustomization
|
||||
resources:
|
||||
- gitea/kustomization.yaml
|
||||
- vault/kustomization.yaml
|
||||
- jitsi/kustomization.yaml
|
||||
- vaultwarden/kustomization.yaml
|
||||
- comms/kustomization.yaml
|
||||
- crypto/kustomization.yaml
|
||||
- monerod/kustomization.yaml
|
||||
- pegasus/kustomization.yaml
|
||||
@ -16,9 +17,14 @@ resources:
|
||||
- jellyfin/kustomization.yaml
|
||||
- xmr-miner/kustomization.yaml
|
||||
- sui-metrics/kustomization.yaml
|
||||
- openldap/kustomization.yaml
|
||||
- keycloak/kustomization.yaml
|
||||
- oauth2-proxy/kustomization.yaml
|
||||
- mailu/kustomization.yaml
|
||||
- jenkins/kustomization.yaml
|
||||
- ci-demo/kustomization.yaml
|
||||
- ci-demo/image-automation.yaml
|
||||
- ai-llm/kustomization.yaml
|
||||
- nextcloud/kustomization.yaml
|
||||
- nextcloud-mail-sync/kustomization.yaml
|
||||
- postgres/kustomization.yaml
|
||||
- outline/kustomization.yaml
|
||||
- planka/kustomization.yaml
|
||||
|
||||
@ -0,0 +1,17 @@
|
||||
# clusters/atlas/flux-system/applications/nextcloud-mail-sync/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: nextcloud-mail-sync
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
prune: true
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
path: ./services/nextcloud-mail-sync
|
||||
targetNamespace: nextcloud
|
||||
timeout: 2m
|
||||
dependsOn:
|
||||
- name: keycloak
|
||||
@ -0,0 +1,16 @@
|
||||
# clusters/atlas/flux-system/applications/nextcloud/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1beta2
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: nextcloud
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
path: ./services/nextcloud
|
||||
targetNamespace: nextcloud
|
||||
prune: true
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
namespace: flux-system
|
||||
wait: true
|
||||
@ -1,18 +1,18 @@
|
||||
# clusters/atlas/flux-system/applications/jitsi/kustomization.yaml
|
||||
# clusters/atlas/flux-system/applications/openldap/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: jitsi
|
||||
name: openldap
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
path: ./services/jitsi
|
||||
targetNamespace: jitsi
|
||||
prune: true
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
namespace: flux-system
|
||||
path: ./services/openldap
|
||||
targetNamespace: sso
|
||||
dependsOn:
|
||||
- name: core
|
||||
wait: true
|
||||
@ -0,0 +1,28 @@
|
||||
# clusters/atlas/flux-system/applications/outline/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: outline
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
path: ./services/outline
|
||||
prune: true
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
targetNamespace: outline
|
||||
dependsOn:
|
||||
- name: keycloak
|
||||
- name: mailu
|
||||
- name: traefik
|
||||
healthChecks:
|
||||
- apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: outline
|
||||
namespace: outline
|
||||
- apiVersion: v1
|
||||
kind: Service
|
||||
name: outline
|
||||
namespace: outline
|
||||
wait: false
|
||||
@ -0,0 +1,28 @@
|
||||
# clusters/atlas/flux-system/applications/planka/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: planka
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
path: ./services/planka
|
||||
prune: true
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
targetNamespace: planka
|
||||
dependsOn:
|
||||
- name: keycloak
|
||||
- name: mailu
|
||||
- name: traefik
|
||||
healthChecks:
|
||||
- apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: planka
|
||||
namespace: planka
|
||||
- apiVersion: v1
|
||||
kind: Service
|
||||
name: planka
|
||||
namespace: planka
|
||||
wait: false
|
||||
@ -0,0 +1,24 @@
|
||||
# clusters/atlas/flux-system/applications/postgres/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: postgres
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
path: ./services/postgres
|
||||
prune: true
|
||||
force: true
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
targetNamespace: postgres
|
||||
dependsOn:
|
||||
- name: vault
|
||||
- name: vault-csi
|
||||
healthChecks:
|
||||
- apiVersion: apps/v1
|
||||
kind: StatefulSet
|
||||
name: postgres
|
||||
namespace: postgres
|
||||
wait: true
|
||||
@ -0,0 +1,20 @@
|
||||
# clusters/atlas/flux-system/applications/vaultwarden/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: vaultwarden
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
suspend: false
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
namespace: flux-system
|
||||
path: ./services/vaultwarden
|
||||
targetNamespace: vaultwarden
|
||||
prune: true
|
||||
wait: true
|
||||
dependsOn:
|
||||
- name: helm
|
||||
- name: traefik
|
||||
@ -8,7 +8,7 @@ metadata:
|
||||
spec:
|
||||
interval: 1m0s
|
||||
ref:
|
||||
branch: main
|
||||
branch: feature/sso-hardening
|
||||
secretRef:
|
||||
name: flux-system-gitea
|
||||
url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
|
||||
|
||||
@ -4,7 +4,11 @@ kind: Kustomization
|
||||
resources:
|
||||
- core/kustomization.yaml
|
||||
- helm/kustomization.yaml
|
||||
- metallb/kustomization.yaml
|
||||
- traefik/kustomization.yaml
|
||||
- gitops-ui/kustomization.yaml
|
||||
- monitoring/kustomization.yaml
|
||||
- logging/kustomization.yaml
|
||||
- maintenance/kustomization.yaml
|
||||
- longhorn-ui/kustomization.yaml
|
||||
- ../platform/vault-csi/kustomization.yaml
|
||||
|
||||
@ -0,0 +1,14 @@
|
||||
# clusters/atlas/flux-system/platform/logging/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: logging
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
path: ./services/logging
|
||||
prune: true
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
wait: false
|
||||
@ -1,17 +1,14 @@
|
||||
# clusters/atlas/flux-system/applications/ci-demo/kustomization.yaml
|
||||
# clusters/atlas/flux-system/platform/maintenance/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: ci-demo
|
||||
name: maintenance
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 10m
|
||||
path: ./services/ci-demo
|
||||
path: ./services/maintenance
|
||||
prune: true
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
namespace: flux-system
|
||||
dependsOn:
|
||||
- name: core
|
||||
wait: false
|
||||
@ -0,0 +1,16 @@
|
||||
# clusters/atlas/flux-system/platform/metallb/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: metallb
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 30m
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
namespace: flux-system
|
||||
path: ./infrastructure/metallb
|
||||
prune: true
|
||||
wait: true
|
||||
targetNamespace: metallb-system
|
||||
@ -15,4 +15,5 @@ spec:
|
||||
namespace: flux-system
|
||||
dependsOn:
|
||||
- name: core
|
||||
- name: metallb
|
||||
wait: true
|
||||
|
||||
@ -0,0 +1,16 @@
|
||||
# clusters/atlas/flux-system/platform/vault-csi/kustomization.yaml
|
||||
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||
kind: Kustomization
|
||||
metadata:
|
||||
name: vault-csi
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 30m
|
||||
sourceRef:
|
||||
kind: GitRepository
|
||||
name: flux-system
|
||||
namespace: flux-system
|
||||
path: ./infrastructure/vault-csi
|
||||
prune: true
|
||||
wait: true
|
||||
targetNamespace: kube-system
|
||||
@ -5,3 +5,4 @@ resources:
|
||||
- ../../../infrastructure/modules/base
|
||||
- ../../../infrastructure/modules/profiles/atlas-ha
|
||||
- ../../../infrastructure/sources/cert-manager/letsencrypt.yaml
|
||||
- ../../../infrastructure/metallb
|
||||
|
||||
16
dockerfiles/Dockerfile.data-prepper
Normal file
16
dockerfiles/Dockerfile.data-prepper
Normal file
@ -0,0 +1,16 @@
|
||||
FROM --platform=$BUILDPLATFORM opensearchproject/data-prepper:2.8.0 AS source
|
||||
|
||||
FROM --platform=$TARGETPLATFORM eclipse-temurin:17-jre
|
||||
|
||||
ENV DATA_PREPPER_PATH=/usr/share/data-prepper
|
||||
|
||||
RUN useradd -u 10001 -M -U -d / -s /usr/sbin/nologin data_prepper \
|
||||
&& mkdir -p /var/log/data-prepper
|
||||
|
||||
COPY --from=source /usr/share/data-prepper /usr/share/data-prepper
|
||||
|
||||
RUN chown -R 10001:10001 /usr/share/data-prepper /var/log/data-prepper
|
||||
|
||||
USER 10001
|
||||
WORKDIR /usr/share/data-prepper
|
||||
CMD ["bin/data-prepper"]
|
||||
@ -1,5 +1,18 @@
|
||||
# hosts/roles/titan_jh/tasks/main.yaml
|
||||
---
|
||||
- name: Install node exporter
|
||||
ansible.builtin.package:
|
||||
name: prometheus-node-exporter
|
||||
state: present
|
||||
tags: ['jumphost', 'monitoring']
|
||||
|
||||
- name: Enable node exporter
|
||||
ansible.builtin.service:
|
||||
name: prometheus-node-exporter
|
||||
enabled: true
|
||||
state: started
|
||||
tags: ['jumphost', 'monitoring']
|
||||
|
||||
- name: Placeholder for jumphost hardening
|
||||
ansible.builtin.debug:
|
||||
msg: "Harden SSH, manage bastion tooling, and configure audit logging here."
|
||||
|
||||
20
infrastructure/metallb/ippool.yaml
Normal file
20
infrastructure/metallb/ippool.yaml
Normal file
@ -0,0 +1,20 @@
|
||||
# infrastructure/metallb/ippool.yaml
|
||||
apiVersion: metallb.io/v1beta1
|
||||
kind: IPAddressPool
|
||||
metadata:
|
||||
name: communication-pool
|
||||
namespace: metallb-system
|
||||
spec:
|
||||
addresses:
|
||||
- 192.168.22.4-192.168.22.6
|
||||
- 192.168.22.9-192.168.22.9
|
||||
autoAssign: true
|
||||
---
|
||||
apiVersion: metallb.io/v1beta1
|
||||
kind: L2Advertisement
|
||||
metadata:
|
||||
name: communication-adv
|
||||
namespace: metallb-system
|
||||
spec:
|
||||
ipAddressPools:
|
||||
- communication-pool
|
||||
10
infrastructure/metallb/kustomization.yaml
Normal file
10
infrastructure/metallb/kustomization.yaml
Normal file
@ -0,0 +1,10 @@
|
||||
# infrastructure/metallb/kustomization.yaml
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- namespace.yaml
|
||||
- metallb-rendered.yaml
|
||||
- ippool.yaml
|
||||
patchesStrategicMerge:
|
||||
- patches/node-placement.yaml
|
||||
- patches/speaker-loglevel.yaml
|
||||
2411
infrastructure/metallb/metallb-rendered.yaml
Normal file
2411
infrastructure/metallb/metallb-rendered.yaml
Normal file
File diff suppressed because it is too large
Load Diff
5
infrastructure/metallb/namespace.yaml
Normal file
5
infrastructure/metallb/namespace.yaml
Normal file
@ -0,0 +1,5 @@
|
||||
# infrastructure/metallb/namespace.yaml
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: metallb-system
|
||||
27
infrastructure/metallb/patches/node-placement.yaml
Normal file
27
infrastructure/metallb/patches/node-placement.yaml
Normal file
@ -0,0 +1,27 @@
|
||||
# infrastructure/metallb/patches/node-placement.yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: metallb-controller
|
||||
namespace: metallb-system
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: controller
|
||||
args:
|
||||
- --port=7472
|
||||
- --log-level=info
|
||||
- --webhook-mode=enabled
|
||||
- --tls-min-version=VersionTLS12
|
||||
- --lb-class=metallb
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: hardware
|
||||
operator: In
|
||||
values:
|
||||
- rpi4
|
||||
- rpi5
|
||||
15
infrastructure/metallb/patches/speaker-loglevel.yaml
Normal file
15
infrastructure/metallb/patches/speaker-loglevel.yaml
Normal file
@ -0,0 +1,15 @@
|
||||
# infrastructure/metallb/patches/speaker-loglevel.yaml
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: metallb-speaker
|
||||
namespace: metallb-system
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: speaker
|
||||
args:
|
||||
- --port=7472
|
||||
- --log-level=info
|
||||
- --lb-class=metallb
|
||||
@ -2,6 +2,7 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- ../components/device-plugin-config
|
||||
- ../components/device-plugin-jetson
|
||||
- ../components/device-plugin-minipc
|
||||
- ../components/device-plugin-tethys
|
||||
|
||||
@ -0,0 +1,15 @@
|
||||
# infrastructure/modules/profiles/components/device-plugin-config/configmap.yaml
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: nvidia-device-plugin-config
|
||||
namespace: kube-system
|
||||
data:
|
||||
config.yaml: |
|
||||
version: v1
|
||||
sharing:
|
||||
timeSlicing:
|
||||
renameByDefault: true
|
||||
resources:
|
||||
- name: nvidia.com/gpu
|
||||
replicas: 4
|
||||
@ -0,0 +1,5 @@
|
||||
# infrastructure/modules/profiles/components/device-plugin-config/kustomization.yaml
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- configmap.yaml
|
||||
@ -30,7 +30,8 @@ spec:
|
||||
imagePullPolicy: IfNotPresent
|
||||
args:
|
||||
- "--fail-on-init-error=false"
|
||||
- "--device-list-strategy=envvar,cdi"
|
||||
- "--device-list-strategy=envvar"
|
||||
- "--config-file=/config/config.yaml"
|
||||
securityContext:
|
||||
privileged: true
|
||||
env:
|
||||
@ -41,7 +42,12 @@ spec:
|
||||
volumeMounts:
|
||||
- name: device-plugin
|
||||
mountPath: /var/lib/kubelet/device-plugins
|
||||
- name: config
|
||||
mountPath: /config
|
||||
volumes:
|
||||
- name: device-plugin
|
||||
hostPath:
|
||||
path: /var/lib/kubelet/device-plugins
|
||||
- name: config
|
||||
configMap:
|
||||
name: nvidia-device-plugin-config
|
||||
|
||||
@ -32,6 +32,7 @@ spec:
|
||||
- "--fail-on-init-error=false"
|
||||
- "--device-list-strategy=envvar"
|
||||
- "--mig-strategy=none"
|
||||
- "--config-file=/config/config.yaml"
|
||||
securityContext:
|
||||
privileged: true
|
||||
env:
|
||||
@ -42,7 +43,12 @@ spec:
|
||||
volumeMounts:
|
||||
- name: device-plugin
|
||||
mountPath: /var/lib/kubelet/device-plugins
|
||||
- name: config
|
||||
mountPath: /config
|
||||
volumes:
|
||||
- name: device-plugin
|
||||
hostPath:
|
||||
path: /var/lib/kubelet/device-plugins
|
||||
- name: config
|
||||
configMap:
|
||||
name: nvidia-device-plugin-config
|
||||
|
||||
@ -33,6 +33,7 @@ spec:
|
||||
- "--fail-on-init-error=false"
|
||||
- "--device-list-strategy=envvar"
|
||||
- "--mig-strategy=none"
|
||||
- "--config-file=/config/config.yaml"
|
||||
securityContext:
|
||||
privileged: true
|
||||
env:
|
||||
@ -43,7 +44,12 @@ spec:
|
||||
volumeMounts:
|
||||
- name: device-plugin
|
||||
mountPath: /var/lib/kubelet/device-plugins
|
||||
- name: config
|
||||
mountPath: /config
|
||||
volumes:
|
||||
- name: device-plugin
|
||||
hostPath:
|
||||
path: /var/lib/kubelet/device-plugins
|
||||
- name: config
|
||||
configMap:
|
||||
name: nvidia-device-plugin-config
|
||||
|
||||
@ -2,4 +2,5 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- ../components/device-plugin-config
|
||||
- ../components/device-plugin-tethys
|
||||
|
||||
9
infrastructure/sources/helm/fluent-bit.yaml
Normal file
9
infrastructure/sources/helm/fluent-bit.yaml
Normal file
@ -0,0 +1,9 @@
|
||||
# infrastructure/sources/helm/fluent-bit.yaml
|
||||
apiVersion: source.toolkit.fluxcd.io/v1
|
||||
kind: HelmRepository
|
||||
metadata:
|
||||
name: fluent
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 1h
|
||||
url: https://fluent.github.io/helm-charts
|
||||
@ -2,11 +2,15 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- fluent-bit.yaml
|
||||
- grafana.yaml
|
||||
- hashicorp.yaml
|
||||
- jetstack.yaml
|
||||
- jenkins.yaml
|
||||
- mailu.yaml
|
||||
- opentelemetry.yaml
|
||||
- opensearch.yaml
|
||||
- harbor.yaml
|
||||
- prometheus.yaml
|
||||
- victoria-metrics.yaml
|
||||
- secrets-store-csi.yaml
|
||||
|
||||
9
infrastructure/sources/helm/opensearch.yaml
Normal file
9
infrastructure/sources/helm/opensearch.yaml
Normal file
@ -0,0 +1,9 @@
|
||||
# infrastructure/sources/helm/opensearch.yaml
|
||||
apiVersion: source.toolkit.fluxcd.io/v1
|
||||
kind: HelmRepository
|
||||
metadata:
|
||||
name: opensearch
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 1h
|
||||
url: https://opensearch-project.github.io/helm-charts
|
||||
9
infrastructure/sources/helm/opentelemetry.yaml
Normal file
9
infrastructure/sources/helm/opentelemetry.yaml
Normal file
@ -0,0 +1,9 @@
|
||||
# infrastructure/sources/helm/opentelemetry.yaml
|
||||
apiVersion: source.toolkit.fluxcd.io/v1
|
||||
kind: HelmRepository
|
||||
metadata:
|
||||
name: opentelemetry
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 1h
|
||||
url: https://open-telemetry.github.io/opentelemetry-helm-charts
|
||||
9
infrastructure/sources/helm/secrets-store-csi.yaml
Normal file
9
infrastructure/sources/helm/secrets-store-csi.yaml
Normal file
@ -0,0 +1,9 @@
|
||||
# infrastructure/sources/helm/secrets-store-csi.yaml
|
||||
apiVersion: source.toolkit.fluxcd.io/v1
|
||||
kind: HelmRepository
|
||||
metadata:
|
||||
name: secrets-store-csi-driver
|
||||
namespace: flux-system
|
||||
spec:
|
||||
interval: 1h
|
||||
url: https://kubernetes-sigs.github.io/secrets-store-csi-driver/charts
|
||||
@ -71,9 +71,10 @@ rules:
|
||||
- tlsoptions
|
||||
- tlsstores
|
||||
- serverstransports
|
||||
- serverstransporttcps
|
||||
- traefikservices
|
||||
- middlewaretcps
|
||||
verbs:
|
||||
- get
|
||||
- list
|
||||
- watch
|
||||
|
||||
|
||||
@ -10,3 +10,4 @@ resources:
|
||||
- clusterrole.yaml
|
||||
- clusterrolebinding.yaml
|
||||
- service.yaml
|
||||
- traefik-service-lb.yaml
|
||||
|
||||
24
infrastructure/traefik/traefik-service-lb.yaml
Normal file
24
infrastructure/traefik/traefik-service-lb.yaml
Normal file
@ -0,0 +1,24 @@
|
||||
# infrastructure/traefik/traefik-service-lb.yaml
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: traefik
|
||||
namespace: kube-system
|
||||
annotations:
|
||||
metallb.universe.tf/address-pool: communication-pool
|
||||
spec:
|
||||
type: LoadBalancer
|
||||
loadBalancerClass: metallb
|
||||
loadBalancerIP: 192.168.22.9
|
||||
ports:
|
||||
- name: web
|
||||
port: 80
|
||||
targetPort: web
|
||||
protocol: TCP
|
||||
- name: websecure
|
||||
port: 443
|
||||
targetPort: websecure
|
||||
protocol: TCP
|
||||
selector:
|
||||
app.kubernetes.io/instance: traefik-kube-system
|
||||
app.kubernetes.io/name: traefik
|
||||
6
infrastructure/vault-csi/kustomization.yaml
Normal file
6
infrastructure/vault-csi/kustomization.yaml
Normal file
@ -0,0 +1,6 @@
|
||||
# infrastructure/vault-csi/kustomization.yaml
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- secrets-store-csi-driver.yaml
|
||||
- vault-csi-provider.yaml
|
||||
20
infrastructure/vault-csi/secrets-store-csi-driver.yaml
Normal file
20
infrastructure/vault-csi/secrets-store-csi-driver.yaml
Normal file
@ -0,0 +1,20 @@
|
||||
# infrastructure/vault-csi/secrets-store-csi-driver.yaml
|
||||
apiVersion: helm.toolkit.fluxcd.io/v2
|
||||
kind: HelmRelease
|
||||
metadata:
|
||||
name: secrets-store-csi-driver
|
||||
namespace: kube-system
|
||||
spec:
|
||||
interval: 15m
|
||||
chart:
|
||||
spec:
|
||||
chart: secrets-store-csi-driver
|
||||
version: "~1.3.0"
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: secrets-store-csi-driver
|
||||
namespace: flux-system
|
||||
values:
|
||||
syncSecret:
|
||||
enabled: true
|
||||
enableSecretRotation: false
|
||||
111
infrastructure/vault-csi/vault-csi-provider.yaml
Normal file
111
infrastructure/vault-csi/vault-csi-provider.yaml
Normal file
@ -0,0 +1,111 @@
|
||||
# infrastructure/vault-csi/vault-csi-provider.yaml
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: vault-csi-provider
|
||||
namespace: kube-system
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: vault-csi-provider-clusterrole
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["serviceaccounts/token"]
|
||||
verbs: ["create"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: vault-csi-provider-clusterrolebinding
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: vault-csi-provider-clusterrole
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: vault-csi-provider
|
||||
namespace: kube-system
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
name: vault-csi-provider-role
|
||||
namespace: kube-system
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["secrets"]
|
||||
verbs: ["get"]
|
||||
resourceNames: ["vault-csi-provider-hmac-key"]
|
||||
- apiGroups: [""]
|
||||
resources: ["secrets"]
|
||||
verbs: ["create"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: vault-csi-provider-rolebinding
|
||||
namespace: kube-system
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: Role
|
||||
name: vault-csi-provider-role
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: vault-csi-provider
|
||||
namespace: kube-system
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: vault-csi-provider
|
||||
namespace: kube-system
|
||||
labels: { app.kubernetes.io/name: vault-csi-provider }
|
||||
spec:
|
||||
updateStrategy:
|
||||
type: RollingUpdate
|
||||
selector:
|
||||
matchLabels: { app.kubernetes.io/name: vault-csi-provider }
|
||||
template:
|
||||
metadata:
|
||||
labels: { app.kubernetes.io/name: vault-csi-provider }
|
||||
spec:
|
||||
serviceAccountName: vault-csi-provider
|
||||
containers:
|
||||
- name: provider-vault-installer
|
||||
image: hashicorp/vault-csi-provider:1.7.0
|
||||
imagePullPolicy: IfNotPresent
|
||||
args:
|
||||
- -endpoint=/provider/vault.sock
|
||||
- -log-level=info
|
||||
resources:
|
||||
requests: { cpu: 50m, memory: 100Mi }
|
||||
limits: { cpu: 50m, memory: 100Mi }
|
||||
volumeMounts:
|
||||
- { name: providervol, mountPath: "/provider" }
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: "/health/ready"
|
||||
port: 8080
|
||||
scheme: "HTTP"
|
||||
failureThreshold: 2
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
successThreshold: 1
|
||||
timeoutSeconds: 3
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: "/health/ready"
|
||||
port: 8080
|
||||
scheme: "HTTP"
|
||||
failureThreshold: 2
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
successThreshold: 1
|
||||
timeoutSeconds: 3
|
||||
volumes:
|
||||
- name: providervol
|
||||
hostPath:
|
||||
path: "/var/run/secrets-store-csi-providers"
|
||||
nodeSelector:
|
||||
kubernetes.io/os: linux
|
||||
22
knowledge/INDEX.md
Normal file
22
knowledge/INDEX.md
Normal file
@ -0,0 +1,22 @@
|
||||
Atlas Knowledge Base (KB)
|
||||
|
||||
This folder is the source-of-truth “memory” for Atlas/Titan assistants (and for humans). It is designed to be:
|
||||
- Accurate (grounded in GitOps + read-only cluster tools)
|
||||
- Maintainable (small docs + deterministic generators)
|
||||
- Safe (no secrets; refer to Secret/Vault paths by name only)
|
||||
|
||||
Layout
|
||||
- `knowledge/runbooks/`: human-written docs (short, chunkable Markdown).
|
||||
- `knowledge/catalog/`: generated machine-readable facts (YAML/JSON).
|
||||
- `knowledge/diagrams/`: generated Mermaid diagrams (`.mmd`) derived from the catalog.
|
||||
|
||||
Regeneration
|
||||
- Update manifests/docs, then regenerate generated artifacts:
|
||||
- `python scripts/knowledge_render_atlas.py --write`
|
||||
|
||||
Authoring rules
|
||||
- Never include secret values. Prefer `secretRef` names or Vault paths like `kv/atlas/...`.
|
||||
- Prefer stable identifiers: Kubernetes `namespace/name`, DNS hostnames, Flux kustomization paths.
|
||||
- Keep each runbook small; one topic per file; use headings.
|
||||
- When in doubt, link to the exact file path in this repo that configures the behavior.
|
||||
|
||||
8
knowledge/catalog/atlas-summary.json
Normal file
8
knowledge/catalog/atlas-summary.json
Normal file
@ -0,0 +1,8 @@
|
||||
{
|
||||
"counts": {
|
||||
"helmrelease_host_hints": 7,
|
||||
"http_endpoints": 35,
|
||||
"services": 44,
|
||||
"workloads": 49
|
||||
}
|
||||
}
|
||||
2771
knowledge/catalog/atlas.json
Normal file
2771
knowledge/catalog/atlas.json
Normal file
File diff suppressed because it is too large
Load Diff
1786
knowledge/catalog/atlas.yaml
Normal file
1786
knowledge/catalog/atlas.yaml
Normal file
File diff suppressed because it is too large
Load Diff
89
knowledge/catalog/runbooks.json
Normal file
89
knowledge/catalog/runbooks.json
Normal file
@ -0,0 +1,89 @@
|
||||
[
|
||||
{
|
||||
"path": "runbooks/ci-gitea-jenkins.md",
|
||||
"title": "CI: Gitea \u2192 Jenkins pipeline",
|
||||
"tags": [
|
||||
"atlas",
|
||||
"ci",
|
||||
"gitea",
|
||||
"jenkins"
|
||||
],
|
||||
"entrypoints": [
|
||||
"scm.bstein.dev",
|
||||
"ci.bstein.dev"
|
||||
],
|
||||
"source_paths": [
|
||||
"services/gitea",
|
||||
"services/jenkins",
|
||||
"scripts/jenkins_cred_sync.sh",
|
||||
"scripts/gitea_cred_sync.sh"
|
||||
],
|
||||
"body": "# CI: Gitea \u2192 Jenkins pipeline\n\n## What this is\nAtlas uses Gitea for source control and Jenkins for CI. Authentication is via Keycloak (SSO).\n\n## Where it is configured\n- Gitea manifests: `services/gitea/`\n- Jenkins manifests: `services/jenkins/`\n- Credential sync helpers: `scripts/gitea_cred_sync.sh`, `scripts/jenkins_cred_sync.sh`\n\n## What users do (typical flow)\n- Create a repo in Gitea.\n- Create/update a Jenkins job/pipeline that can fetch the repo.\n- Configure a webhook (or SCM polling) so pushes trigger builds.\n\n## Troubleshooting (common)\n- \u201cWebhook not firing\u201d: confirm ingress host, webhook URL, and Jenkins job is reachable.\n- \u201cAuth denied cloning\u201d: confirm Keycloak group membership and that Jenkins has a valid token/credential configured."
|
||||
},
|
||||
{
|
||||
"path": "runbooks/comms-verify.md",
|
||||
"title": "Othrys verification checklist",
|
||||
"tags": [
|
||||
"comms",
|
||||
"matrix",
|
||||
"element",
|
||||
"livekit"
|
||||
],
|
||||
"entrypoints": [
|
||||
"https://live.bstein.dev",
|
||||
"https://matrix.live.bstein.dev"
|
||||
],
|
||||
"source_paths": [],
|
||||
"body": "1) Guest join:\n- Open a private window and visit:\n `https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join`\n- Confirm the guest join flow works and the displayname becomes `<word>-<word>`.\n\n2) Keycloak login:\n- Log in from `https://live.bstein.dev` and confirm MAS -> Keycloak -> Element redirect.\n\n3) Video rooms:\n- Start an Element Call room and confirm audio/video with a second account.\n- Check that guests can read public rooms but cannot start calls.\n\n4) Well-known:\n- `https://live.bstein.dev/.well-known/matrix/client` returns JSON.\n- `https://matrix.live.bstein.dev/.well-known/matrix/client` returns JSON.\n\n5) TURN reachability:\n- Confirm `turn.live.bstein.dev:3478` and `turns:5349` are reachable from WAN."
|
||||
},
|
||||
{
|
||||
"path": "runbooks/kb-authoring.md",
|
||||
"title": "KB authoring: what to write (and what not to)",
|
||||
"tags": [
|
||||
"atlas",
|
||||
"kb",
|
||||
"runbooks"
|
||||
],
|
||||
"entrypoints": [],
|
||||
"source_paths": [
|
||||
"knowledge/runbooks",
|
||||
"scripts/knowledge_render_atlas.py"
|
||||
],
|
||||
"body": "# KB authoring: what to write (and what not to)\n\n## The goal\nGive Atlas assistants enough grounded, Atlas-specific context to answer \u201chow do I\u2026?\u201d questions without guessing.\n\n## What to capture (high value)\n- User workflows: \u201cclick here, set X, expected result\u201d\n- Operator workflows: \u201cedit these files, reconcile this kustomization, verify with these commands\u201d\n- Wiring: \u201cthis host routes to this service; this service depends on Postgres/Vault/etc\u201d\n- Failure modes: exact error messages + the 2\u20135 checks that usually resolve them\n- Permissions: Keycloak groups/roles and what they unlock\n\n## What to avoid (low value / fluff)\n- Generic Kubernetes explanations (link to upstream docs instead)\n- Copy-pasting large manifests (prefer file paths + small snippets)\n- Anything that will drift quickly (render it from GitOps instead)\n- Any secret values (reference Secret/Vault locations by name only)\n\n## Document pattern (recommended)\nEach runbook should answer:\n- \u201cWhat is this?\u201d\n- \u201cWhat do users do?\u201d\n- \u201cWhat do operators change (where in Git)?\u201d\n- \u201cHow do we verify it works?\u201d\n- \u201cWhat breaks and how to debug it?\u201d"
|
||||
},
|
||||
{
|
||||
"path": "runbooks/observability.md",
|
||||
"title": "Observability: Grafana + VictoriaMetrics (how to query safely)",
|
||||
"tags": [
|
||||
"atlas",
|
||||
"monitoring",
|
||||
"grafana",
|
||||
"victoriametrics"
|
||||
],
|
||||
"entrypoints": [
|
||||
"metrics.bstein.dev",
|
||||
"alerts.bstein.dev"
|
||||
],
|
||||
"source_paths": [
|
||||
"services/monitoring"
|
||||
],
|
||||
"body": "# Observability: Grafana + VictoriaMetrics (how to query safely)\n\n## Where it is configured\n- `services/monitoring/helmrelease.yaml` (Grafana + Alertmanager + VM values)\n- `services/monitoring/grafana-dashboard-*.yaml` (dashboards and their PromQL)\n\n## Using metrics as a \u201ctool\u201d for Atlas assistants\nThe safest pattern is: map a small set of intents \u2192 fixed PromQL queries, then summarize results.\n\nExamples (intents)\n- \u201cIs the cluster healthy?\u201d \u2192 node readiness + pod restart rate\n- \u201cWhy is Element Call failing?\u201d \u2192 LiveKit/coturn pod restarts + synapse errors + ingress 5xx\n- \u201cIs Jenkins slow?\u201d \u2192 pod CPU/memory + HTTP latency metrics (if exported)\n\n## Why dashboards are not the KB\nDashboards are great references, but the assistant should query VictoriaMetrics directly for live answers and keep the\nKB focused on wiring, runbooks, and stable conventions."
|
||||
},
|
||||
{
|
||||
"path": "runbooks/template.md",
|
||||
"title": "<short title>",
|
||||
"tags": [
|
||||
"atlas",
|
||||
"<service>",
|
||||
"<topic>"
|
||||
],
|
||||
"entrypoints": [
|
||||
"<hostnames if relevant>"
|
||||
],
|
||||
"source_paths": [
|
||||
"services/<svc>",
|
||||
"clusters/atlas/<...>"
|
||||
],
|
||||
"body": "# <Short title>\n\n## What this is\n\n## For users (how to)\n\n## For operators (where configured)\n\n## Troubleshooting (symptoms \u2192 checks)"
|
||||
}
|
||||
]
|
||||
189
knowledge/diagrams/atlas-http.mmd
Normal file
189
knowledge/diagrams/atlas-http.mmd
Normal file
@ -0,0 +1,189 @@
|
||||
flowchart LR
|
||||
host_auth_bstein_dev["auth.bstein.dev"]
|
||||
svc_sso_oauth2_proxy["sso/oauth2-proxy (Service)"]
|
||||
host_auth_bstein_dev --> svc_sso_oauth2_proxy
|
||||
wl_sso_oauth2_proxy["sso/oauth2-proxy (Deployment)"]
|
||||
svc_sso_oauth2_proxy --> wl_sso_oauth2_proxy
|
||||
host_bstein_dev["bstein.dev"]
|
||||
svc_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Service)"]
|
||||
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_frontend
|
||||
wl_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Deployment)"]
|
||||
svc_bstein_dev_home_bstein_dev_home_frontend --> wl_bstein_dev_home_bstein_dev_home_frontend
|
||||
svc_comms_matrix_wellknown["comms/matrix-wellknown (Service)"]
|
||||
host_bstein_dev --> svc_comms_matrix_wellknown
|
||||
wl_comms_matrix_wellknown["comms/matrix-wellknown (Deployment)"]
|
||||
svc_comms_matrix_wellknown --> wl_comms_matrix_wellknown
|
||||
svc_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Service)"]
|
||||
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend
|
||||
wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"]
|
||||
svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend
|
||||
host_call_live_bstein_dev["call.live.bstein.dev"]
|
||||
svc_comms_element_call["comms/element-call (Service)"]
|
||||
host_call_live_bstein_dev --> svc_comms_element_call
|
||||
wl_comms_element_call["comms/element-call (Deployment)"]
|
||||
svc_comms_element_call --> wl_comms_element_call
|
||||
host_chat_ai_bstein_dev["chat.ai.bstein.dev"]
|
||||
svc_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Service)"]
|
||||
host_chat_ai_bstein_dev --> svc_bstein_dev_home_chat_ai_gateway
|
||||
wl_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Deployment)"]
|
||||
svc_bstein_dev_home_chat_ai_gateway --> wl_bstein_dev_home_chat_ai_gateway
|
||||
host_ci_bstein_dev["ci.bstein.dev"]
|
||||
svc_jenkins_jenkins["jenkins/jenkins (Service)"]
|
||||
host_ci_bstein_dev --> svc_jenkins_jenkins
|
||||
wl_jenkins_jenkins["jenkins/jenkins (Deployment)"]
|
||||
svc_jenkins_jenkins --> wl_jenkins_jenkins
|
||||
host_cloud_bstein_dev["cloud.bstein.dev"]
|
||||
svc_nextcloud_nextcloud["nextcloud/nextcloud (Service)"]
|
||||
host_cloud_bstein_dev --> svc_nextcloud_nextcloud
|
||||
wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"]
|
||||
svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud
|
||||
host_kit_live_bstein_dev["kit.live.bstein.dev"]
|
||||
svc_comms_livekit_token_service["comms/livekit-token-service (Service)"]
|
||||
host_kit_live_bstein_dev --> svc_comms_livekit_token_service
|
||||
wl_comms_livekit_token_service["comms/livekit-token-service (Deployment)"]
|
||||
svc_comms_livekit_token_service --> wl_comms_livekit_token_service
|
||||
svc_comms_livekit["comms/livekit (Service)"]
|
||||
host_kit_live_bstein_dev --> svc_comms_livekit
|
||||
wl_comms_livekit["comms/livekit (Deployment)"]
|
||||
svc_comms_livekit --> wl_comms_livekit
|
||||
host_live_bstein_dev["live.bstein.dev"]
|
||||
svc_comms_othrys_element_element_web["comms/othrys-element-element-web (Service)"]
|
||||
host_live_bstein_dev --> svc_comms_othrys_element_element_web
|
||||
wl_comms_othrys_element_element_web["comms/othrys-element-element-web (Deployment)"]
|
||||
svc_comms_othrys_element_element_web --> wl_comms_othrys_element_element_web
|
||||
host_live_bstein_dev --> svc_comms_matrix_wellknown
|
||||
svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"]
|
||||
host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
|
||||
wl_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Deployment)"]
|
||||
svc_comms_othrys_synapse_matrix_synapse --> wl_comms_othrys_synapse_matrix_synapse
|
||||
host_longhorn_bstein_dev["longhorn.bstein.dev"]
|
||||
svc_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Service)"]
|
||||
host_longhorn_bstein_dev --> svc_longhorn_system_oauth2_proxy_longhorn
|
||||
wl_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Deployment)"]
|
||||
svc_longhorn_system_oauth2_proxy_longhorn --> wl_longhorn_system_oauth2_proxy_longhorn
|
||||
host_mail_bstein_dev["mail.bstein.dev"]
|
||||
svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"]
|
||||
host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front
|
||||
host_matrix_live_bstein_dev["matrix.live.bstein.dev"]
|
||||
svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
|
||||
host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service
|
||||
wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
|
||||
svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
|
||||
host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown
|
||||
host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
|
||||
svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
|
||||
host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register
|
||||
wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
|
||||
svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
|
||||
host_monero_bstein_dev["monero.bstein.dev"]
|
||||
svc_crypto_monerod["crypto/monerod (Service)"]
|
||||
host_monero_bstein_dev --> svc_crypto_monerod
|
||||
wl_crypto_monerod["crypto/monerod (Deployment)"]
|
||||
svc_crypto_monerod --> wl_crypto_monerod
|
||||
host_office_bstein_dev["office.bstein.dev"]
|
||||
svc_nextcloud_collabora["nextcloud/collabora (Service)"]
|
||||
host_office_bstein_dev --> svc_nextcloud_collabora
|
||||
wl_nextcloud_collabora["nextcloud/collabora (Deployment)"]
|
||||
svc_nextcloud_collabora --> wl_nextcloud_collabora
|
||||
host_pegasus_bstein_dev["pegasus.bstein.dev"]
|
||||
svc_jellyfin_pegasus["jellyfin/pegasus (Service)"]
|
||||
host_pegasus_bstein_dev --> svc_jellyfin_pegasus
|
||||
wl_jellyfin_pegasus["jellyfin/pegasus (Deployment)"]
|
||||
svc_jellyfin_pegasus --> wl_jellyfin_pegasus
|
||||
host_scm_bstein_dev["scm.bstein.dev"]
|
||||
svc_gitea_gitea["gitea/gitea (Service)"]
|
||||
host_scm_bstein_dev --> svc_gitea_gitea
|
||||
wl_gitea_gitea["gitea/gitea (Deployment)"]
|
||||
svc_gitea_gitea --> wl_gitea_gitea
|
||||
host_secret_bstein_dev["secret.bstein.dev"]
|
||||
svc_vault_vault["vault/vault (Service)"]
|
||||
host_secret_bstein_dev --> svc_vault_vault
|
||||
wl_vault_vault["vault/vault (StatefulSet)"]
|
||||
svc_vault_vault --> wl_vault_vault
|
||||
host_sso_bstein_dev["sso.bstein.dev"]
|
||||
svc_sso_keycloak["sso/keycloak (Service)"]
|
||||
host_sso_bstein_dev --> svc_sso_keycloak
|
||||
wl_sso_keycloak["sso/keycloak (Deployment)"]
|
||||
svc_sso_keycloak --> wl_sso_keycloak
|
||||
host_stream_bstein_dev["stream.bstein.dev"]
|
||||
svc_jellyfin_jellyfin["jellyfin/jellyfin (Service)"]
|
||||
host_stream_bstein_dev --> svc_jellyfin_jellyfin
|
||||
wl_jellyfin_jellyfin["jellyfin/jellyfin (Deployment)"]
|
||||
svc_jellyfin_jellyfin --> wl_jellyfin_jellyfin
|
||||
host_vault_bstein_dev["vault.bstein.dev"]
|
||||
svc_vaultwarden_vaultwarden_service["vaultwarden/vaultwarden-service (Service)"]
|
||||
host_vault_bstein_dev --> svc_vaultwarden_vaultwarden_service
|
||||
wl_vaultwarden_vaultwarden["vaultwarden/vaultwarden (Deployment)"]
|
||||
svc_vaultwarden_vaultwarden_service --> wl_vaultwarden_vaultwarden
|
||||
|
||||
subgraph bstein_dev_home[bstein-dev-home]
|
||||
svc_bstein_dev_home_bstein_dev_home_frontend
|
||||
wl_bstein_dev_home_bstein_dev_home_frontend
|
||||
svc_bstein_dev_home_bstein_dev_home_backend
|
||||
wl_bstein_dev_home_bstein_dev_home_backend
|
||||
svc_bstein_dev_home_chat_ai_gateway
|
||||
wl_bstein_dev_home_chat_ai_gateway
|
||||
end
|
||||
subgraph comms[comms]
|
||||
svc_comms_matrix_wellknown
|
||||
wl_comms_matrix_wellknown
|
||||
svc_comms_element_call
|
||||
wl_comms_element_call
|
||||
svc_comms_livekit_token_service
|
||||
wl_comms_livekit_token_service
|
||||
svc_comms_livekit
|
||||
wl_comms_livekit
|
||||
svc_comms_othrys_element_element_web
|
||||
wl_comms_othrys_element_element_web
|
||||
svc_comms_othrys_synapse_matrix_synapse
|
||||
wl_comms_othrys_synapse_matrix_synapse
|
||||
svc_comms_matrix_authentication_service
|
||||
wl_comms_matrix_authentication_service
|
||||
svc_comms_matrix_guest_register
|
||||
wl_comms_matrix_guest_register
|
||||
end
|
||||
subgraph crypto[crypto]
|
||||
svc_crypto_monerod
|
||||
wl_crypto_monerod
|
||||
end
|
||||
subgraph gitea[gitea]
|
||||
svc_gitea_gitea
|
||||
wl_gitea_gitea
|
||||
end
|
||||
subgraph jellyfin[jellyfin]
|
||||
svc_jellyfin_pegasus
|
||||
wl_jellyfin_pegasus
|
||||
svc_jellyfin_jellyfin
|
||||
wl_jellyfin_jellyfin
|
||||
end
|
||||
subgraph jenkins[jenkins]
|
||||
svc_jenkins_jenkins
|
||||
wl_jenkins_jenkins
|
||||
end
|
||||
subgraph longhorn_system[longhorn-system]
|
||||
svc_longhorn_system_oauth2_proxy_longhorn
|
||||
wl_longhorn_system_oauth2_proxy_longhorn
|
||||
end
|
||||
subgraph mailu_mailserver[mailu-mailserver]
|
||||
svc_mailu_mailserver_mailu_front
|
||||
end
|
||||
subgraph nextcloud[nextcloud]
|
||||
svc_nextcloud_nextcloud
|
||||
wl_nextcloud_nextcloud
|
||||
svc_nextcloud_collabora
|
||||
wl_nextcloud_collabora
|
||||
end
|
||||
subgraph sso[sso]
|
||||
svc_sso_oauth2_proxy
|
||||
wl_sso_oauth2_proxy
|
||||
svc_sso_keycloak
|
||||
wl_sso_keycloak
|
||||
end
|
||||
subgraph vault[vault]
|
||||
svc_vault_vault
|
||||
wl_vault_vault
|
||||
end
|
||||
subgraph vaultwarden[vaultwarden]
|
||||
svc_vaultwarden_vaultwarden_service
|
||||
wl_vaultwarden_vaultwarden
|
||||
end
|
||||
26
knowledge/metis.md
Normal file
26
knowledge/metis.md
Normal file
@ -0,0 +1,26 @@
|
||||
# Metis (node recovery)
|
||||
|
||||
## Node classes (current map)
|
||||
- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)
|
||||
- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)
|
||||
- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks)
|
||||
- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent)
|
||||
- amd64 agents: titan-22/24 (Debian 13, k3s agent)
|
||||
- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, future titan-20/21 (when added), plus any newcomers.
|
||||
|
||||
## Longhorn disk UUIDs (critical nodes)
|
||||
- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4)
|
||||
- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4)
|
||||
- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4)
|
||||
- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4)
|
||||
|
||||
## Metis repo (~/Development/metis)
|
||||
- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`).
|
||||
- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints).
|
||||
- `AGENTS.md` in repo is untracked and holds raw notes.
|
||||
|
||||
## Next implementation steps
|
||||
- Add per-class golden image refs and checksums (Harbor or file://) when ready.
|
||||
- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths.
|
||||
- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection.
|
||||
- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited.
|
||||
27
knowledge/runbooks/ci-gitea-jenkins.md
Normal file
27
knowledge/runbooks/ci-gitea-jenkins.md
Normal file
@ -0,0 +1,27 @@
|
||||
---
|
||||
title: "CI: Gitea → Jenkins pipeline"
|
||||
tags: ["atlas", "ci", "gitea", "jenkins"]
|
||||
owners: ["brad"]
|
||||
entrypoints: ["scm.bstein.dev", "ci.bstein.dev"]
|
||||
source_paths: ["services/gitea", "services/jenkins", "scripts/jenkins_cred_sync.sh", "scripts/gitea_cred_sync.sh"]
|
||||
---
|
||||
|
||||
# CI: Gitea → Jenkins pipeline
|
||||
|
||||
## What this is
|
||||
Atlas uses Gitea for source control and Jenkins for CI. Authentication is via Keycloak (SSO).
|
||||
|
||||
## Where it is configured
|
||||
- Gitea manifests: `services/gitea/`
|
||||
- Jenkins manifests: `services/jenkins/`
|
||||
- Credential sync helpers: `scripts/gitea_cred_sync.sh`, `scripts/jenkins_cred_sync.sh`
|
||||
|
||||
## What users do (typical flow)
|
||||
- Create a repo in Gitea.
|
||||
- Create/update a Jenkins job/pipeline that can fetch the repo.
|
||||
- Configure a webhook (or SCM polling) so pushes trigger builds.
|
||||
|
||||
## Troubleshooting (common)
|
||||
- “Webhook not firing”: confirm ingress host, webhook URL, and Jenkins job is reachable.
|
||||
- “Auth denied cloning”: confirm Keycloak group membership and that Jenkins has a valid token/credential configured.
|
||||
|
||||
30
knowledge/runbooks/comms-verify.md
Normal file
30
knowledge/runbooks/comms-verify.md
Normal file
@ -0,0 +1,30 @@
|
||||
---
|
||||
title: Othrys verification checklist
|
||||
tags:
|
||||
- comms
|
||||
- matrix
|
||||
- element
|
||||
- livekit
|
||||
entrypoints:
|
||||
- https://live.bstein.dev
|
||||
- https://matrix.live.bstein.dev
|
||||
---
|
||||
|
||||
1) Guest join:
|
||||
- Open a private window and visit:
|
||||
`https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join`
|
||||
- Confirm the guest join flow works and the displayname becomes `<word>-<word>`.
|
||||
|
||||
2) Keycloak login:
|
||||
- Log in from `https://live.bstein.dev` and confirm MAS -> Keycloak -> Element redirect.
|
||||
|
||||
3) Video rooms:
|
||||
- Start an Element Call room and confirm audio/video with a second account.
|
||||
- Check that guests can read public rooms but cannot start calls.
|
||||
|
||||
4) Well-known:
|
||||
- `https://live.bstein.dev/.well-known/matrix/client` returns JSON.
|
||||
- `https://matrix.live.bstein.dev/.well-known/matrix/client` returns JSON.
|
||||
|
||||
5) TURN reachability:
|
||||
- Confirm `turn.live.bstein.dev:3478` and `turns:5349` are reachable from WAN.
|
||||
34
knowledge/runbooks/kb-authoring.md
Normal file
34
knowledge/runbooks/kb-authoring.md
Normal file
@ -0,0 +1,34 @@
|
||||
---
|
||||
title: "KB authoring: what to write (and what not to)"
|
||||
tags: ["atlas", "kb", "runbooks"]
|
||||
owners: ["brad"]
|
||||
entrypoints: []
|
||||
source_paths: ["knowledge/runbooks", "scripts/knowledge_render_atlas.py"]
|
||||
---
|
||||
|
||||
# KB authoring: what to write (and what not to)
|
||||
|
||||
## The goal
|
||||
Give Atlas assistants enough grounded, Atlas-specific context to answer “how do I…?” questions without guessing.
|
||||
|
||||
## What to capture (high value)
|
||||
- User workflows: “click here, set X, expected result”
|
||||
- Operator workflows: “edit these files, reconcile this kustomization, verify with these commands”
|
||||
- Wiring: “this host routes to this service; this service depends on Postgres/Vault/etc”
|
||||
- Failure modes: exact error messages + the 2–5 checks that usually resolve them
|
||||
- Permissions: Keycloak groups/roles and what they unlock
|
||||
|
||||
## What to avoid (low value / fluff)
|
||||
- Generic Kubernetes explanations (link to upstream docs instead)
|
||||
- Copy-pasting large manifests (prefer file paths + small snippets)
|
||||
- Anything that will drift quickly (render it from GitOps instead)
|
||||
- Any secret values (reference Secret/Vault locations by name only)
|
||||
|
||||
## Document pattern (recommended)
|
||||
Each runbook should answer:
|
||||
- “What is this?”
|
||||
- “What do users do?”
|
||||
- “What do operators change (where in Git)?”
|
||||
- “How do we verify it works?”
|
||||
- “What breaks and how to debug it?”
|
||||
|
||||
26
knowledge/runbooks/observability.md
Normal file
26
knowledge/runbooks/observability.md
Normal file
@ -0,0 +1,26 @@
|
||||
---
|
||||
title: "Observability: Grafana + VictoriaMetrics (how to query safely)"
|
||||
tags: ["atlas", "monitoring", "grafana", "victoriametrics"]
|
||||
owners: ["brad"]
|
||||
entrypoints: ["metrics.bstein.dev", "alerts.bstein.dev"]
|
||||
source_paths: ["services/monitoring"]
|
||||
---
|
||||
|
||||
# Observability: Grafana + VictoriaMetrics (how to query safely)
|
||||
|
||||
## Where it is configured
|
||||
- `services/monitoring/helmrelease.yaml` (Grafana + Alertmanager + VM values)
|
||||
- `services/monitoring/grafana-dashboard-*.yaml` (dashboards and their PromQL)
|
||||
|
||||
## Using metrics as a “tool” for Atlas assistants
|
||||
The safest pattern is: map a small set of intents → fixed PromQL queries, then summarize results.
|
||||
|
||||
Examples (intents)
|
||||
- “Is the cluster healthy?” → node readiness + pod restart rate
|
||||
- “Why is Element Call failing?” → LiveKit/coturn pod restarts + synapse errors + ingress 5xx
|
||||
- “Is Jenkins slow?” → pod CPU/memory + HTTP latency metrics (if exported)
|
||||
|
||||
## Why dashboards are not the KB
|
||||
Dashboards are great references, but the assistant should query VictoriaMetrics directly for live answers and keep the
|
||||
KB focused on wiring, runbooks, and stable conventions.
|
||||
|
||||
18
knowledge/runbooks/template.md
Normal file
18
knowledge/runbooks/template.md
Normal file
@ -0,0 +1,18 @@
|
||||
---
|
||||
title: "<short title>"
|
||||
tags: ["atlas", "<service>", "<topic>"]
|
||||
owners: ["brad"]
|
||||
entrypoints: ["<hostnames if relevant>"]
|
||||
source_paths: ["services/<svc>", "clusters/atlas/<...>"]
|
||||
---
|
||||
|
||||
# <Short title>
|
||||
|
||||
## What this is
|
||||
|
||||
## For users (how to)
|
||||
|
||||
## For operators (where configured)
|
||||
|
||||
## Troubleshooting (symptoms → checks)
|
||||
|
||||
73
knowledge/software/metis.md
Normal file
73
knowledge/software/metis.md
Normal file
@ -0,0 +1,73 @@
|
||||
# Metis (node recovery)
|
||||
|
||||
## Node classes (current map)
|
||||
- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)
|
||||
- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)
|
||||
- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks)
|
||||
- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent)
|
||||
- amd64 agents: titan-22/24 (Debian 13, k3s agent)
|
||||
- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, plus any newcomers.
|
||||
|
||||
### Jetson nodes (titan-20/21)
|
||||
- Ubuntu 20.04.6 (Focal), kernel 5.10.104-tegra, CRI containerd 2.0.5-k3s2, arch arm64.
|
||||
- Storage: NVMe 232G at / (ext4); onboard mmc partitions present but root on NVMe; 1.9T sda present (unused).
|
||||
- k3s agent with drop-in 99-nofile.conf.
|
||||
|
||||
## Longhorn disk UUIDs (critical nodes)
|
||||
- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4)
|
||||
- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4)
|
||||
- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4)
|
||||
- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4)
|
||||
|
||||
## Metis repo (~/Development/metis)
|
||||
- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`).
|
||||
- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints).
|
||||
- `AGENTS.md` in repo is untracked and holds raw notes.
|
||||
|
||||
## Next implementation steps
|
||||
- Add per-class golden image refs and checksums (Harbor or file://) when ready.
|
||||
- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths.
|
||||
- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection.
|
||||
- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited.
|
||||
|
||||
## Node OS/Kernel/CRI snapshot (Jan 2026)
|
||||
- titan-04: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-05: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-06: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-07: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-08: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-09: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-0a: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-0b: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-0c: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-10: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-11: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-12: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
|
||||
- titan-13: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
|
||||
- titan-14: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
|
||||
- titan-15: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
|
||||
- titan-17: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
|
||||
- titan-18: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
|
||||
- titan-19: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
|
||||
- titan-20: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-21: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64
|
||||
- titan-22: Debian 13 (trixie), kernel 6.12.41+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64
|
||||
- titan-24: Debian 13 (trixie), kernel 6.12.57+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64
|
||||
|
||||
|
||||
### External hosts
|
||||
- titan-db: Ubuntu 24.10, kernel 6.11.0-1015-raspi, root on /dev/sda2 ext4 (465G), boot vfat /dev/sda1; PostgreSQL service enabled.
|
||||
- titan-jh: Arch Linux ARM (rolling), kernel 6.18.4-2-rpi, NVMe root ext4 238G (/), boot vfat 512M; ~495 packages installed (pacman -Q).
|
||||
- titan-23/oceanus: TODO audit (future).
|
||||
|
||||
|
||||
### Control plane Pis (titan-0a/0b/0c)
|
||||
- Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, containerd 2.0.5-k3s2.
|
||||
- Storage: 477G SSD root (/dev/sda2 ext4), /boot/firmware vfat (/dev/sda1). fstab uses LABEL=writable and LABEL=system-boot.
|
||||
- k3s server (control-plane taint expected); etcd snapshots not yet cataloged (TODO).
|
||||
|
||||
|
||||
## k3s versions
|
||||
- rpi5 workers/control-plane: k3s v1.33.3+k3s1 (crictl v1.31.0-k3s2)
|
||||
- rpi4 nodes: k3s v1.31.5+k3s1 (crictl v1.31.0-k3s2)
|
||||
- Jetson titan-20/21: k3s v1.33.3+k3s1 (per node info), crictl v1.31.0-k3s2
|
||||
5
scripts/comms_sync_kb.sh
Executable file
5
scripts/comms_sync_kb.sh
Executable file
@ -0,0 +1,5 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
python scripts/knowledge_render_atlas.py --write
|
||||
python scripts/knowledge_render_atlas.py --write --out services/comms/knowledge
|
||||
@ -9,6 +9,7 @@ Usage:
|
||||
import argparse
|
||||
import json
|
||||
import textwrap
|
||||
import urllib.parse
|
||||
from pathlib import Path
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@ -45,12 +46,14 @@ PERCENT_THRESHOLDS = {
|
||||
],
|
||||
}
|
||||
|
||||
NAMESPACE_CPU_WINDOW = "1m"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cluster metadata
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
CONTROL_PLANE_NODES = ["titan-0a", "titan-0b", "titan-0c"]
|
||||
CONTROL_DEPENDENCIES = ["titan-db"]
|
||||
CONTROL_DEPENDENCIES = ["titan-db", "titan-jh"]
|
||||
CONTROL_ALL = CONTROL_PLANE_NODES + CONTROL_DEPENDENCIES
|
||||
WORKER_NODES = [
|
||||
"titan-04",
|
||||
@ -61,11 +64,12 @@ WORKER_NODES = [
|
||||
"titan-09",
|
||||
"titan-10",
|
||||
"titan-11",
|
||||
"titan-20",
|
||||
"titan-21",
|
||||
"titan-12",
|
||||
"titan-13",
|
||||
"titan-14",
|
||||
"titan-15",
|
||||
"titan-16",
|
||||
"titan-17",
|
||||
"titan-18",
|
||||
"titan-19",
|
||||
@ -80,7 +84,22 @@ CONTROL_TOTAL = len(CONTROL_PLANE_NODES)
|
||||
WORKER_TOTAL = len(WORKER_NODES)
|
||||
CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
|
||||
WORKER_SUFFIX = f"/{WORKER_TOTAL}"
|
||||
CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system"
|
||||
# Namespaces considered infrastructure (excluded from workload counts)
|
||||
INFRA_NAMESPACES = [
|
||||
"kube-system",
|
||||
"longhorn-system",
|
||||
"metallb-system",
|
||||
"monitoring",
|
||||
"logging",
|
||||
"cert-manager",
|
||||
"flux-system",
|
||||
"traefik",
|
||||
"maintenance",
|
||||
"postgres",
|
||||
]
|
||||
INFRA_REGEX = f"^({'|'.join(INFRA_NAMESPACES)})$"
|
||||
# Namespaces allowed on control plane without counting as workloads
|
||||
CP_ALLOWED_NS = INFRA_REGEX
|
||||
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
|
||||
GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4]
|
||||
CONTROL_WORKLOADS_EXPR = (
|
||||
@ -170,22 +189,48 @@ def node_io_expr(scope=""):
|
||||
return scoped_node_expr(base, scope)
|
||||
|
||||
|
||||
def namespace_selector(scope_var):
|
||||
return f'namespace!="",pod!="",container!="",container!="POD",{scope_var}'
|
||||
|
||||
|
||||
def namespace_gpu_selector(scope_var):
|
||||
return f'namespace!="",pod!="",{scope_var}'
|
||||
|
||||
|
||||
def namespace_cpu_raw(scope_var):
|
||||
return (
|
||||
"sum(rate(container_cpu_usage_seconds_total"
|
||||
f"{{{namespace_selector(scope_var)}}}[{NAMESPACE_CPU_WINDOW}])) by (namespace)"
|
||||
)
|
||||
|
||||
|
||||
def namespace_ram_raw(scope_var):
|
||||
return f"sum(container_memory_working_set_bytes{{{namespace_selector(scope_var)}}}) by (namespace)"
|
||||
|
||||
|
||||
def namespace_gpu_usage_instant(scope_var):
|
||||
return f"sum(DCGM_FI_DEV_GPU_UTIL{{{namespace_gpu_selector(scope_var)}}}) by (namespace)"
|
||||
|
||||
|
||||
def namespace_share_expr(resource_expr):
|
||||
selected = f"( {resource_expr} ) and on(namespace) ( {NAMESPACE_TOP_FILTER} )"
|
||||
total = f"clamp_min(sum( {selected} ), 1)"
|
||||
return f"100 * ( {selected} ) / {total}"
|
||||
total = f"clamp_min(sum( {resource_expr} ), 1)"
|
||||
return f"100 * ( {resource_expr} ) / {total}"
|
||||
|
||||
|
||||
def namespace_cpu_share_expr():
|
||||
return namespace_share_expr(NAMESPACE_CPU_RAW)
|
||||
def namespace_cpu_share_expr(scope_var):
|
||||
return namespace_share_expr(namespace_cpu_raw(scope_var))
|
||||
|
||||
|
||||
def namespace_ram_share_expr():
|
||||
return namespace_share_expr(NAMESPACE_RAM_RAW)
|
||||
def namespace_ram_share_expr(scope_var):
|
||||
return namespace_share_expr(namespace_ram_raw(scope_var))
|
||||
|
||||
|
||||
def namespace_gpu_share_expr():
|
||||
return namespace_share_expr(NAMESPACE_GPU_RAW)
|
||||
def namespace_gpu_share_expr(scope_var):
|
||||
usage = namespace_gpu_usage_instant(scope_var)
|
||||
total = f"(sum({usage}) or on() vector(0))"
|
||||
share = f"100 * ({usage}) / clamp_min({total}, 1)"
|
||||
idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)"
|
||||
return f"({share}) or ({idle})"
|
||||
|
||||
|
||||
PROBLEM_PODS_EXPR = (
|
||||
@ -270,46 +315,12 @@ STUCK_TABLE_EXPR = (
|
||||
")"
|
||||
)
|
||||
|
||||
NAMESPACE_CPU_RAW = (
|
||||
'sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)'
|
||||
)
|
||||
NAMESPACE_RAM_RAW = (
|
||||
'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)'
|
||||
)
|
||||
NAMESPACE_SCOPE_WORKLOAD = f'namespace!~"{INFRA_REGEX}"'
|
||||
NAMESPACE_SCOPE_ALL = 'namespace=~".*"'
|
||||
NAMESPACE_SCOPE_INFRA = f'namespace=~"{INFRA_REGEX}"'
|
||||
NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"]
|
||||
GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
|
||||
GPU_NODE_REGEX = "|".join(GPU_NODES)
|
||||
NAMESPACE_GPU_ALLOC = (
|
||||
'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}'
|
||||
' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
|
||||
)
|
||||
NAMESPACE_GPU_USAGE_SHARE = (
|
||||
'sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))'
|
||||
)
|
||||
NAMESPACE_GPU_USAGE_INSTANT = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)'
|
||||
NAMESPACE_GPU_RAW = (
|
||||
"("
|
||||
+ NAMESPACE_GPU_USAGE_SHARE
|
||||
+ ") or on(namespace) ("
|
||||
+ NAMESPACE_CPU_RAW
|
||||
+ " * 0)"
|
||||
)
|
||||
NAMESPACE_GPU_WEIGHT = (
|
||||
"("
|
||||
+ NAMESPACE_GPU_ALLOC
|
||||
+ ") or on(namespace) ("
|
||||
+ NAMESPACE_CPU_RAW
|
||||
+ " * 0)"
|
||||
)
|
||||
NAMESPACE_ACTIVITY_SCORE = (
|
||||
"( "
|
||||
+ NAMESPACE_CPU_RAW
|
||||
+ " ) + ("
|
||||
+ NAMESPACE_RAM_RAW
|
||||
+ " / 1e9) + ("
|
||||
+ NAMESPACE_GPU_WEIGHT
|
||||
+ " * 100)"
|
||||
)
|
||||
NAMESPACE_TOP_FILTER = "(topk(10, " + NAMESPACE_ACTIVITY_SCORE + ") >= bool 0)"
|
||||
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
|
||||
TRAEFIK_NET_INGRESS = (
|
||||
'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
|
||||
@ -560,9 +571,9 @@ def table_panel(
|
||||
return panel
|
||||
|
||||
|
||||
def pie_panel(panel_id, title, expr, grid):
|
||||
def pie_panel(panel_id, title, expr, grid, *, links=None, description=None):
|
||||
"""Return a pie chart panel with readable namespace labels."""
|
||||
return {
|
||||
panel = {
|
||||
"id": panel_id,
|
||||
"type": "piechart",
|
||||
"title": title,
|
||||
@ -586,6 +597,71 @@ def pie_panel(panel_id, title, expr, grid):
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
|
||||
},
|
||||
}
|
||||
if links:
|
||||
panel["links"] = links
|
||||
if description:
|
||||
panel["description"] = description
|
||||
return panel
|
||||
|
||||
|
||||
def namespace_scope_variable(var_name, label):
|
||||
options = [
|
||||
{
|
||||
"text": "workload namespaces only",
|
||||
"value": NAMESPACE_SCOPE_WORKLOAD,
|
||||
"selected": True,
|
||||
},
|
||||
{"text": "all namespaces", "value": NAMESPACE_SCOPE_ALL, "selected": False},
|
||||
{
|
||||
"text": "infrastructure namespaces only",
|
||||
"value": NAMESPACE_SCOPE_INFRA,
|
||||
"selected": False,
|
||||
},
|
||||
]
|
||||
query = (
|
||||
"workload namespaces only : "
|
||||
+ NAMESPACE_SCOPE_WORKLOAD
|
||||
+ ",all namespaces : "
|
||||
+ NAMESPACE_SCOPE_ALL
|
||||
+ ",infrastructure namespaces only : "
|
||||
+ NAMESPACE_SCOPE_INFRA
|
||||
)
|
||||
return {
|
||||
"name": var_name,
|
||||
"label": label,
|
||||
"type": "custom",
|
||||
"query": query,
|
||||
"current": {"text": options[0]["text"], "value": options[0]["value"], "selected": True},
|
||||
"options": options,
|
||||
"hide": 2,
|
||||
"multi": False,
|
||||
"includeAll": False,
|
||||
"refresh": 1,
|
||||
"sort": 0,
|
||||
"skipUrlSync": False,
|
||||
}
|
||||
|
||||
|
||||
def namespace_scope_links(var_name):
|
||||
def with_value(value):
|
||||
encoded = urllib.parse.quote(value, safe="")
|
||||
params = []
|
||||
for other in NAMESPACE_SCOPE_VARS:
|
||||
if other == var_name:
|
||||
params.append(f"var-{other}={encoded}")
|
||||
else:
|
||||
params.append(f"var-{other}=${{{other}}}")
|
||||
return "?" + "&".join(params)
|
||||
|
||||
return [
|
||||
{"title": "Workload namespaces only", "url": with_value(NAMESPACE_SCOPE_WORKLOAD), "targetBlank": False},
|
||||
{"title": "All namespaces", "url": with_value(NAMESPACE_SCOPE_ALL), "targetBlank": False},
|
||||
{
|
||||
"title": "Infrastructure namespaces only",
|
||||
"url": with_value(NAMESPACE_SCOPE_INFRA),
|
||||
"targetBlank": False,
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def bargauge_panel(
|
||||
@ -857,6 +933,115 @@ def build_overview():
|
||||
)
|
||||
)
|
||||
|
||||
mail_bounce_rate_thresholds = {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": None},
|
||||
{"color": "yellow", "value": 5},
|
||||
{"color": "orange", "value": 8},
|
||||
{"color": "red", "value": 10},
|
||||
],
|
||||
}
|
||||
mail_limit_thresholds = {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": None},
|
||||
{"color": "yellow", "value": 70},
|
||||
{"color": "orange", "value": 85},
|
||||
{"color": "red", "value": 95},
|
||||
],
|
||||
}
|
||||
mail_success_thresholds = {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "red", "value": None},
|
||||
{"color": "orange", "value": 90},
|
||||
{"color": "yellow", "value": 95},
|
||||
{"color": "green", "value": 98},
|
||||
],
|
||||
}
|
||||
panels.append(
|
||||
stat_panel(
|
||||
30,
|
||||
"Mail Sent (1d)",
|
||||
'max(postmark_outbound_sent{window="1d"})',
|
||||
{"h": 2, "w": 6, "x": 0, "y": 8},
|
||||
unit="none",
|
||||
links=link_to("atlas-mail"),
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
{
|
||||
"id": 31,
|
||||
"type": "stat",
|
||||
"title": "Mail Bounces (1d)",
|
||||
"datasource": PROM_DS,
|
||||
"gridPos": {"h": 2, "w": 6, "x": 12, "y": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": 'max(postmark_outbound_bounce_rate{window="1d"})',
|
||||
"refId": "A",
|
||||
"legendFormat": "Rate",
|
||||
},
|
||||
{
|
||||
"expr": 'max(postmark_outbound_bounced{window="1d"})',
|
||||
"refId": "B",
|
||||
"legendFormat": "Count",
|
||||
},
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"custom": {"displayMode": "auto"},
|
||||
"thresholds": mail_bounce_rate_thresholds,
|
||||
"unit": "none",
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {"id": "byName", "options": "Rate"},
|
||||
"properties": [{"id": "unit", "value": "percent"}],
|
||||
},
|
||||
{
|
||||
"matcher": {"id": "byName", "options": "Count"},
|
||||
"properties": [{"id": "unit", "value": "none"}],
|
||||
},
|
||||
],
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
|
||||
"textMode": "name_and_value",
|
||||
},
|
||||
"links": link_to("atlas-mail"),
|
||||
}
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
32,
|
||||
"Mail Success Rate (1d)",
|
||||
'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)',
|
||||
{"h": 2, "w": 6, "x": 6, "y": 8},
|
||||
unit="percent",
|
||||
thresholds=mail_success_thresholds,
|
||||
decimals=1,
|
||||
links=link_to("atlas-mail"),
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
33,
|
||||
"Mail Limit Used (30d)",
|
||||
"max(postmark_sending_limit_used_percent)",
|
||||
{"h": 2, "w": 6, "x": 18, "y": 8},
|
||||
unit="percent",
|
||||
thresholds=mail_limit_thresholds,
|
||||
decimals=1,
|
||||
links=link_to("atlas-mail"),
|
||||
)
|
||||
)
|
||||
|
||||
storage_panels = [
|
||||
(23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"),
|
||||
(24, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), "percent"),
|
||||
@ -876,28 +1061,38 @@ def build_overview():
|
||||
)
|
||||
)
|
||||
|
||||
cpu_scope = "$namespace_scope_cpu"
|
||||
gpu_scope = "$namespace_scope_gpu"
|
||||
ram_scope = "$namespace_scope_ram"
|
||||
|
||||
panels.append(
|
||||
pie_panel(
|
||||
11,
|
||||
"Namespace CPU Share",
|
||||
namespace_cpu_share_expr(),
|
||||
namespace_cpu_share_expr(cpu_scope),
|
||||
{"h": 9, "w": 8, "x": 0, "y": 16},
|
||||
links=namespace_scope_links("namespace_scope_cpu"),
|
||||
description="Values are normalized within the selected scope; use panel links to switch scope.",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
pie_panel(
|
||||
12,
|
||||
"Namespace GPU Share",
|
||||
namespace_gpu_share_expr(),
|
||||
namespace_gpu_share_expr(gpu_scope),
|
||||
{"h": 9, "w": 8, "x": 8, "y": 16},
|
||||
links=namespace_scope_links("namespace_scope_gpu"),
|
||||
description="Values are normalized within the selected scope; use panel links to switch scope.",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
pie_panel(
|
||||
13,
|
||||
"Namespace RAM Share",
|
||||
namespace_ram_share_expr(),
|
||||
namespace_ram_share_expr(ram_scope),
|
||||
{"h": 9, "w": 8, "x": 16, "y": 16},
|
||||
links=namespace_scope_links("namespace_scope_ram"),
|
||||
description="Values are normalized within the selected scope; use panel links to switch scope.",
|
||||
)
|
||||
)
|
||||
|
||||
@ -1052,7 +1247,6 @@ def build_overview():
|
||||
links=link_to("atlas-storage"),
|
||||
)
|
||||
)
|
||||
|
||||
return {
|
||||
"uid": "atlas-overview",
|
||||
"title": "Atlas Overview",
|
||||
@ -1063,7 +1257,13 @@ def build_overview():
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": ["atlas", "overview"],
|
||||
"templating": {"list": []},
|
||||
"templating": {
|
||||
"list": [
|
||||
namespace_scope_variable("namespace_scope_cpu", "CPU namespace filter"),
|
||||
namespace_scope_variable("namespace_scope_gpu", "GPU namespace filter"),
|
||||
namespace_scope_variable("namespace_scope_ram", "RAM namespace filter"),
|
||||
]
|
||||
},
|
||||
"time": {"from": "now-1h", "to": "now"},
|
||||
"refresh": "1m",
|
||||
"links": [],
|
||||
@ -1513,6 +1713,33 @@ def build_storage_dashboard():
|
||||
time_from="90d",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
30,
|
||||
"Maintenance Sweepers Ready",
|
||||
'kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"} * 100',
|
||||
{"h": 4, "w": 12, "x": 0, "y": 44},
|
||||
unit="percent",
|
||||
thresholds=PERCENT_THRESHOLDS,
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
31,
|
||||
"Maintenance Cron Freshness (s)",
|
||||
'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob=~"image-sweeper|grafana-smtp-sync"})',
|
||||
{"h": 4, "w": 12, "x": 12, "y": 44},
|
||||
unit="s",
|
||||
thresholds={
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": None},
|
||||
{"color": "yellow", "value": 3600},
|
||||
{"color": "red", "value": 10800},
|
||||
],
|
||||
},
|
||||
)
|
||||
)
|
||||
return {
|
||||
"uid": "atlas-storage",
|
||||
"title": "Atlas Storage",
|
||||
@ -1702,21 +1929,231 @@ def build_network_dashboard():
|
||||
}
|
||||
|
||||
|
||||
def build_mail_dashboard():
|
||||
panels = []
|
||||
|
||||
bounce_rate_thresholds = {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": None},
|
||||
{"color": "yellow", "value": 5},
|
||||
{"color": "orange", "value": 8},
|
||||
{"color": "red", "value": 10},
|
||||
],
|
||||
}
|
||||
limit_thresholds = {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": None},
|
||||
{"color": "yellow", "value": 70},
|
||||
{"color": "orange", "value": 85},
|
||||
{"color": "red", "value": 95},
|
||||
],
|
||||
}
|
||||
success_thresholds = {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "red", "value": None},
|
||||
{"color": "orange", "value": 90},
|
||||
{"color": "yellow", "value": 95},
|
||||
{"color": "green", "value": 98},
|
||||
],
|
||||
}
|
||||
|
||||
panels.append(
|
||||
stat_panel(
|
||||
1,
|
||||
"Sent (1d)",
|
||||
'max(postmark_outbound_sent{window="1d"})',
|
||||
{"h": 4, "w": 6, "x": 0, "y": 0},
|
||||
decimals=0,
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
2,
|
||||
"Sent (7d)",
|
||||
'max(postmark_outbound_sent{window="7d"})',
|
||||
{"h": 4, "w": 6, "x": 6, "y": 0},
|
||||
decimals=0,
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "Mail Bounces (1d)",
|
||||
"datasource": PROM_DS,
|
||||
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 0},
|
||||
"targets": [
|
||||
{
|
||||
"expr": 'max(postmark_outbound_bounce_rate{window="1d"})',
|
||||
"refId": "A",
|
||||
"legendFormat": "Rate",
|
||||
},
|
||||
{
|
||||
"expr": 'max(postmark_outbound_bounced{window="1d"})',
|
||||
"refId": "B",
|
||||
"legendFormat": "Count",
|
||||
},
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"custom": {"displayMode": "auto"},
|
||||
"thresholds": bounce_rate_thresholds,
|
||||
"unit": "none",
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {"id": "byName", "options": "Rate"},
|
||||
"properties": [{"id": "unit", "value": "percent"}],
|
||||
},
|
||||
{
|
||||
"matcher": {"id": "byName", "options": "Count"},
|
||||
"properties": [{"id": "unit", "value": "none"}],
|
||||
},
|
||||
],
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
|
||||
"textMode": "name_and_value",
|
||||
},
|
||||
}
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
4,
|
||||
"Success Rate (1d)",
|
||||
'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)',
|
||||
{"h": 4, "w": 6, "x": 18, "y": 0},
|
||||
unit="percent",
|
||||
thresholds=success_thresholds,
|
||||
decimals=1,
|
||||
)
|
||||
)
|
||||
|
||||
panels.append(
|
||||
stat_panel(
|
||||
5,
|
||||
"Limit Used (30d)",
|
||||
"max(postmark_sending_limit_used_percent)",
|
||||
{"h": 4, "w": 6, "x": 0, "y": 4},
|
||||
thresholds=limit_thresholds,
|
||||
unit="percent",
|
||||
decimals=1,
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
6,
|
||||
"Send Limit (30d)",
|
||||
"max(postmark_sending_limit)",
|
||||
{"h": 4, "w": 6, "x": 6, "y": 4},
|
||||
decimals=0,
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
7,
|
||||
"Last Success",
|
||||
"max(postmark_last_success_timestamp_seconds)",
|
||||
{"h": 4, "w": 6, "x": 12, "y": 4},
|
||||
unit="dateTimeAsIso",
|
||||
decimals=0,
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
8,
|
||||
"Exporter Errors",
|
||||
"sum(postmark_request_errors_total)",
|
||||
{"h": 4, "w": 6, "x": 18, "y": 4},
|
||||
decimals=0,
|
||||
)
|
||||
)
|
||||
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
13,
|
||||
"Bounce Rate (1d vs 7d)",
|
||||
"max by (window) (postmark_outbound_bounce_rate)",
|
||||
{"h": 8, "w": 12, "x": 0, "y": 12},
|
||||
unit="percent",
|
||||
legend="{{window}}",
|
||||
legend_display="table",
|
||||
legend_placement="right",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
14,
|
||||
"Bounced (1d vs 7d)",
|
||||
"max by (window) (postmark_outbound_bounced)",
|
||||
{"h": 8, "w": 12, "x": 12, "y": 12},
|
||||
unit="none",
|
||||
legend="{{window}}",
|
||||
legend_display="table",
|
||||
legend_placement="right",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
15,
|
||||
"Sent (1d vs 7d)",
|
||||
"max by (window) (postmark_outbound_sent)",
|
||||
{"h": 8, "w": 12, "x": 0, "y": 20},
|
||||
unit="none",
|
||||
legend="{{window}}",
|
||||
legend_display="table",
|
||||
legend_placement="right",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
16,
|
||||
"Exporter Errors",
|
||||
"sum(postmark_request_errors_total)",
|
||||
{"h": 8, "w": 12, "x": 12, "y": 20},
|
||||
unit="none",
|
||||
)
|
||||
)
|
||||
|
||||
return {
|
||||
"uid": "atlas-mail",
|
||||
"title": "Atlas Mail",
|
||||
"folderUid": PRIVATE_FOLDER,
|
||||
"editable": True,
|
||||
"panels": panels,
|
||||
"time": {"from": "now-30d", "to": "now"},
|
||||
"annotations": {"list": []},
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": ["atlas", "mail"],
|
||||
}
|
||||
|
||||
|
||||
def build_gpu_dashboard():
|
||||
panels = []
|
||||
gpu_scope = "$namespace_scope_gpu"
|
||||
panels.append(
|
||||
pie_panel(
|
||||
1,
|
||||
"Namespace GPU Share",
|
||||
namespace_gpu_share_expr(),
|
||||
namespace_gpu_share_expr(gpu_scope),
|
||||
{"h": 8, "w": 12, "x": 0, "y": 0},
|
||||
links=namespace_scope_links("namespace_scope_gpu"),
|
||||
description="Values are normalized within the selected scope; use panel links to switch scope.",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
2,
|
||||
"GPU Util by Namespace",
|
||||
NAMESPACE_GPU_USAGE_INSTANT,
|
||||
namespace_gpu_usage_instant(gpu_scope),
|
||||
{"h": 8, "w": 12, "x": 12, "y": 0},
|
||||
unit="percent",
|
||||
legend="{{namespace}}",
|
||||
@ -1757,6 +2194,13 @@ def build_gpu_dashboard():
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": ["atlas", "gpu"],
|
||||
"templating": {
|
||||
"list": [
|
||||
namespace_scope_variable("namespace_scope_cpu", "CPU namespace filter"),
|
||||
namespace_scope_variable("namespace_scope_gpu", "GPU namespace filter"),
|
||||
namespace_scope_variable("namespace_scope_ram", "RAM namespace filter"),
|
||||
]
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@ -1781,6 +2225,10 @@ DASHBOARDS = {
|
||||
"builder": build_network_dashboard,
|
||||
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-network.yaml",
|
||||
},
|
||||
"atlas-mail": {
|
||||
"builder": build_mail_dashboard,
|
||||
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-mail.yaml",
|
||||
},
|
||||
"atlas-gpu": {
|
||||
"builder": build_gpu_dashboard,
|
||||
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gpu.yaml",
|
||||
|
||||
445
scripts/dashboards_render_logs.py
Executable file
445
scripts/dashboards_render_logs.py
Executable file
@ -0,0 +1,445 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generate OpenSearch Dashboards saved objects and render them into ConfigMaps.
|
||||
|
||||
Usage:
|
||||
scripts/dashboards_render_logs.py --build # rebuild NDJSON + ConfigMap
|
||||
scripts/dashboards_render_logs.py # re-render ConfigMap from NDJSON
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import textwrap
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[1]
|
||||
DASHBOARD_DIR = ROOT / "services" / "logging" / "dashboards"
|
||||
NDJSON_PATH = DASHBOARD_DIR / "logs.ndjson"
|
||||
CONFIG_PATH = ROOT / "services" / "logging" / "opensearch-dashboards-objects.yaml"
|
||||
|
||||
CONFIG_TEMPLATE = textwrap.dedent(
|
||||
"""# {relative_path}
|
||||
# Generated by scripts/dashboards_render_logs.py --build
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: opensearch-dashboards-objects
|
||||
namespace: logging
|
||||
data:
|
||||
objects.ndjson: |
|
||||
{payload}
|
||||
"""
|
||||
)
|
||||
|
||||
DASHBOARD_VERSION = "7.10.0"
|
||||
GRID_COLUMNS = 48
|
||||
H_CHART = 10
|
||||
H_ERRORS = 8
|
||||
H_TABLE = 16
|
||||
H_SEARCH = 18
|
||||
TABLE_SIZE = 15
|
||||
TABLE_PER_PAGE = 15
|
||||
|
||||
ERROR_TERMS = ("*error*", "*exception*", "*fail*")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class AppSpec:
|
||||
slug: str
|
||||
title: str
|
||||
query: str
|
||||
index_id: str = "kube-logs"
|
||||
kind: str = "kube"
|
||||
|
||||
|
||||
def error_query(base: str | None = None) -> str:
|
||||
parts = [f'(log : "{term}" or message : "{term}")' for term in ERROR_TERMS]
|
||||
expr = " or ".join(parts)
|
||||
if base:
|
||||
return f"({base}) and ({expr})"
|
||||
return f"({expr})"
|
||||
|
||||
|
||||
def json_line(obj: dict) -> str:
|
||||
return json.dumps(obj, separators=(",", ":"))
|
||||
|
||||
|
||||
def search_source(query: str) -> dict:
|
||||
return {
|
||||
"query": {"language": "kuery", "query": query},
|
||||
"filter": [],
|
||||
"indexRefName": "kibanaSavedObjectMeta.searchSourceJSON.index",
|
||||
}
|
||||
|
||||
|
||||
def index_pattern(object_id: str, title: str, time_field: str = "@timestamp") -> dict:
|
||||
return {
|
||||
"type": "index-pattern",
|
||||
"id": object_id,
|
||||
"attributes": {"title": title, "timeFieldName": time_field},
|
||||
}
|
||||
|
||||
|
||||
def histogram_vis(object_id: str, title: str, query: str, index_id: str) -> dict:
|
||||
vis_state = {
|
||||
"title": title,
|
||||
"type": "histogram",
|
||||
"aggs": [
|
||||
{"id": "1", "enabled": True, "type": "count", "schema": "metric"},
|
||||
{
|
||||
"id": "2",
|
||||
"enabled": True,
|
||||
"type": "date_histogram",
|
||||
"schema": "segment",
|
||||
"params": {"field": "@timestamp", "interval": "auto", "min_doc_count": 1},
|
||||
},
|
||||
],
|
||||
"params": {"addTooltip": True, "addLegend": False, "scale": "linear", "interpolate": "linear"},
|
||||
}
|
||||
return {
|
||||
"type": "visualization",
|
||||
"id": object_id,
|
||||
"attributes": {
|
||||
"title": title,
|
||||
"visState": json.dumps(vis_state, separators=(",", ":")),
|
||||
"uiStateJSON": "{}",
|
||||
"description": "",
|
||||
"version": 1,
|
||||
"kibanaSavedObjectMeta": {
|
||||
"searchSourceJSON": json.dumps(search_source(query), separators=(",", ":"))
|
||||
},
|
||||
},
|
||||
"references": [
|
||||
{
|
||||
"name": "kibanaSavedObjectMeta.searchSourceJSON.index",
|
||||
"type": "index-pattern",
|
||||
"id": index_id,
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def table_vis(object_id: str, title: str, field: str, query: str, index_id: str) -> dict:
|
||||
vis_state = {
|
||||
"title": title,
|
||||
"type": "table",
|
||||
"aggs": [
|
||||
{"id": "1", "enabled": True, "type": "count", "schema": "metric"},
|
||||
{
|
||||
"id": "2",
|
||||
"enabled": True,
|
||||
"type": "terms",
|
||||
"schema": "bucket",
|
||||
"params": {"field": field, "size": TABLE_SIZE, "order": "desc", "orderBy": "1"},
|
||||
},
|
||||
],
|
||||
"params": {
|
||||
"perPage": TABLE_PER_PAGE,
|
||||
"showPartialRows": False,
|
||||
"showMetricsAtAllLevels": False,
|
||||
"sort": {"columnIndex": 1, "direction": "desc"},
|
||||
},
|
||||
}
|
||||
return {
|
||||
"type": "visualization",
|
||||
"id": object_id,
|
||||
"attributes": {
|
||||
"title": title,
|
||||
"visState": json.dumps(vis_state, separators=(",", ":")),
|
||||
"uiStateJSON": "{}",
|
||||
"description": "",
|
||||
"version": 1,
|
||||
"kibanaSavedObjectMeta": {
|
||||
"searchSourceJSON": json.dumps(search_source(query), separators=(",", ":"))
|
||||
},
|
||||
},
|
||||
"references": [
|
||||
{
|
||||
"name": "kibanaSavedObjectMeta.searchSourceJSON.index",
|
||||
"type": "index-pattern",
|
||||
"id": index_id,
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def search_object(object_id: str, title: str, columns: list[str], query: str, index_id: str) -> dict:
|
||||
return {
|
||||
"type": "search",
|
||||
"id": object_id,
|
||||
"attributes": {
|
||||
"title": title,
|
||||
"description": "",
|
||||
"columns": columns,
|
||||
"sort": [["@timestamp", "desc"]],
|
||||
"kibanaSavedObjectMeta": {
|
||||
"searchSourceJSON": json.dumps(search_source(query), separators=(",", ":"))
|
||||
},
|
||||
},
|
||||
"references": [
|
||||
{
|
||||
"name": "kibanaSavedObjectMeta.searchSourceJSON.index",
|
||||
"type": "index-pattern",
|
||||
"id": index_id,
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def grid(x: int, y: int, w: int, h: int, i: int) -> dict:
|
||||
return {"x": x, "y": y, "w": w, "h": h, "i": str(i)}
|
||||
|
||||
|
||||
def panel(panel_id: str, panel_type: str, grid_data: dict, index: int) -> dict:
|
||||
return {
|
||||
"panelIndex": str(index),
|
||||
"gridData": grid_data,
|
||||
"id": panel_id,
|
||||
"type": panel_type,
|
||||
"version": DASHBOARD_VERSION,
|
||||
"embeddableConfig": {},
|
||||
}
|
||||
|
||||
|
||||
def full_width_panels(specs: list[tuple[str, str, int]]) -> list[dict]:
|
||||
panels = []
|
||||
y = 0
|
||||
for index, (panel_id, panel_type, height) in enumerate(specs, start=1):
|
||||
panels.append(panel(panel_id, panel_type, grid(0, y, GRID_COLUMNS, height, index), index))
|
||||
y += height
|
||||
return panels
|
||||
|
||||
|
||||
def dashboard_object(object_id: str, title: str, panels: list[dict]) -> dict:
|
||||
return {
|
||||
"type": "dashboard",
|
||||
"id": object_id,
|
||||
"attributes": {
|
||||
"title": title,
|
||||
"description": "",
|
||||
"hits": 0,
|
||||
"panelsJSON": json.dumps(panels, separators=(",", ":")),
|
||||
"optionsJSON": json.dumps({"useMargins": True, "hidePanelTitles": False}, separators=(",", ":")),
|
||||
"version": 1,
|
||||
"timeRestore": False,
|
||||
"kibanaSavedObjectMeta": {
|
||||
"searchSourceJSON": json.dumps({"query": {"language": "kuery", "query": ""}, "filter": []})
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def app_dashboard_objects(app: AppSpec) -> list[dict]:
|
||||
prefix = f"logs-{app.slug}"
|
||||
objects = []
|
||||
|
||||
if app.kind == "journald":
|
||||
columns = ["@timestamp", "_HOSTNAME", "_SYSTEMD_UNIT", "MESSAGE"]
|
||||
objects.append(histogram_vis(f"{prefix}-volume", f"{app.title} logs", app.query, app.index_id))
|
||||
objects.append(histogram_vis(f"{prefix}-errors", f"{app.title} errors", error_query(app.query), app.index_id))
|
||||
objects.append(table_vis(f"{prefix}-top-units", "Top units", "_SYSTEMD_UNIT.keyword", app.query, app.index_id))
|
||||
objects.append(search_object(f"{prefix}-recent", "Recent logs", columns, app.query, app.index_id))
|
||||
objects.append(
|
||||
search_object(
|
||||
f"{prefix}-recent-errors",
|
||||
"Recent errors",
|
||||
columns,
|
||||
error_query(app.query),
|
||||
app.index_id,
|
||||
)
|
||||
)
|
||||
panels = full_width_panels(
|
||||
[
|
||||
(f"{prefix}-volume", "visualization", H_CHART),
|
||||
(f"{prefix}-errors", "visualization", H_ERRORS),
|
||||
(f"{prefix}-top-units", "visualization", H_TABLE),
|
||||
(f"{prefix}-recent", "search", H_SEARCH),
|
||||
(f"{prefix}-recent-errors", "search", H_SEARCH),
|
||||
]
|
||||
)
|
||||
objects.append(dashboard_object(prefix, f"{app.title} Logs", panels))
|
||||
return objects
|
||||
|
||||
columns = ["@timestamp", "kubernetes.pod_name", "kubernetes.container_name", "log", "message"]
|
||||
objects.append(histogram_vis(f"{prefix}-volume", f"{app.title} logs", app.query, app.index_id))
|
||||
objects.append(histogram_vis(f"{prefix}-errors", f"{app.title} errors", error_query(app.query), app.index_id))
|
||||
objects.append(table_vis(f"{prefix}-top-pods", "Top pods", "kubernetes.pod_name.keyword", app.query, app.index_id))
|
||||
objects.append(
|
||||
table_vis(f"{prefix}-top-containers", "Top containers", "kubernetes.container_name.keyword", app.query, app.index_id)
|
||||
)
|
||||
objects.append(search_object(f"{prefix}-recent", "Recent logs", columns, app.query, app.index_id))
|
||||
objects.append(
|
||||
search_object(
|
||||
f"{prefix}-recent-errors",
|
||||
"Recent errors",
|
||||
columns,
|
||||
error_query(app.query),
|
||||
app.index_id,
|
||||
)
|
||||
)
|
||||
panels = full_width_panels(
|
||||
[
|
||||
(f"{prefix}-volume", "visualization", H_CHART),
|
||||
(f"{prefix}-errors", "visualization", H_ERRORS),
|
||||
(f"{prefix}-top-pods", "visualization", H_TABLE),
|
||||
(f"{prefix}-top-containers", "visualization", H_TABLE),
|
||||
(f"{prefix}-recent", "search", H_SEARCH),
|
||||
(f"{prefix}-recent-errors", "search", H_SEARCH),
|
||||
]
|
||||
)
|
||||
objects.append(dashboard_object(prefix, f"{app.title} Logs", panels))
|
||||
return objects
|
||||
|
||||
|
||||
def overview_objects() -> list[dict]:
|
||||
objects = []
|
||||
objects.append(histogram_vis("logs-overview-volume", "Logs per minute", "*", "kube-logs"))
|
||||
objects.append(histogram_vis("logs-overview-errors", "Errors per minute", error_query(), "kube-logs"))
|
||||
objects.append(
|
||||
table_vis(
|
||||
"logs-overview-top-ns",
|
||||
"Top namespaces",
|
||||
"kubernetes.namespace_name.keyword",
|
||||
"*",
|
||||
"kube-logs",
|
||||
)
|
||||
)
|
||||
objects.append(
|
||||
table_vis(
|
||||
"logs-overview-top-error-ns",
|
||||
"Top error namespaces",
|
||||
"kubernetes.namespace_name.keyword",
|
||||
error_query(),
|
||||
"kube-logs",
|
||||
)
|
||||
)
|
||||
objects.append(table_vis("logs-overview-top-pods", "Top pods", "kubernetes.pod_name.keyword", "*", "kube-logs"))
|
||||
objects.append(
|
||||
table_vis(
|
||||
"logs-overview-top-nodes",
|
||||
"Top nodes",
|
||||
"kubernetes.node_name.keyword",
|
||||
"*",
|
||||
"kube-logs",
|
||||
)
|
||||
)
|
||||
objects.append(
|
||||
search_object(
|
||||
"logs-overview-recent-errors",
|
||||
"Recent errors",
|
||||
["@timestamp", "kubernetes.namespace_name", "kubernetes.pod_name", "log", "message"],
|
||||
error_query(),
|
||||
"kube-logs",
|
||||
)
|
||||
)
|
||||
panels = full_width_panels(
|
||||
[
|
||||
("logs-overview-volume", "visualization", H_CHART),
|
||||
("logs-overview-errors", "visualization", H_ERRORS),
|
||||
("logs-overview-top-ns", "visualization", H_TABLE),
|
||||
("logs-overview-top-error-ns", "visualization", H_TABLE),
|
||||
("logs-overview-top-pods", "visualization", H_TABLE),
|
||||
("logs-overview-top-nodes", "visualization", H_TABLE),
|
||||
("logs-overview-recent-errors", "search", H_SEARCH),
|
||||
]
|
||||
)
|
||||
objects.append(dashboard_object("logs-overview", "Atlas Logs Overview", panels))
|
||||
return objects
|
||||
|
||||
|
||||
def build_objects() -> list[dict]:
|
||||
objects = [
|
||||
index_pattern("kube-logs", "kube-*"),
|
||||
index_pattern("journald-logs", "journald-*"),
|
||||
]
|
||||
|
||||
objects.extend(overview_objects())
|
||||
|
||||
apps = [
|
||||
AppSpec("bstein-dev-home", "bstein-dev-home", 'kubernetes.namespace_name: "bstein-dev-home"'),
|
||||
AppSpec(
|
||||
"pegasus",
|
||||
"pegasus",
|
||||
'kubernetes.namespace_name: "jellyfin" and kubernetes.labels.app: "pegasus"',
|
||||
),
|
||||
AppSpec(
|
||||
"jellyfin",
|
||||
"jellyfin",
|
||||
'kubernetes.namespace_name: "jellyfin" and kubernetes.labels.app: "jellyfin"',
|
||||
),
|
||||
AppSpec("vaultwarden", "vaultwarden", 'kubernetes.namespace_name: "vaultwarden"'),
|
||||
AppSpec("mailu", "mailu", 'kubernetes.namespace_name: "mailu-mailserver"'),
|
||||
AppSpec("nextcloud", "nextcloud", 'kubernetes.namespace_name: "nextcloud"'),
|
||||
AppSpec("gitea", "gitea", 'kubernetes.namespace_name: "gitea"'),
|
||||
AppSpec("jenkins", "jenkins", 'kubernetes.namespace_name: "jenkins"'),
|
||||
AppSpec("harbor", "harbor", 'kubernetes.namespace_name: "harbor"'),
|
||||
AppSpec("vault", "vault", 'kubernetes.namespace_name: "vault"'),
|
||||
AppSpec("keycloak", "keycloak", 'kubernetes.namespace_name: "sso"'),
|
||||
AppSpec("flux-system", "flux-system", 'kubernetes.namespace_name: "flux-system"'),
|
||||
AppSpec("comms", "comms", 'kubernetes.namespace_name: "comms"'),
|
||||
AppSpec(
|
||||
"element-web",
|
||||
"element-web",
|
||||
'kubernetes.namespace_name: "comms" and kubernetes.container_name: "element-web"',
|
||||
),
|
||||
AppSpec(
|
||||
"element-call",
|
||||
"element-call",
|
||||
'kubernetes.namespace_name: "comms" and kubernetes.labels.app: "element-call"',
|
||||
),
|
||||
AppSpec(
|
||||
"matrix-synapse",
|
||||
"matrix-synapse",
|
||||
'kubernetes.namespace_name: "comms" and kubernetes.container_name: "synapse"',
|
||||
),
|
||||
AppSpec(
|
||||
"livekit",
|
||||
"livekit",
|
||||
'kubernetes.namespace_name: "comms" and kubernetes.labels.app: "livekit"',
|
||||
),
|
||||
AppSpec(
|
||||
"coturn",
|
||||
"coturn",
|
||||
'kubernetes.namespace_name: "comms" and kubernetes.labels.app: "coturn"',
|
||||
),
|
||||
AppSpec("lesavka", "lesavka", '_HOSTNAME: "titan-jh"', index_id="journald-logs", kind="journald"),
|
||||
]
|
||||
|
||||
for app in apps:
|
||||
objects.extend(app_dashboard_objects(app))
|
||||
|
||||
return objects
|
||||
|
||||
|
||||
def write_ndjson(objects: list[dict], path: Path) -> None:
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
payload = "\n".join(json_line(obj) for obj in objects)
|
||||
path.write_text(payload + "\n")
|
||||
|
||||
|
||||
def render_configmap(ndjson_path: Path, output_path: Path) -> None:
|
||||
payload_lines = ndjson_path.read_text().splitlines()
|
||||
payload = "\n".join(" " + line for line in payload_lines)
|
||||
relative_path = output_path.relative_to(ROOT)
|
||||
output_path.write_text(CONFIG_TEMPLATE.format(relative_path=relative_path, payload=payload))
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--build", action="store_true", help="Regenerate saved object NDJSON and ConfigMap")
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.build:
|
||||
objects = build_objects()
|
||||
write_ndjson(objects, NDJSON_PATH)
|
||||
|
||||
if not NDJSON_PATH.exists():
|
||||
raise SystemExit(f"Missing NDJSON file: {NDJSON_PATH}. Run with --build first.")
|
||||
|
||||
render_configmap(NDJSON_PATH, CONFIG_PATH)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
554
scripts/knowledge_render_atlas.py
Normal file
554
scripts/knowledge_render_atlas.py
Normal file
@ -0,0 +1,554 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Render Atlas knowledge artifacts from Flux + kustomize manifests.
|
||||
|
||||
Outputs (committed to git for stable diffs + RAG):
|
||||
- knowledge/catalog/*.yaml
|
||||
- knowledge/diagrams/*.mmd
|
||||
|
||||
This is intentionally conservative:
|
||||
- never includes Secret objects
|
||||
- never includes secret values
|
||||
- keeps output deterministic (sorted)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable
|
||||
|
||||
import yaml
|
||||
|
||||
REPO_ROOT = Path(__file__).resolve().parents[1]
|
||||
|
||||
CLUSTER_SCOPED_KINDS = {
|
||||
"Namespace",
|
||||
"Node",
|
||||
"CustomResourceDefinition",
|
||||
"ClusterRole",
|
||||
"ClusterRoleBinding",
|
||||
"StorageClass",
|
||||
"PersistentVolume",
|
||||
"MutatingWebhookConfiguration",
|
||||
"ValidatingWebhookConfiguration",
|
||||
"APIService",
|
||||
}
|
||||
|
||||
INCLUDED_KINDS = {
|
||||
"Namespace",
|
||||
"Deployment",
|
||||
"StatefulSet",
|
||||
"DaemonSet",
|
||||
"Service",
|
||||
"Ingress",
|
||||
"IngressRoute", # traefik
|
||||
"HelmRelease", # only to harvest ingress hostnames from values
|
||||
}
|
||||
|
||||
|
||||
def _run(cmd: list[str], *, cwd: Path) -> str:
|
||||
res = subprocess.run(cmd, cwd=cwd, capture_output=True, text=True, check=False)
|
||||
if res.returncode != 0:
|
||||
raise RuntimeError(
|
||||
f"Command failed ({res.returncode}): {' '.join(cmd)}\n{res.stderr.strip()}"
|
||||
)
|
||||
return res.stdout
|
||||
|
||||
|
||||
def kustomize_build(path: Path) -> str:
    """Render a kustomization directory to a multi-doc YAML string.

    Strategy: try `kubectl kustomize` first; when it fails with a load-
    restriction error, retry with LoadRestrictionsNone; as a last resort fall
    back to the standalone `kustomize` binary (also unrestricted).
    """
    rel = path.relative_to(REPO_ROOT)
    try:
        return _run(["kubectl", "kustomize", str(rel)], cwd=REPO_ROOT)
    except Exception as e:
        msg = str(e)
        # "is not in or below" is kustomize's load-restriction error text.
        if "is not in or below" in msg:
            # Repo uses configMapGenerators that reference ../../scripts/*.py.
            # Kustomize load restriction must be disabled for a full render.
            try:
                return _run(
                    ["kubectl", "kustomize", "--load-restrictor=LoadRestrictionsNone", str(rel)],
                    cwd=REPO_ROOT,
                )
            except Exception:
                # Fall through to the standalone binary below.
                pass
        return _run(["kustomize", "build", "--load-restrictor=LoadRestrictionsNone", str(rel)], cwd=REPO_ROOT)
|
||||
|
||||
|
||||
def _iter_docs(raw_yaml: str) -> Iterable[dict[str, Any]]:
    """Yield every manifest mapping in a multi-document YAML stream.

    v1 List objects are flattened into their items; documents that are not
    mappings or lack a `kind` are skipped.
    """
    for doc in yaml.safe_load_all(raw_yaml):
        if not isinstance(doc, dict):
            continue
        if doc.get("kind") == "List" and isinstance(doc.get("items"), list):
            yield from (item for item in doc["items"] if isinstance(item, dict))
        elif doc.get("kind"):
            yield doc
|
||||
|
||||
|
||||
def _meta(doc: dict[str, Any]) -> tuple[str, str | None]:
|
||||
md = doc.get("metadata") or {}
|
||||
name = md.get("name") or ""
|
||||
namespace = md.get("namespace")
|
||||
return name, namespace
|
||||
|
||||
|
||||
def _is_namespaced(doc: dict[str, Any]) -> bool:
    """True unless the doc's kind is a known cluster-scoped kind."""
    return (doc.get("kind") or "") not in CLUSTER_SCOPED_KINDS
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class FluxKustomization:
    """A Flux Kustomization CR reduced to the fields this renderer needs."""

    # metadata.name of the Kustomization CR
    name: str
    # spec.path, normalised relative to the repo root
    path: str
    # spec.targetNamespace, if set
    target_namespace: str | None
|
||||
|
||||
|
||||
def find_flux_kustomizations() -> list[FluxKustomization]:
    """Find Flux Kustomization CRs under clusters/atlas/flux-system.

    Scans every *.yaml file recursively, keeps docs whose apiVersion is in
    the kustomize.toolkit.fluxcd.io group and whose spec.path is a non-empty
    string, and returns them sorted by name.
    """
    root = REPO_ROOT / "clusters" / "atlas" / "flux-system"
    items: list[FluxKustomization] = []
    for file in sorted(root.rglob("*.yaml")):
        raw = file.read_text()
        for doc in _iter_docs(raw):
            if doc.get("kind") != "Kustomization":
                continue
            api = str(doc.get("apiVersion") or "")
            if not api.startswith("kustomize.toolkit.fluxcd.io/"):
                continue
            name, _ = _meta(doc)
            spec = doc.get("spec") or {}
            path = spec.get("path")
            if not isinstance(path, str) or not path.strip():
                continue
            # BUGFIX: the previous `lstrip("./")` stripped the *character set*
            # {'.', '/'}, so a path like "../shared" collapsed to "shared".
            # Remove only a literal leading "./" prefix (repeatedly, in case
            # of "././x").
            clean = path.strip()
            while clean.startswith("./"):
                clean = clean[2:]
            items.append(
                FluxKustomization(
                    name=name,
                    path=clean,
                    target_namespace=spec.get("targetNamespace"),
                )
            )
    return sorted(items, key=lambda k: k.name)
|
||||
|
||||
|
||||
def _safe_string_scan_for_hosts(value: Any) -> set[str]:
|
||||
"""Best-effort host scan from HelmRelease values without chart rendering."""
|
||||
hosts: set[str] = set()
|
||||
if isinstance(value, str):
|
||||
for m in re.finditer(r"(?i)([a-z0-9-]+(?:\.[a-z0-9-]+)+)", value):
|
||||
host = m.group(1).lower()
|
||||
if host.endswith("bstein.dev"):
|
||||
hosts.add(host)
|
||||
return hosts
|
||||
if isinstance(value, list):
|
||||
for item in value:
|
||||
hosts |= _safe_string_scan_for_hosts(item)
|
||||
return hosts
|
||||
if isinstance(value, dict):
|
||||
for item in value.values():
|
||||
hosts |= _safe_string_scan_for_hosts(item)
|
||||
return hosts
|
||||
return hosts
|
||||
|
||||
|
||||
def _service_ports(svc: dict[str, Any]) -> list[dict[str, Any]]:
|
||||
spec = svc.get("spec") or {}
|
||||
out: list[dict[str, Any]] = []
|
||||
for p in spec.get("ports") or []:
|
||||
if not isinstance(p, dict):
|
||||
continue
|
||||
out.append(
|
||||
{
|
||||
"name": p.get("name"),
|
||||
"port": p.get("port"),
|
||||
"targetPort": p.get("targetPort"),
|
||||
"protocol": p.get("protocol", "TCP"),
|
||||
}
|
||||
)
|
||||
return out
|
||||
|
||||
|
||||
def _workload_labels(doc: dict[str, Any]) -> dict[str, str]:
|
||||
tpl = (doc.get("spec") or {}).get("template") or {}
|
||||
md = tpl.get("metadata") or {}
|
||||
labels = md.get("labels") or {}
|
||||
return {str(k): str(v) for k, v in labels.items()} if isinstance(labels, dict) else {}
|
||||
|
||||
|
||||
def _service_selector(doc: dict[str, Any]) -> dict[str, str]:
|
||||
spec = doc.get("spec") or {}
|
||||
sel = spec.get("selector") or {}
|
||||
return {str(k): str(v) for k, v in sel.items()} if isinstance(sel, dict) else {}
|
||||
|
||||
|
||||
def _selector_matches(selector: dict[str, str], labels: dict[str, str]) -> bool:
|
||||
if not selector:
|
||||
return False
|
||||
return all(labels.get(k) == v for k, v in selector.items())
|
||||
|
||||
|
||||
def _sanitize_node_id(text: str) -> str:
|
||||
return re.sub(r"[^a-zA-Z0-9_]", "_", text)
|
||||
|
||||
|
||||
def extract_catalog(
    rendered: list[tuple[FluxKustomization, list[dict[str, Any]]]],
) -> tuple[dict[str, Any], dict[str, Any], str]:
    """Build knowledge catalog + mermaid diagram from rendered docs.

    Returns (catalog, summary, diagram): the full catalog mapping, a small
    counts-only summary, and a mermaid flowchart string of
    host -> Service -> workload edges grouped by namespace.
    """
    # Index workloads and services for mapping.
    workloads: dict[tuple[str, str], dict[str, Any]] = {}
    services: dict[tuple[str, str], dict[str, Any]] = {}
    ingresses: list[dict[str, Any]] = []
    ingressroutes: list[dict[str, Any]] = []
    helmrelease_hosts: dict[str, list[str]] = {}

    for src, docs in rendered:
        for doc in docs:
            kind = doc.get("kind")
            if kind not in INCLUDED_KINDS:
                continue
            # Defensive only: "Secret" is not in INCLUDED_KINDS, so this
            # branch is unreachable today; kept as a safety net.
            if kind == "Secret":
                continue

            name, namespace = _meta(doc)
            # Inherit the Flux Kustomization's targetNamespace for namespaced
            # docs that did not declare one; copy the doc to avoid mutating
            # the caller's structure.
            if _is_namespaced(doc) and not namespace and src.target_namespace:
                namespace = src.target_namespace
                doc = dict(doc)
                doc.setdefault("metadata", {})["namespace"] = namespace

            if kind in ("Deployment", "StatefulSet", "DaemonSet"):
                workloads[(namespace or "", name)] = {
                    "kind": kind,
                    "namespace": namespace or "",
                    "name": name,
                    "labels": _workload_labels(doc),
                    "serviceAccountName": ((doc.get("spec") or {}).get("template") or {})
                    .get("spec", {})
                    .get("serviceAccountName"),
                    "nodeSelector": ((doc.get("spec") or {}).get("template") or {})
                    .get("spec", {})
                    .get("nodeSelector", {}),
                    # De-duplicated, sorted container images.
                    "images": sorted(
                        {
                            c.get("image")
                            for c in (
                                (((doc.get("spec") or {}).get("template") or {}).get("spec") or {}).get(
                                    "containers"
                                )
                                or []
                            )
                            if isinstance(c, dict) and c.get("image")
                        }
                    ),
                }
            elif kind == "Service":
                services[(namespace or "", name)] = {
                    "namespace": namespace or "",
                    "name": name,
                    "type": (doc.get("spec") or {}).get("type", "ClusterIP"),
                    "selector": _service_selector(doc),
                    "ports": _service_ports(doc),
                }
            elif kind == "Ingress":
                ingresses.append({"source": src.name, "doc": doc})
            elif kind == "IngressRoute":
                ingressroutes.append({"source": src.name, "doc": doc})
            elif kind == "HelmRelease":
                spec = doc.get("spec") or {}
                vals = spec.get("values") or {}
                hosts = sorted(_safe_string_scan_for_hosts(vals))
                if hosts:
                    helmrelease_hosts[f"{src.name}:{namespace or ''}/{name}"] = hosts

    # Map services to workloads (same namespace, selector subset of labels).
    service_to_workloads: dict[tuple[str, str], list[dict[str, str]]] = {}
    for (ns, svc_name), svc in services.items():
        selector = svc.get("selector") or {}
        matches: list[dict[str, str]] = []
        for (w_ns, w_name), w in workloads.items():
            if w_ns != ns:
                continue
            if _selector_matches(selector, w.get("labels") or {}):
                matches.append({"kind": w["kind"], "name": w_name})
        service_to_workloads[(ns, svc_name)] = sorted(matches, key=lambda m: (m["kind"], m["name"]))

    # Extract HTTP endpoints.
    endpoints: list[dict[str, Any]] = []

    def add_endpoint(
        *,
        host: str,
        path: str,
        namespace: str,
        service: str,
        port: Any,
        source: str,
        kind: str,
        obj_name: str,
    ):
        # Closure over endpoints/service_to_workloads; appends one record.
        wk = service_to_workloads.get((namespace, service), [])
        endpoints.append(
            {
                "host": host,
                "path": path,
                "backend": {
                    "namespace": namespace,
                    "service": service,
                    "port": port,
                    "workloads": wk,
                },
                "via": {"kind": kind, "name": obj_name, "source": source},
            }
        )

    # networking.k8s.io Ingress rules.
    for item in ingresses:
        doc = item["doc"]
        source = item["source"]
        name, namespace = _meta(doc)
        namespace = namespace or ""
        spec = doc.get("spec") or {}
        for rule in spec.get("rules") or []:
            if not isinstance(rule, dict):
                continue
            host = (rule.get("host") or "").strip()
            http = rule.get("http") or {}
            for p in http.get("paths") or []:
                if not isinstance(p, dict):
                    continue
                backend = (p.get("backend") or {}).get("service") or {}
                svc_name = backend.get("name")
                # Numbered port wins; falls back to named port.
                svc_port = (backend.get("port") or {}).get("number") or (backend.get("port") or {}).get("name")
                if not host or not svc_name:
                    continue
                add_endpoint(
                    host=host,
                    path=p.get("path") or "/",
                    namespace=namespace,
                    service=svc_name,
                    port=svc_port,
                    source=source,
                    kind="Ingress",
                    obj_name=name,
                )

    # Traefik IngressRoutes: parse Host(`...`) / PathPrefix(`...`) from the
    # match expression.
    host_re = re.compile(r"Host\(`([^`]+)`\)")
    pathprefix_re = re.compile(r"PathPrefix\(`([^`]+)`\)")
    for item in ingressroutes:
        doc = item["doc"]
        source = item["source"]
        name, namespace = _meta(doc)
        namespace = namespace or ""
        spec = doc.get("spec") or {}
        for route in spec.get("routes") or []:
            if not isinstance(route, dict):
                continue
            match = route.get("match") or ""
            hosts = host_re.findall(match)
            pathprefixes = pathprefix_re.findall(match) or ["/"]
            for svc in route.get("services") or []:
                if not isinstance(svc, dict):
                    continue
                svc_name = svc.get("name")
                svc_port = svc.get("port")
                if not svc_name:
                    continue
                # Cartesian product: every host x every path prefix.
                for host in hosts:
                    for pp in pathprefixes:
                        add_endpoint(
                            host=host,
                            path=pp,
                            namespace=namespace,
                            service=svc_name,
                            port=svc_port,
                            source=source,
                            kind="IngressRoute",
                            obj_name=name,
                        )

    # Deterministic ordering for stable diffs.
    endpoints = sorted(
        endpoints,
        key=lambda e: (
            e["host"],
            e["path"],
            e["backend"]["namespace"],
            e["backend"]["service"],
        ),
    )

    catalog = {
        "cluster": "atlas",
        "sources": [
            {"name": k.name, "path": k.path, "targetNamespace": k.target_namespace}
            for k, _ in rendered
        ],
        "workloads": sorted(
            list(workloads.values()),
            key=lambda w: (w["namespace"], w["kind"], w["name"]),
        ),
        "services": sorted(
            list(services.values()),
            key=lambda s: (s["namespace"], s["name"]),
        ),
        "http_endpoints": endpoints,
        "helmrelease_host_hints": {k: v for k, v in sorted(helmrelease_hosts.items())},
    }

    # Mermaid diagram: host -> service -> workload (grouped by namespace).
    ns_nodes: dict[str, list[str]] = {}
    lines: list[str] = ["flowchart LR"]
    edges: set[tuple[str, str]] = set()

    def ensure_ns_node(ns: str, node_id: str):
        # Register node_id under its namespace exactly once, preserving order.
        ns_nodes.setdefault(ns, [])
        if node_id not in ns_nodes[ns]:
            ns_nodes[ns].append(node_id)

    host_nodes: dict[str, str] = {}

    for ep in endpoints:
        host = ep["host"]
        host_id = host_nodes.get(host)
        if not host_id:
            host_id = f"host_{_sanitize_node_id(host)}"
            host_nodes[host] = host_id
            lines.append(f'  {host_id}["{host}"]')

        ns = ep["backend"]["namespace"]
        svc = ep["backend"]["service"]
        svc_id = f"svc_{_sanitize_node_id(ns)}_{_sanitize_node_id(svc)}"
        if svc_id not in ns_nodes.get(ns, []):
            lines.append(f'  {svc_id}["{ns}/{svc} (Service)"]')
        ensure_ns_node(ns, svc_id)

        # Emit each edge at most once.
        if (host_id, svc_id) not in edges:
            edges.add((host_id, svc_id))
            lines.append(f"  {host_id} --> {svc_id}")

        for w in ep["backend"]["workloads"]:
            w_id = f"wl_{_sanitize_node_id(ns)}_{_sanitize_node_id(w['name'])}"
            if w_id not in ns_nodes.get(ns, []):
                lines.append(f'  {w_id}["{ns}/{w["name"]} ({w["kind"]})"]')
            ensure_ns_node(ns, w_id)
            if (svc_id, w_id) not in edges:
                edges.add((svc_id, w_id))
                lines.append(f"  {svc_id} --> {w_id}")

    # Wrap namespace subgraphs at the end for stability (sorted namespaces).
    if ns_nodes:
        lines.append("")
        for ns in sorted(ns_nodes.keys()):
            lines.append(f"  subgraph { _sanitize_node_id(ns) }[{ns}]")
            for node_id in ns_nodes[ns]:
                lines.append(f"    {node_id}")
            lines.append("  end")

    diagram = "\n".join(lines).rstrip() + "\n"

    summary = {
        "counts": {
            "workloads": len(workloads),
            "services": len(services),
            "http_endpoints": len(endpoints),
            "helmrelease_host_hints": sum(len(v) for v in helmrelease_hosts.values()),
        }
    }

    return catalog, summary, diagram
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point; returns a process exit code.

    Without --write, prints a JSON counts summary. With --write, renders
    knowledge/catalog/* and knowledge/diagrams/* plus a runbooks.json built
    from knowledge/runbooks/*.md front matter.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", default="knowledge", help="Output base directory (default: knowledge/)")
    ap.add_argument(
        "--write",
        action="store_true",
        help="Write generated files (otherwise just print a summary).",
    )
    args = ap.parse_args()

    out_dir = REPO_ROOT / args.out
    flux = find_flux_kustomizations()
    if not flux:
        print("No Flux Kustomizations found under clusters/atlas/flux-system.", file=sys.stderr)
        return 2

    rendered: list[tuple[FluxKustomization, list[dict[str, Any]]]] = []
    for k in flux:
        path = REPO_ROOT / k.path
        # Skip Kustomizations whose path does not exist in this checkout.
        if not path.exists():
            continue
        raw = kustomize_build(path)
        # Belt-and-braces: drop any Secret docs before cataloguing.
        docs = [d for d in _iter_docs(raw) if d.get("kind") != "Secret"]
        rendered.append((k, docs))

    rendered = sorted(rendered, key=lambda item: item[0].name)
    catalog, summary, diagram = extract_catalog(rendered)

    if not args.write:
        print(json.dumps(summary, indent=2, sort_keys=True))
        return 0

    (out_dir / "catalog").mkdir(parents=True, exist_ok=True)
    (out_dir / "diagrams").mkdir(parents=True, exist_ok=True)

    catalog_path = out_dir / "catalog" / "atlas.yaml"
    catalog_json_path = out_dir / "catalog" / "atlas.json"
    summary_path = out_dir / "catalog" / "atlas-summary.json"
    diagram_path = out_dir / "diagrams" / "atlas-http.mmd"
    runbooks_json_path = out_dir / "catalog" / "runbooks.json"

    catalog_path.write_text(
        "# Generated by scripts/knowledge_render_atlas.py (do not edit by hand)\n"
        + yaml.safe_dump(catalog, sort_keys=False),
        encoding="utf-8",
    )
    catalog_json_path.write_text(json.dumps(catalog, indent=2, sort_keys=False) + "\n", encoding="utf-8")
    summary_path.write_text(json.dumps(summary, indent=2, sort_keys=True) + "\n", encoding="utf-8")
    diagram_path.write_text(diagram, encoding="utf-8")

    # Render runbooks into JSON for lightweight, dependency-free consumption in-cluster.
    runbooks_dir = out_dir / "runbooks"
    runbooks: list[dict[str, Any]] = []
    if runbooks_dir.exists():
        for md_file in sorted(runbooks_dir.glob("*.md")):
            raw = md_file.read_text(encoding="utf-8")
            fm: dict[str, Any] = {}
            body = raw
            # Parse optional YAML front matter delimited by "---" lines;
            # on any parse failure fall back to treating the whole file as body.
            if raw.startswith("---\n"):
                try:
                    _, rest = raw.split("---\n", 1)
                    fm_raw, body = rest.split("\n---\n", 1)
                    fm = yaml.safe_load(fm_raw) or {}
                except Exception:
                    fm = {}
                    body = raw
            runbooks.append(
                {
                    "path": str(md_file.relative_to(out_dir)),
                    "title": fm.get("title") or md_file.stem,
                    "tags": fm.get("tags") or [],
                    "entrypoints": fm.get("entrypoints") or [],
                    "source_paths": fm.get("source_paths") or [],
                    "body": body.strip(),
                }
            )
    # Written even when empty so consumers always find the file.
    runbooks_json_path.write_text(json.dumps(runbooks, indent=2, sort_keys=False) + "\n", encoding="utf-8")

    print(f"Wrote {catalog_path.relative_to(REPO_ROOT)}")
    print(f"Wrote {catalog_json_path.relative_to(REPO_ROOT)}")
    print(f"Wrote {summary_path.relative_to(REPO_ROOT)}")
    print(f"Wrote {diagram_path.relative_to(REPO_ROOT)}")
    print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
313
scripts/logging_render_observability.py
Executable file
313
scripts/logging_render_observability.py
Executable file
@ -0,0 +1,313 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generate OpenSearch Observability seed objects and render them into ConfigMaps.
|
||||
|
||||
Usage:
|
||||
scripts/logging_render_observability.py --build # rebuild JSON + ConfigMap
|
||||
scripts/logging_render_observability.py # re-render ConfigMap from JSON
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import textwrap
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
# Repository root (this script lives in scripts/).
ROOT = Path(__file__).resolve().parents[1]
# Committed JSON payloads the ConfigMap is rendered from.
OBS_DIR = ROOT / "services" / "logging" / "observability"
APPS_PATH = OBS_DIR / "applications.json"
QUERIES_PATH = OBS_DIR / "saved_queries.json"
VIS_PATH = OBS_DIR / "saved_visualizations.json"
# Rendered ConfigMap manifest consumed by the logging stack.
CONFIG_PATH = ROOT / "services" / "logging" / "opensearch-observability-objects.yaml"

# ConfigMap skeleton. The three payload placeholders sit at column 0 because
# the JSON is pre-indented (see render_configmap) before substitution.
CONFIG_TEMPLATE = textwrap.dedent(
    """# {relative_path}
# Generated by scripts/logging_render_observability.py --build
apiVersion: v1
kind: ConfigMap
metadata:
  name: opensearch-observability-objects
  namespace: logging
data:
  applications.json: |
{applications}
  saved_queries.json: |
{queries}
  saved_visualizations.json: |
{visualizations}
"""
)

# Defaults applied to every saved query / visualization object.
DEFAULT_RANGE = {"start": "now-24h", "end": "now", "text": ""}
DEFAULT_TIMESTAMP = {"name": "@timestamp", "type": "timestamp"}
DEFAULT_FIELDS = {"text": "", "tokens": []}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class AppSpec:
    """An Observability "application" seed (one per service)."""

    # application display name
    name: str
    # PPL base query scoping the app's logs
    base_query: str
    # "kube" or "journald" — selects which error filter applies
    kind: str = "kube"
    description: str = ""


@dataclass(frozen=True)
class QuerySpec:
    """A saved-query seed: name + PPL query."""

    name: str
    query: str
    description: str = ""


@dataclass(frozen=True)
class VisualizationSpec:
    """A saved-visualization seed: name, PPL query and chart type."""

    name: str
    query: str
    # chart type, e.g. "line" or "bar"
    vis_type: str
    description: str = ""
|
||||
|
||||
|
||||
def source_query(index: str, where: str | None = None) -> str:
    """Build a PPL query `source = <index>`, optionally piped into a where clause."""
    if where:
        return f"source = {index} | where {where}"
    return f"source = {index}"
|
||||
|
||||
|
||||
def error_filter(fields: list[str]) -> str:
    """OR together match() clauses searching each field for error-ish text."""
    return " or ".join(f"match({field}, 'error|exception|fail')" for field in fields)
|
||||
|
||||
|
||||
def saved_query(spec: QuerySpec) -> dict:
    """Serialise a QuerySpec into the observability saved-query JSON shape."""
    payload = {
        "name": spec.name,
        "description": spec.description,
        "query": spec.query,
    }
    payload["selected_date_range"] = DEFAULT_RANGE
    payload["selected_timestamp"] = DEFAULT_TIMESTAMP
    payload["selected_fields"] = DEFAULT_FIELDS
    return payload
|
||||
|
||||
|
||||
def saved_visualization(spec: VisualizationSpec) -> dict:
    """Serialise a VisualizationSpec into the saved-visualization JSON shape."""
    payload = {
        "name": spec.name,
        "description": spec.description,
        "query": spec.query,
        "type": spec.vis_type,
    }
    payload["selected_date_range"] = DEFAULT_RANGE
    payload["selected_timestamp"] = DEFAULT_TIMESTAMP
    payload["selected_fields"] = DEFAULT_FIELDS
    return payload
|
||||
|
||||
|
||||
def build_objects() -> tuple[list[dict], list[dict], list[dict]]:
    """Build the (applications, saved_queries, saved_visualizations) payloads.

    The app table below is the single source of truth: each AppSpec yields
    one application plus a "<name> logs" and "<name> errors" saved query.
    """
    kube_error = error_filter(["log", "message"])
    journald_error = error_filter(["MESSAGE"])

    apps = [
        AppSpec("bstein-dev-home", source_query("kube-*", "kubernetes.namespace_name = 'bstein-dev-home'")),
        AppSpec(
            "pegasus",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'jellyfin' and kubernetes.labels.app = 'pegasus'",
            ),
        ),
        AppSpec(
            "jellyfin",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'jellyfin' and kubernetes.labels.app = 'jellyfin'",
            ),
        ),
        AppSpec("vaultwarden", source_query("kube-*", "kubernetes.namespace_name = 'vaultwarden'")),
        AppSpec("mailu", source_query("kube-*", "kubernetes.namespace_name = 'mailu-mailserver'")),
        AppSpec("nextcloud", source_query("kube-*", "kubernetes.namespace_name = 'nextcloud'")),
        AppSpec("gitea", source_query("kube-*", "kubernetes.namespace_name = 'gitea'")),
        AppSpec("jenkins", source_query("kube-*", "kubernetes.namespace_name = 'jenkins'")),
        AppSpec("harbor", source_query("kube-*", "kubernetes.namespace_name = 'harbor'")),
        AppSpec("vault", source_query("kube-*", "kubernetes.namespace_name = 'vault'")),
        AppSpec("keycloak", source_query("kube-*", "kubernetes.namespace_name = 'sso'")),
        AppSpec("flux-system", source_query("kube-*", "kubernetes.namespace_name = 'flux-system'")),
        AppSpec("comms", source_query("kube-*", "kubernetes.namespace_name = 'comms'")),
        AppSpec(
            "element-web",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'comms' and kubernetes.container_name = 'element-web'",
            ),
        ),
        AppSpec(
            "element-call",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'element-call'",
            ),
        ),
        AppSpec(
            "matrix-synapse",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'comms' and kubernetes.container_name = 'synapse'",
            ),
        ),
        AppSpec(
            "livekit",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'livekit'",
            ),
        ),
        AppSpec(
            "coturn",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'coturn'",
            ),
        ),
        # Host-level journald logs rather than kube logs.
        AppSpec(
            "lesavka",
            source_query("journald-*", "_HOSTNAME = 'titan-jh'"),
            kind="journald",
        ),
    ]

    applications = [
        {
            "name": app.name,
            "description": app.description,
            "baseQuery": app.base_query,
            "servicesEntities": [],
            "traceGroups": [app.name],
        }
        for app in apps
    ]

    # Global catch-all queries first, then two per application.
    queries = [
        saved_query(QuerySpec("kube logs", source_query("kube-*"))),
        saved_query(QuerySpec("kube errors", f"{source_query('kube-*')} | where {kube_error}")),
        saved_query(QuerySpec("journald logs", source_query("journald-*"))),
        saved_query(QuerySpec("journald errors", f"{source_query('journald-*')} | where {journald_error}")),
    ]

    for app in apps:
        query_base = app.base_query
        # journald apps match on MESSAGE; kube apps on log/message fields.
        error_clause = journald_error if app.kind == "journald" else kube_error
        queries.append(saved_query(QuerySpec(f"{app.name} logs", query_base)))
        queries.append(saved_query(QuerySpec(f"{app.name} errors", f"{query_base} | where {error_clause}")))

    visualizations = [
        saved_visualization(
            VisualizationSpec(
                "[Kube] Logs per hour",
                "source = kube-* | stats count() as log_count by span(`@timestamp`, 1h)",
                "line",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Kube] Errors per hour",
                f"source = kube-* | where {kube_error} | stats count() as error_count by span(`@timestamp`, 1h)",
                "line",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Kube] Top namespaces",
                "source = kube-* | stats count() as log_count by kubernetes.namespace_name | sort - log_count",
                "bar",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Kube] Top error namespaces",
                f"source = kube-* | where {kube_error} | stats count() as error_count by kubernetes.namespace_name | sort - error_count",
                "bar",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Kube] Top pods",
                "source = kube-* | stats count() as log_count by kubernetes.pod_name | sort - log_count",
                "bar",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Kube] Top error pods",
                f"source = kube-* | where {kube_error} | stats count() as error_count by kubernetes.pod_name | sort - error_count",
                "bar",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Kube] Top nodes",
                "source = kube-* | stats count() as log_count by kubernetes.node_name | sort - log_count",
                "bar",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Journald] Top units",
                "source = journald-* | stats count() as log_count by _SYSTEMD_UNIT | sort - log_count",
                "bar",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Journald] Top error units",
                f"source = journald-* | where {journald_error} | stats count() as error_count by _SYSTEMD_UNIT | sort - error_count",
                "bar",
            )
        ),
    ]

    return applications, queries, visualizations
|
||||
|
||||
|
||||
def write_json(payload: list[dict], path: Path) -> None:
    """Write payload as 2-space-indented JSON plus a trailing newline.

    Parent directories are created as needed.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, indent=2)
    path.write_text(serialized + "\n")
|
||||
|
||||
|
||||
def render_configmap(apps_path: Path, queries_path: Path, vis_path: Path, output_path: Path) -> None:
    """Render the three JSON payload files into the committed ConfigMap manifest."""
    rendered = CONFIG_TEMPLATE.format(
        relative_path=output_path.relative_to(ROOT),
        applications=indent_payload(apps_path),
        queries=indent_payload(queries_path),
        visualizations=indent_payload(vis_path),
    )
    output_path.write_text(rendered)
|
||||
|
||||
|
||||
def indent_payload(path: Path) -> str:
    """Read a file and indent every line by four spaces (no trailing newline).

    Four spaces places the payload inside the ConfigMap's `|` block scalar.
    """
    indented = [f"    {line}" for line in path.read_text().splitlines()]
    return "\n".join(indented)
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: optionally rebuild payloads, then render the ConfigMap.

    With --build the JSON payloads are regenerated first; without it the
    ConfigMap is re-rendered from the committed JSON (which must exist).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--build", action="store_true", help="Regenerate JSON payloads and ConfigMap")
    args = parser.parse_args()

    if args.build:
        applications, queries, visualizations = build_objects()
        write_json(applications, APPS_PATH)
        write_json(queries, QUERIES_PATH)
        write_json(visualizations, VIS_PATH)

    # All three payloads are required for a render, whether just built or not.
    if not (APPS_PATH.exists() and QUERIES_PATH.exists() and VIS_PATH.exists()):
        raise SystemExit("Missing observability JSON payloads. Run with --build first.")

    render_configmap(APPS_PATH, QUERIES_PATH, VIS_PATH, CONFIG_PATH)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
149
scripts/monitoring_postmark_exporter.py
Normal file
149
scripts/monitoring_postmark_exporter.py
Normal file
@ -0,0 +1,149 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import datetime as dt
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
|
||||
import requests
|
||||
from prometheus_client import Gauge, Info, start_http_server
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Window:
    """A reporting window for Postmark outbound stats."""

    # value of the Prometheus `window` label (e.g. "7d")
    label: str
    # days back from today that the window starts (0 = today only)
    days: int
||||
|
||||
|
||||
# Reporting windows exposed via the `window` metric label.
WINDOWS = [
    Window("today", 0),
    Window("1d", 1),
    Window("7d", 7),
    Window("30d", 30),
]

# Runtime configuration, all via environment variables.
API_BASE = os.environ.get("POSTMARK_API_BASE", "https://api.postmarkapp.com").rstrip("/")
POLL_INTERVAL_SECONDS = int(os.environ.get("POLL_INTERVAL_SECONDS", "60"))
LISTEN_ADDRESS = os.environ.get("LISTEN_ADDRESS", "0.0.0.0")
LISTEN_PORT = int(os.environ.get("LISTEN_PORT", "8000"))

PRIMARY_TOKEN = os.environ.get("POSTMARK_SERVER_TOKEN", "").strip()
FALLBACK_TOKEN = os.environ.get("POSTMARK_SERVER_TOKEN_FALLBACK", "").strip()
# Which window's "sent" count is compared against the configured limit.
LIMIT_WINDOW = os.environ.get("POSTMARK_SENDING_LIMIT_WINDOW", "30d").strip()
LIMIT_RAW = os.environ.get("POSTMARK_SENDING_LIMIT", "").strip()
try:
    SENDING_LIMIT = float(LIMIT_RAW) if LIMIT_RAW else 0.0
except ValueError:
    # An unparseable limit behaves as "no limit configured".
    SENDING_LIMIT = 0.0

EXPORTER_INFO = Info("postmark_exporter", "Exporter build info")
EXPORTER_INFO.info(
    {
        "api_base": API_BASE,
        "windows": ",".join(window.label for window in WINDOWS),
    }
)

POSTMARK_API_UP = Gauge("postmark_api_up", "Whether Postmark API is reachable (1) or not (0)")
POSTMARK_LAST_SUCCESS = Gauge(
    "postmark_last_success_timestamp_seconds",
    "Unix timestamp of the last successful Postmark stats refresh",
)
# NOTE(review): declared as a Gauge although the `_total` suffix suggests a
# Counter would be conventional — confirm before changing the metric type,
# since dashboards may already query it.
POSTMARK_REQUEST_ERRORS = Gauge(
    "postmark_request_errors_total",
    "Total Postmark stats request errors since exporter start",
)

POSTMARK_OUTBOUND_SENT = Gauge(
    "postmark_outbound_sent",
    "Outbound emails sent within the selected window",
    labelnames=("window",),
)
POSTMARK_OUTBOUND_BOUNCED = Gauge(
    "postmark_outbound_bounced",
    "Outbound emails bounced within the selected window",
    labelnames=("window",),
)
POSTMARK_OUTBOUND_BOUNCE_RATE = Gauge(
    "postmark_outbound_bounce_rate",
    "Outbound bounce rate percentage within the selected window",
    labelnames=("window",),
)
POSTMARK_SENDING_LIMIT_GAUGE = Gauge(
    "postmark_sending_limit",
    "Configured Postmark sending limit for the active account",
)
POSTMARK_SENDING_LIMIT_USED = Gauge(
    "postmark_sending_limit_used",
    "Messages sent within the configured send limit window",
)
POSTMARK_SENDING_LIMIT_USED_PERCENT = Gauge(
    "postmark_sending_limit_used_percent",
    "Percent of the configured send limit used within the limit window",
)
|
||||
|
||||
|
||||
def fetch_outbound_stats(token: str, window: Window) -> dict:
    """GET /stats/outbound for a window ending today.

    Raises requests.HTTPError on non-2xx responses; returns the decoded JSON.
    """
    end = dt.date.today()
    start = end - dt.timedelta(days=window.days)
    response = requests.get(
        f"{API_BASE}/stats/outbound",
        headers={
            "Accept": "application/json",
            "X-Postmark-Server-Token": token,
        },
        params={"fromdate": start.isoformat(), "todate": end.isoformat()},
        timeout=15,
    )
    response.raise_for_status()
    return response.json()
|
||||
|
||||
|
||||
def update_metrics(token: str) -> None:
    """Refresh every outbound gauge from the Postmark stats API.

    Fetches each window, updates the per-window gauges, then derives the
    sending-limit gauges from the configured limit window's sent count.
    Propagates any fetch exception to the caller.
    """
    sent_by_window = {}
    for window in WINDOWS:
        data = fetch_outbound_stats(token, window)
        # API may return null for counts; coerce to 0.
        sent = int(data.get("Sent", 0) or 0)
        bounced = int(data.get("Bounced", 0) or 0)
        # Guard against division by zero when nothing was sent.
        rate = (bounced / sent * 100.0) if sent else 0.0
        sent_by_window[window.label] = sent
        POSTMARK_OUTBOUND_SENT.labels(window=window.label).set(sent)
        POSTMARK_OUTBOUND_BOUNCED.labels(window=window.label).set(bounced)
        POSTMARK_OUTBOUND_BOUNCE_RATE.labels(window=window.label).set(rate)

    POSTMARK_SENDING_LIMIT_GAUGE.set(SENDING_LIMIT)
    limit_window_sent = sent_by_window.get(LIMIT_WINDOW, 0)
    POSTMARK_SENDING_LIMIT_USED.set(limit_window_sent)
    if SENDING_LIMIT:
        POSTMARK_SENDING_LIMIT_USED_PERCENT.set(limit_window_sent / SENDING_LIMIT * 100.0)
    else:
        # No (or invalid) limit configured: report 0% rather than dividing by zero.
        POSTMARK_SENDING_LIMIT_USED_PERCENT.set(0.0)
|
||||
|
||||
|
||||
def main() -> None:
    """Serve Prometheus metrics and poll the Postmark API forever.

    Tokens are tried in priority order each cycle (primary first, then the
    fallback).  The previous implementation rotated the token on every
    iteration regardless of success, so every other scrape reported the
    fallback account's stats even when the primary token was healthy.
    """
    if not PRIMARY_TOKEN and not FALLBACK_TOKEN:
        raise SystemExit("POSTMARK_SERVER_TOKEN or POSTMARK_SERVER_TOKEN_FALLBACK is required")

    start_http_server(LISTEN_PORT, addr=LISTEN_ADDRESS)

    tokens = [token for token in (PRIMARY_TOKEN, FALLBACK_TOKEN) if token]

    while True:
        for token in tokens:
            try:
                update_metrics(token)
            except Exception as exc:  # noqa: BLE001
                POSTMARK_REQUEST_ERRORS.inc()
                print(f"postmark_exporter: refresh failed: {exc}", flush=True)
            else:
                POSTMARK_API_UP.set(1)
                POSTMARK_LAST_SUCCESS.set(time.time())
                break
        else:
            # Every configured token failed this cycle.
            POSTMARK_API_UP.set(0)
        time.sleep(POLL_INTERVAL_SECONDS)
|
||||
|
||||
|
||||
# Script entry point: start the metrics server and poll loop.
if __name__ == "__main__":
    main()
|
||||
35
scripts/monitoring_render_postmark_exporter.py
Normal file
35
scripts/monitoring_render_postmark_exporter.py
Normal file
@ -0,0 +1,35 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def indent(text: str, spaces: int) -> str:
    """Prefix every non-blank line of ``text`` with ``spaces`` spaces.

    Lines consisting only of a newline are left untouched so the rendered
    YAML block scalar carries no trailing whitespace on blank lines.
    """
    pad = " " * spaces
    out = []
    for line in text.splitlines(keepends=True):
        out.append(pad + line if line.strip("\n") else line)
    return "".join(out)
|
||||
|
||||
|
||||
def main() -> None:
    """Render the exporter script into its ConfigMap manifest.

    Reads scripts/monitoring_postmark_exporter.py and rewrites
    services/monitoring/postmark-exporter-script.yaml with the script
    embedded as a YAML block scalar.
    """
    repo_root = Path(__file__).resolve().parents[1]
    script_path = repo_root / "scripts" / "monitoring_postmark_exporter.py"
    manifest_path = repo_root / "services" / "monitoring" / "postmark-exporter-script.yaml"

    body = script_path.read_text(encoding="utf-8")
    # The block scalar needs a trailing newline to terminate cleanly.
    if not body.endswith("\n"):
        body += "\n"

    header = (
        "# services/monitoring/postmark-exporter-script.yaml\n"
        "apiVersion: v1\n"
        "kind: ConfigMap\n"
        "metadata:\n"
        "  name: postmark-exporter-script\n"
        "data:\n"
        "  monitoring_postmark_exporter.py: |\n"
    )
    manifest_path.write_text(header + indent(body, 4), encoding="utf-8")
|
||||
|
||||
|
||||
# Script entry point: re-render the ConfigMap manifest.
if __name__ == "__main__":
    main()
|
||||
@ -1,49 +0,0 @@
|
||||
#!/bin/bash
# Sync Keycloak realm users into the Nextcloud Mail app: one IMAP/SMTP
# account per user, authenticated with the user's mailu_app_password
# attribute.  Intended to run inside the Nextcloud container.
set -euo pipefail

# Required connection/auth settings; abort immediately if any is unset.
KC_BASE="${KC_BASE:?}"
KC_REALM="${KC_REALM:?}"
KC_ADMIN_USER="${KC_ADMIN_USER:?}"
KC_ADMIN_PASS="${KC_ADMIN_PASS:?}"

# jq/curl are needed below; install them on first run in the container.
if ! command -v jq >/dev/null 2>&1; then
  apt-get update && apt-get install -y jq curl >/dev/null
fi

account_exists() {
  # Skip if the account email is already present in the mail app.
  # NOTE(review): matches the email with a space on either side to avoid
  # substring hits in the table output — confirm against the actual
  # `occ mail:account:list` output format.
  runuser -u www-data -- php occ mail:account:list 2>/dev/null | grep -Fq " ${1}" || \
    runuser -u www-data -- php occ mail:account:list 2>/dev/null | grep -Fq "${1} "
}

# Obtain a Keycloak admin token via the admin-cli password grant.
token=$(
  curl -s -d "grant_type=password" \
    -d "client_id=admin-cli" \
    -d "username=${KC_ADMIN_USER}" \
    -d "password=${KC_ADMIN_PASS}" \
    "${KC_BASE}/realms/master/protocol/openid-connect/token" | jq -r '.access_token'
)

if [[ -z "${token}" || "${token}" == "null" ]]; then
  echo "Failed to obtain admin token"
  exit 1
fi

# Fetch up to 2000 realm users in a single page.
users=$(curl -s -H "Authorization: Bearer ${token}" \
  "${KC_BASE}/admin/realms/${KC_REALM}/users?max=2000")

echo "${users}" | jq -c '.[]' | while read -r user; do
  username=$(echo "${user}" | jq -r '.username')
  email=$(echo "${user}" | jq -r '.email // empty')
  app_pw=$(echo "${user}" | jq -r '.attributes.mailu_app_password[0] // empty')
  # Users without an email or an app password cannot be provisioned.
  [[ -z "${email}" || -z "${app_pw}" ]] && continue
  if account_exists "${email}"; then
    echo "Skipping ${email}, already exists"
    continue
  fi
  echo "Syncing ${email}"
  # IMAP over SSL (993) plus SMTP submission with STARTTLS (587); a failure
  # for one user must not abort the loop, hence the trailing `|| true`.
  runuser -u www-data -- php occ mail:account:create \
    "${username}" "${username}" "${email}" \
    mail.bstein.dev 993 ssl "${email}" "${app_pw}" \
    mail.bstein.dev 587 tls "${email}" "${app_pw}" login || true
done
|
||||
@ -1,65 +0,0 @@
|
||||
#!/bin/bash
# Nextcloud maintenance: apply Atlas theming, set the default quota, and
# reseed the External Sites app with the Atlas service links.
set -euo pipefail

NC_URL="${NC_URL:-https://cloud.bstein.dev}"
ADMIN_USER="${ADMIN_USER:?}"
ADMIN_PASS="${ADMIN_PASS:?}"

# Non-interactive install of the tooling used below.
export DEBIAN_FRONTEND=noninteractive
apt-get update -qq
apt-get install -y -qq curl jq >/dev/null

# Run an occ command as the web server user.
run_occ() {
  runuser -u www-data -- php occ "$@"
}

# Timestamped progress logging.
log() { echo "[$(date -Is)] $*"; }

log "Applying Atlas theming"
run_occ theming:config name "Atlas Cloud"
run_occ theming:config slogan "Unified access to Atlas services"
run_occ theming:config url "https://cloud.bstein.dev"
run_occ theming:config color "#0f172a"
run_occ theming:config disable-user-theming yes

log "Setting default quota to 200 GB"
run_occ config:app:set files default_quota --value "200 GB"

# External Sites OCS endpoints; all requests authenticate as the admin.
API_BASE="${NC_URL}/ocs/v2.php/apps/external/api/v1"
AUTH=(-u "${ADMIN_USER}:${ADMIN_PASS}" -H "OCS-APIRequest: true")

log "Removing existing external links"
existing=$(curl -sf "${AUTH[@]}" "${API_BASE}?format=json" | jq -r '.ocs.data[].id // empty')
for id in ${existing}; do
  # Best-effort delete; a missing id must not abort the run.
  curl -sf "${AUTH[@]}" -X DELETE "${API_BASE}/sites/${id}?format=json" >/dev/null || true
done

# "Name|URL" pairs to seed as external links.
SITES=(
  "Vaultwarden|https://vault.bstein.dev"
  "Jellyfin|https://stream.bstein.dev"
  "Gitea|https://scm.bstein.dev"
  "Jenkins|https://ci.bstein.dev"
  "Harbor|https://registry.bstein.dev"
  "Vault|https://secret.bstein.dev"
  "Jitsi|https://meet.bstein.dev"
  "Grafana|https://metrics.bstein.dev"
  "Chat LLM|https://chat.ai.bstein.dev"
  "Vision|https://draw.ai.bstein.dev"
  "STT/TTS|https://talk.ai.bstein.dev"
)

log "Seeding external links"
for entry in "${SITES[@]}"; do
  IFS="|" read -r name url <<<"${entry}"
  curl -sf "${AUTH[@]}" -X POST "${API_BASE}/sites?format=json" \
    -d "name=${name}" \
    -d "url=${url}" \
    -d "lang=" \
    -d "type=link" \
    -d "device=" \
    -d "icon=" \
    -d "groups[]=" \
    -d "redirect=1" >/dev/null
done

log "Maintenance run completed"
|
||||
509
scripts/test_atlas_user_cleanup.py
Executable file
509
scripts/test_atlas_user_cleanup.py
Executable file
@ -0,0 +1,509 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Clean up Atlas test users and portal requests (manual-only).
|
||||
|
||||
Default behavior is DRY RUN. This script is intended for operators to clean up
|
||||
test accounts created via the bstein-dev-home onboarding portal.
|
||||
|
||||
Targets (best-effort):
|
||||
- Keycloak users in realm "atlas"
|
||||
- Atlas portal Postgres rows (access_requests + dependent tables)
|
||||
- Vaultwarden users/invites created by the portal
|
||||
|
||||
Safety:
|
||||
- Requires an explicit username prefix (e.g. "test-")
|
||||
- Dry-run unless --apply is set
|
||||
- --apply requires an explicit --confirm guard
|
||||
- Validates prefixes to a conservative charset
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Iterable
|
||||
|
||||
|
||||
_SAFE_PREFIX_RE = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9._-]{0,63}$")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class KeycloakUser:
    """A Keycloak user as returned by the admin users API."""

    user_id: str  # Keycloak internal id ("id" field)
    username: str
    email: str  # may be empty when the account has no email set
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PortalRequestRow:
    """One row from the portal's access_requests table."""

    request_code: str
    username: str
    status: str
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class VaultwardenUser:
    """A Vaultwarden user as listed by the /admin/users endpoint."""

    user_id: str
    email: str
    status: int  # Vaultwarden "_status"; normalized to -1 when missing/non-int
|
||||
|
||||
|
||||
def _run(cmd: list[str], *, input_bytes: bytes | None = None) -> str:
|
||||
proc = subprocess.run(
|
||||
cmd,
|
||||
input=input_bytes,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
check=False,
|
||||
)
|
||||
if proc.returncode != 0:
|
||||
stderr = proc.stderr.decode("utf-8", errors="replace").strip()
|
||||
raise RuntimeError(f"command failed ({proc.returncode}): {' '.join(cmd)}\n{stderr}")
|
||||
return proc.stdout.decode("utf-8", errors="replace")
|
||||
|
||||
|
||||
def _kubectl_get_secret_value(namespace: str, name: str, key: str) -> str:
    """Read and base64-decode one key from a Kubernetes secret."""
    jsonpath = f"jsonpath={{.data.{key}}}"
    encoded = _run(
        ["kubectl", "-n", namespace, "get", "secret", name, "-o", jsonpath]
    ).strip()
    if not encoded:
        raise RuntimeError(f"secret {namespace}/{name} key {key} is empty")
    return base64.b64decode(encoded).decode("utf-8").strip()
|
||||
|
||||
|
||||
def _kubectl_first_pod(namespace: str) -> str:
    """Return the name of the first pod listed in ``namespace``."""
    listing = json.loads(_run(["kubectl", "-n", namespace, "get", "pods", "-o", "json"]))
    pods = listing.get("items") or []
    if not isinstance(pods, list) or not pods:
        raise RuntimeError(f"no pods found in namespace {namespace}")
    name = pods[0].get("metadata", {}).get("name")
    if not isinstance(name, str) or not name:
        raise RuntimeError(f"unexpected pod list in namespace {namespace}")
    return name
|
||||
|
||||
|
||||
def _validate_prefixes(prefixes: list[str]) -> list[str]:
    """Strip, validate, and return the non-empty prefixes in order.

    Exits with an error for any prefix outside the conservative charset,
    and when nothing usable remains after stripping.
    """
    cleaned: list[str] = []
    for raw in prefixes:
        candidate = raw.strip()
        if not candidate:
            continue
        if not _SAFE_PREFIX_RE.match(candidate):
            raise SystemExit(
                f"invalid prefix '{candidate}': must match {_SAFE_PREFIX_RE.pattern} (alnum plus ._-)"
            )
        cleaned.append(candidate)
    if not cleaned:
        raise SystemExit("at least one --prefix is required")
    return cleaned
|
||||
|
||||
|
||||
def _starts_with_any(value: str, prefixes: Iterable[str]) -> bool:
|
||||
return any(value.startswith(p) for p in prefixes)
|
||||
|
||||
|
||||
def _keycloak_token(server: str, realm: str, client_id: str, client_secret: str) -> str:
    """Fetch a Keycloak access token via the client_credentials grant."""
    form = {
        "grant_type": "client_credentials",
        "client_id": client_id,
        "client_secret": client_secret,
    }
    request = urllib.request.Request(
        f"{server}/realms/{realm}/protocol/openid-connect/token",
        data=urllib.parse.urlencode(form).encode("utf-8"),
        method="POST",
    )
    request.add_header("Content-Type", "application/x-www-form-urlencoded")
    with urllib.request.urlopen(request, timeout=15) as resp:
        body = json.loads(resp.read().decode("utf-8"))
    token = body.get("access_token")
    if not isinstance(token, str) or not token:
        raise RuntimeError("failed to obtain keycloak access token")
    return token
|
||||
|
||||
|
||||
def _keycloak_list_users(server: str, realm: str, token: str, search: str) -> list[KeycloakUser]:
    """Search realm users (up to 1000) and parse them into KeycloakUser records.

    Entries without a usable id/username are silently skipped.
    """
    query = urllib.parse.urlencode({"max": "1000", "search": search})
    request = urllib.request.Request(f"{server}/admin/realms/{realm}/users?{query}", method="GET")
    request.add_header("Authorization", f"Bearer {token}")
    with urllib.request.urlopen(request, timeout=30) as resp:
        payload = json.loads(resp.read().decode("utf-8"))
    if not isinstance(payload, list):
        raise RuntimeError("unexpected keycloak users response")
    parsed: list[KeycloakUser] = []
    for entry in payload:
        if not isinstance(entry, dict):
            continue
        user_id = entry.get("id")
        username = entry.get("username") or ""
        if not isinstance(user_id, str) or not user_id or not isinstance(username, str):
            continue
        parsed.append(
            KeycloakUser(user_id=user_id, username=username, email=str(entry.get("email") or ""))
        )
    return parsed
|
||||
|
||||
|
||||
def _keycloak_delete_user(server: str, realm: str, token: str, user_id: str) -> None:
    """Delete one realm user; a 404 (already gone) is treated as success."""
    request = urllib.request.Request(
        f"{server}/admin/realms/{realm}/users/{user_id}", method="DELETE"
    )
    request.add_header("Authorization", f"Bearer {token}")
    try:
        with urllib.request.urlopen(request, timeout=30) as resp:
            resp.read()
    except urllib.error.HTTPError as exc:
        # NOTE(review): urllib.error resolves only because urllib.request
        # imports it transitively — consider an explicit `import urllib.error`.
        if exc.code != 404:
            raise
|
||||
|
||||
|
||||
def _psql_json(portal_db_url: str, sql: str) -> list[dict[str, Any]]:
    """Run ``sql`` through psql inside the first pod of the postgres namespace.

    Output is parsed from unaligned tab-separated rows (-At -F "\\t"); each
    returned dict has a single "cols" key with the raw column strings.
    """
    pod = _kubectl_first_pod("postgres")
    output = _run(
        [
            "kubectl", "-n", "postgres", "exec", "-i", pod, "--",
            "psql", portal_db_url, "-At", "-F", "\t", "-c", sql,
        ]
    )
    return [{"cols": line.split("\t")} for line in output.splitlines()]
|
||||
|
||||
|
||||
def _portal_list_requests(portal_db_url: str, prefixes: list[str]) -> list[PortalRequestRow]:
    """List access_requests whose username starts with any of ``prefixes``.

    Prefixes are already restricted by _validate_prefixes to alnum plus
    ``._-`` (no quotes), so interpolating them into the SQL is injection-safe;
    LIKE wildcards still need escaping, since an underscore in a prefix would
    otherwise match ANY single character.
    """

    def _like(prefix: str) -> str:
        # Escape LIKE specials; backslash is PostgreSQL's default escape char.
        return prefix.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")

    clauses = " OR ".join([f"username LIKE '{_like(p)}%'" for p in prefixes])
    sql = (
        "SELECT request_code, username, status "
        "FROM access_requests "
        f"WHERE {clauses} "
        "ORDER BY created_at DESC;"
    )
    raw_rows = _psql_json(portal_db_url, sql)
    parsed: list[PortalRequestRow] = []
    for row in raw_rows:
        cols = row.get("cols") or []
        if len(cols) < 3:
            # Blank/short lines from psql output are not data rows.
            continue
        parsed.append(PortalRequestRow(request_code=cols[0], username=cols[1], status=cols[2]))
    return parsed
|
||||
|
||||
|
||||
def _portal_delete_requests(portal_db_url: str, prefixes: list[str]) -> int:
    """Delete matching access_requests rows and return the deleted count.

    Fixes two defects:
      * LIKE wildcards in prefixes are escaped (an underscore would otherwise
        match any single character).
      * The "DELETE <n>" tally regex previously used doubled backslashes
        inside a raw string (r"DELETE\\s+(\\d+)"), which matched literal
        backslashes — it could never match psql's command tag, so the
        function always reported 0 deletions.
    """
    escaped = [
        p.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_") for p in prefixes
    ]
    clauses = " OR ".join([f"username LIKE '{p}%'" for p in escaped])
    sql = f"DELETE FROM access_requests WHERE {clauses};"
    postgres_pod = _kubectl_first_pod("postgres")
    out = _run(
        [
            "kubectl", "-n", "postgres", "exec", "-i", postgres_pod, "--",
            "psql", portal_db_url, "-c", sql,
        ]
    )
    # psql prints a "DELETE <n>" command tag on success.
    match = re.search(r"DELETE\s+(\d+)", out)
    return int(match.group(1)) if match else 0
|
||||
|
||||
|
||||
def _vaultwarden_admin_cookie(admin_token: str, base_url: str) -> str:
    """Log into the Vaultwarden admin page and return its session cookie."""
    request = urllib.request.Request(
        f"{base_url}/admin",
        data=urllib.parse.urlencode({"token": admin_token}).encode("utf-8"),
        method="POST",
    )
    request.add_header("Content-Type", "application/x-www-form-urlencoded")
    try:
        with urllib.request.urlopen(request, timeout=10) as resp:
            set_cookie = resp.headers.get("Set-Cookie") or ""
    except urllib.error.HTTPError as exc:
        # Vaultwarden throttles the admin login endpoint aggressively.
        if exc.code == 429:
            raise RuntimeError("vaultwarden admin rate limited (HTTP 429)") from exc
        raise
    cookie = set_cookie.split(";", 1)[0].strip()
    if not cookie:
        raise RuntimeError("vaultwarden admin cookie missing")
    return cookie
|
||||
|
||||
|
||||
def _vaultwarden_list_users(base_url: str, cookie: str) -> list[VaultwardenUser]:
    """Fetch every user from the Vaultwarden admin API.

    Entries without a string id/email are skipped; a missing or non-integer
    "_status" field is normalized to -1.
    """
    request = urllib.request.Request(f"{base_url}/admin/users", method="GET")
    request.add_header("Cookie", cookie)
    try:
        with urllib.request.urlopen(request, timeout=30) as resp:
            payload = json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        if exc.code == 429:
            raise RuntimeError("vaultwarden admin rate limited (HTTP 429)") from exc
        raise
    if not isinstance(payload, list):
        raise RuntimeError("unexpected vaultwarden /admin/users response")
    parsed: list[VaultwardenUser] = []
    for entry in payload:
        if not isinstance(entry, dict):
            continue
        user_id = entry.get("id")
        email = entry.get("email")
        if not (isinstance(user_id, str) and user_id and isinstance(email, str) and email):
            continue
        status = entry.get("_status")
        parsed.append(
            VaultwardenUser(
                user_id=user_id,
                email=email,
                status=status if isinstance(status, int) else -1,
            )
        )
    return parsed
|
||||
|
||||
|
||||
def _vaultwarden_delete_user(base_url: str, cookie: str, user_id: str) -> None:
    """Delete one Vaultwarden user; 404 (already gone) is treated as success."""
    request = urllib.request.Request(f"{base_url}/admin/users/{user_id}", method="DELETE")
    request.add_header("Cookie", cookie)
    try:
        with urllib.request.urlopen(request, timeout=30) as resp:
            resp.read()
    except urllib.error.HTTPError as exc:
        if exc.code == 404:
            return
        if exc.code == 429:
            raise RuntimeError("vaultwarden admin rate limited (HTTP 429)") from exc
        raise
|
||||
|
||||
|
||||
def _port_forward(namespace: str, target: str, local_port: int, remote_port: int) -> subprocess.Popen[bytes]:
    """Start ``kubectl port-forward`` to ``target`` and return the process.

    stdout/stderr are discarded to avoid leaking internal details in output.
    The caller is responsible for terminating the returned process.
    """
    cmd = [
        "kubectl",
        "-n",
        namespace,
        "port-forward",
        target,
        f"{local_port}:{remote_port}",
        "--address",
        "127.0.0.1",
    ]
    return subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
||||
|
||||
|
||||
def main() -> int:
    """Entry point: enumerate and (with --apply --confirm) delete test users.

    Order of operations: portal DB rows first, then Keycloak users, then
    Vaultwarden users via a temporary port-forward.  Returns 0 on success,
    1 when the Vaultwarden step fails.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--prefix",
        action="append",
        default=[],
        help="Username prefix to match (repeatable). Example: --prefix test-",
    )
    parser.add_argument(
        "--apply",
        action="store_true",
        help="Actually delete; otherwise dry-run only.",
    )
    parser.add_argument(
        "--confirm",
        default="",
        help=(
            "Required when using --apply. Must exactly equal the comma-separated "
            "sorted prefix list (e.g. 'atlas-,bob-,e2e-,test-')."
        ),
    )
    parser.add_argument("--skip-keycloak", action="store_true", help="Skip Keycloak user deletion.")
    parser.add_argument("--skip-portal-db", action="store_true", help="Skip portal DB cleanup.")
    parser.add_argument("--skip-vaultwarden", action="store_true", help="Skip Vaultwarden cleanup.")
    parser.add_argument(
        "--protect-keycloak-username",
        action="append",
        default=[],
        help="Keycloak usernames that must never be deleted (repeatable).",
    )
    parser.add_argument(
        "--protect-vaultwarden-email",
        action="append",
        default=[],
        help="Vaultwarden emails that must never be deleted (repeatable).",
    )
    args = parser.parse_args()

    # Normalize inputs; --confirm must equal the sorted, comma-joined prefixes.
    prefixes = sorted(set(_validate_prefixes(args.prefix)))
    apply = bool(args.apply)
    expected_confirm = ",".join(prefixes)
    # "bstein" and "robotuser" are always protected, regardless of flags.
    protected_keycloak = {"bstein", "robotuser", *[u.strip() for u in args.protect_keycloak_username if u.strip()]}
    protected_vaultwarden = {e.strip() for e in args.protect_vaultwarden_email if e.strip()}

    if apply and args.confirm != expected_confirm:
        raise SystemExit(
            f"refusing to apply without --confirm '{expected_confirm}' (got '{args.confirm}')"
        )

    # Banner describing what is about to happen.
    print("Atlas test-user cleanup")
    print("prefixes:", expected_confirm)
    print("mode:", "APPLY (destructive)" if apply else "DRY RUN (no changes)")
    if protected_keycloak:
        print("protected keycloak usernames:", ", ".join(sorted(protected_keycloak)))
    if protected_vaultwarden:
        print("protected vaultwarden emails:", ", ".join(sorted(protected_vaultwarden)))
    print()

    # --- Portal database -------------------------------------------------
    if not args.skip_portal_db:
        portal_db_url = _kubectl_get_secret_value("bstein-dev-home", "atlas-portal-db", "PORTAL_DATABASE_URL")
        requests = _portal_list_requests(portal_db_url, prefixes)
        print(f"Portal DB: {len(requests)} access_requests matched")
        # Only the first 50 matches are listed to keep output readable.
        for row in requests[:50]:
            print(f"  {row.request_code}\t{row.status}\t{row.username}")
        if len(requests) > 50:
            print(f"  ... and {len(requests) - 50} more")
        if apply and requests:
            deleted = _portal_delete_requests(portal_db_url, prefixes)
            print(f"Portal DB: deleted {deleted} access_requests (cascade removes tasks/steps/artifacts).")
        print()

    # --- Keycloak ---------------------------------------------------------
    if not args.skip_keycloak:
        kc_server = os.getenv("KEYCLOAK_PUBLIC_URL", "https://sso.bstein.dev").rstrip("/")
        kc_realm = os.getenv("KEYCLOAK_REALM", "atlas")
        kc_client_id = os.getenv("KEYCLOAK_ADMIN_CLIENT_ID", "bstein-dev-home-admin")
        kc_client_secret = _kubectl_get_secret_value(
            "bstein-dev-home", "bstein-dev-home-keycloak-admin", "client_secret"
        )
        token = _keycloak_token(kc_server, kc_realm, kc_client_id, kc_client_secret)
        # Keycloak search is fuzzy: re-check the prefix and dedupe by user id.
        found: dict[str, KeycloakUser] = {}
        for prefix in prefixes:
            for user in _keycloak_list_users(kc_server, kc_realm, token, prefix):
                if not _starts_with_any(user.username, prefixes):
                    continue
                if user.username in protected_keycloak:
                    continue
                found[user.user_id] = user
        users = list(found.values())
        users.sort(key=lambda u: u.username)
        print(f"Keycloak: {len(users)} users matched")
        for user in users[:50]:
            email = user.email or "-"
            print(f"  {user.username}\t{email}\t{user.user_id}")
        if len(users) > 50:
            print(f"  ... and {len(users) - 50} more")
        if apply and users:
            for user in users:
                _keycloak_delete_user(kc_server, kc_realm, token, user.user_id)
            print(f"Keycloak: deleted {len(users)} users.")
        print()

    # --- Vaultwarden (via temporary port-forward) ------------------------
    if not args.skip_vaultwarden:
        pf = _port_forward("vaultwarden", "svc/vaultwarden-service", 18081, 80)
        try:
            # wait briefly for the port-forward to come up
            for _ in range(30):
                try:
                    urllib.request.urlopen("http://127.0.0.1:18081/", timeout=1).read(1)
                    break
                except Exception:
                    time.sleep(0.2)

            admin_token = _kubectl_get_secret_value("vaultwarden", "vaultwarden-admin", "ADMIN_TOKEN")
            base_url = "http://127.0.0.1:18081"
            try:
                # Login and listing are retried with exponential backoff
                # (capped at 60s) because the admin endpoint rate limits.
                cookie = ""
                for attempt in range(7):
                    try:
                        cookie = _vaultwarden_admin_cookie(admin_token, base_url)
                        break
                    except RuntimeError as exc:
                        if "rate limited" in str(exc).lower():
                            time.sleep(min(60.0, 2.0**attempt))
                            continue
                        raise
                if not cookie:
                    raise RuntimeError("vaultwarden admin login repeatedly rate limited")

                users: list[VaultwardenUser] = []
                for attempt in range(7):
                    try:
                        users = _vaultwarden_list_users(base_url, cookie)
                        break
                    except RuntimeError as exc:
                        if "rate limited" in str(exc).lower():
                            time.sleep(min(60.0, 2.0**attempt))
                            continue
                        raise
                # NOTE(review): an instance with genuinely zero users also
                # trips this error path — acceptable for this deployment.
                if not users:
                    raise RuntimeError("vaultwarden user list unavailable (possibly rate limited)")
            except RuntimeError as exc:
                print(f"Vaultwarden: ERROR: {exc}")
                print()
                return 1
            # Match on the local part of the email against the same prefixes.
            matched: list[VaultwardenUser] = []
            for user in users:
                local = user.email.split("@", 1)[0]
                if _starts_with_any(local, prefixes):
                    if user.email in protected_vaultwarden:
                        continue
                    matched.append(user)
            matched.sort(key=lambda u: u.email)
            print(f"Vaultwarden: {len(matched)} users matched")
            for user in matched[:50]:
                print(f"  {user.email}\tstatus={user.status}\t{user.user_id}")
            if len(matched) > 50:
                print(f"  ... and {len(matched) - 50} more")
            if apply and matched:
                for user in matched:
                    _vaultwarden_delete_user(base_url, cookie, user.user_id)
                print(f"Vaultwarden: deleted {len(matched)} users.")
            print()
        finally:
            # Always tear down the port-forward, even on early return.
            pf.terminate()
            try:
                pf.wait(timeout=3)
            except Exception:
                pf.kill()
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
276
scripts/test_user_cleanup.py
Executable file
276
scripts/test_user_cleanup.py
Executable file
@ -0,0 +1,276 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Iterable
|
||||
from urllib.parse import quote
|
||||
|
||||
import httpx
|
||||
|
||||
from atlas_portal import db, settings
|
||||
from atlas_portal.keycloak import admin_client
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class KeycloakUser:
    """A Keycloak user (id + username) from the admin users API."""

    id: str  # Keycloak internal id
    username: str
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PortalRequest:
    """One access_requests row from the portal database."""

    request_code: str
    username: str
    status: str
|
||||
|
||||
|
||||
def _dedupe_by_id(users: Iterable[KeycloakUser]) -> list[KeycloakUser]:
|
||||
seen: set[str] = set()
|
||||
out: list[KeycloakUser] = []
|
||||
for user in users:
|
||||
if user.id in seen:
|
||||
continue
|
||||
seen.add(user.id)
|
||||
out.append(user)
|
||||
return out
|
||||
|
||||
|
||||
def _iter_keycloak_users_for_prefix(prefix: str, max_results: int) -> list[KeycloakUser]:
    """Query Keycloak for users whose username starts with ``prefix``.

    Keycloak's search can return fuzzy matches, so a strict prefix match is
    re-applied client-side; service accounts are always excluded.
    """
    client = admin_client()
    if not client.ready():
        raise RuntimeError("keycloak admin client not configured in this environment")

    url = f"{settings.KEYCLOAK_ADMIN_URL}/admin/realms/{settings.KEYCLOAK_REALM}/users"
    params = {"search": prefix, "max": str(max_results), "briefRepresentation": "true"}
    with httpx.Client(timeout=settings.HTTP_CHECK_TIMEOUT_SEC) as http:
        resp = http.get(url, params=params, headers=client.headers())
        resp.raise_for_status()
        payload = resp.json()

    if not isinstance(payload, list):
        return []

    matches: list[KeycloakUser] = []
    for entry in payload:
        if not isinstance(entry, dict):
            continue
        name = entry.get("username")
        uid = entry.get("id")
        if not isinstance(name, str) or not isinstance(uid, str):
            continue
        if not name.startswith(prefix) or name.startswith("service-account-"):
            continue
        matches.append(KeycloakUser(id=uid, username=name))
    return matches
|
||||
|
||||
|
||||
def _find_keycloak_users(prefixes: list[str], max_results: int, protected: set[str]) -> list[KeycloakUser]:
    """Collect matching users across all prefixes, deduped, minus ``protected``."""
    collected: list[KeycloakUser] = []
    for prefix in prefixes:
        collected += _iter_keycloak_users_for_prefix(prefix, max_results=max_results)
    return [user for user in _dedupe_by_id(collected) if user.username not in protected]
|
||||
|
||||
|
||||
def _delete_keycloak_users(users: list[KeycloakUser]) -> None:
    """Delete the given Keycloak users; a 404 is ignored for idempotency."""
    if not users:
        return

    client = admin_client()
    if not client.ready():
        raise RuntimeError("keycloak admin client not configured in this environment")

    base = f"{settings.KEYCLOAK_ADMIN_URL}/admin/realms/{settings.KEYCLOAK_REALM}/users"
    with httpx.Client(timeout=settings.HTTP_CHECK_TIMEOUT_SEC) as http:
        for user in users:
            response = http.delete(f"{base}/{quote(user.id, safe='')}", headers=client.headers())
            # Deleting a non-existent user is treated as success.
            if response.status_code != 404:
                response.raise_for_status()
|
||||
|
||||
|
||||
def _find_portal_requests(prefixes: list[str], max_results: int) -> list[PortalRequest]:
    """Fetch matching access_requests rows; [] when the DB is not configured."""
    if not db.configured():
        return []

    raw_rows: list[dict[str, Any]] = []
    with db.connect() as conn:
        for prefix in prefixes:
            cursor = conn.execute(
                """
                SELECT request_code, username, status
                FROM access_requests
                WHERE username LIKE %s
                ORDER BY created_at DESC
                LIMIT %s
                """,
                (f"{prefix}%", max_results),
            )
            batch = cursor.fetchall()
            if isinstance(batch, list):
                raw_rows.extend(r for r in batch if isinstance(r, dict))

    requests: list[PortalRequest] = []
    for row in raw_rows:
        code = row.get("request_code")
        name = row.get("username")
        state = row.get("status")
        # Skip rows with unexpected/missing column types.
        if isinstance(code, str) and isinstance(name, str) and isinstance(state, str):
            requests.append(PortalRequest(request_code=code, username=name, status=state))
    return requests
|
||||
|
||||
|
||||
def _delete_portal_requests(prefixes: list[str]) -> int:
    """Delete access_requests rows whose username starts with any prefix.

    Returns the number of deleted rows (0 when the DB is not configured).
    Unlike the values here, prefixes are not charset-validated upstream, so
    LIKE wildcard characters are escaped: otherwise a prefix of "test_"
    would match "testX" (``_`` is a single-character wildcard) and a bare
    "%" would match every row.
    """
    if not db.configured():
        return 0

    deleted = 0
    with db.connect() as conn:
        for prefix in prefixes:
            # Backslash is PostgreSQL's default LIKE escape character.
            escaped = (
                prefix.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
            )
            cursor = conn.execute(
                "DELETE FROM access_requests WHERE username LIKE %s",
                (f"{escaped}%",),
            )
            deleted += cursor.rowcount or 0
    return deleted
|
||||
|
||||
|
||||
def _summarize_portal_requests(rows: list[PortalRequest]) -> dict[str, int]:
|
||||
counts: dict[str, int] = defaultdict(int)
|
||||
for row in rows:
|
||||
counts[row.status] += 1
|
||||
return dict(counts)
|
||||
|
||||
|
||||
def _parse_args(argv: list[str]) -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(
|
||||
prog="test_user_cleanup",
|
||||
description=(
|
||||
"Manual-only cleanup for test users/requests. "
|
||||
"This script is intended to be run inside the bstein-dev-home backend container."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--prefix",
|
||||
action="append",
|
||||
required=True,
|
||||
help="Username prefix to target (repeatable). Example: --prefix test-",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max",
|
||||
type=int,
|
||||
default=500,
|
||||
help="Maximum users/requests to enumerate per prefix (default: 500).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--apply",
|
||||
action="store_true",
|
||||
help="Apply deletions (default is dry-run). Requires --confirm.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--confirm",
|
||||
default="",
|
||||
help="Required when using --apply. Must exactly equal the comma-separated prefix list.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-keycloak",
|
||||
action="store_true",
|
||||
help="Skip deleting Keycloak users.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-portal",
|
||||
action="store_true",
|
||||
help="Skip deleting portal (DB) access requests.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--protect",
|
||||
action="append",
|
||||
default=[],
|
||||
help="Extra usernames to never delete (repeatable).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
help="List matched usernames/request codes.",
|
||||
)
|
||||
return parser.parse_args(argv)
|
||||
|
||||
|
||||
def main(argv: list[str]) -> int:
|
||||
args = _parse_args(argv)
|
||||
prefixes = sorted({p.strip() for p in args.prefix if p.strip()})
|
||||
if not prefixes:
|
||||
print("error: no valid --prefix values provided", file=sys.stderr)
|
||||
return 2
|
||||
|
||||
expected_confirm = ",".join(prefixes)
|
||||
protected = {"bstein", "robotuser", *[p.strip() for p in args.protect if p.strip()]}
|
||||
|
||||
if args.apply and args.confirm != expected_confirm:
|
||||
print(
|
||||
f"error: refusing to apply without --confirm '{expected_confirm}' (got '{args.confirm}')",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return 2
|
||||
|
||||
keycloak_users: list[KeycloakUser] = []
|
||||
portal_requests: list[PortalRequest] = []
|
||||
|
||||
if not args.skip_keycloak:
|
||||
keycloak_users = _find_keycloak_users(prefixes, max_results=args.max, protected=protected)
|
||||
|
||||
if not args.skip_portal:
|
||||
portal_requests = _find_portal_requests(prefixes, max_results=args.max)
|
||||
|
||||
print(f"prefixes: {expected_confirm}")
|
||||
print(f"mode: {'APPLY' if args.apply else 'DRY-RUN'}")
|
||||
if protected:
|
||||
print(f"protected usernames: {', '.join(sorted(protected))}")
|
||||
|
||||
if not args.skip_keycloak:
|
||||
print(f"keycloak users matched: {len(keycloak_users)}")
|
||||
if args.verbose and keycloak_users:
|
||||
for user in sorted(keycloak_users, key=lambda u: u.username):
|
||||
print(f" - {user.username}")
|
||||
|
||||
if not args.skip_portal:
|
||||
print(f"portal requests matched: {len(portal_requests)}")
|
||||
if portal_requests:
|
||||
summary = _summarize_portal_requests(portal_requests)
|
||||
summary_str = ", ".join(f"{k}={v}" for k, v in sorted(summary.items()))
|
||||
print(f" statuses: {summary_str}")
|
||||
if args.verbose and portal_requests:
|
||||
for req in portal_requests[: min(50, len(portal_requests))]:
|
||||
print(f" - {req.request_code} ({req.status})")
|
||||
if len(portal_requests) > 50:
|
||||
print(f" ... and {len(portal_requests) - 50} more")
|
||||
|
||||
if not args.apply:
|
||||
print("dry-run complete (no changes made)")
|
||||
return 0
|
||||
|
||||
if not args.skip_portal:
|
||||
deleted = _delete_portal_requests(prefixes)
|
||||
print(f"deleted portal requests: {deleted}")
|
||||
|
||||
if not args.skip_keycloak:
|
||||
_delete_keycloak_users(keycloak_users)
|
||||
print(f"deleted keycloak users: {len(keycloak_users)}")
|
||||
|
||||
print("done")
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main(sys.argv[1:]))
|
||||
|
||||
18
scripts/test_user_cleanup.sh
Executable file
18
scripts/test_user_cleanup.sh
Executable file
@ -0,0 +1,18 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# Manual-only helper to run `scripts/test_user_cleanup.py` inside the portal backend container.
|
||||
#
|
||||
# Usage (dry-run):
|
||||
# scripts/test_user_cleanup.sh --prefix test-
|
||||
#
|
||||
# Usage (apply):
|
||||
# scripts/test_user_cleanup.sh --prefix test- --apply --confirm test-
|
||||
|
||||
NS="${PORTAL_NAMESPACE:-bstein-dev-home}"
|
||||
TARGET="${PORTAL_BACKEND_EXEC_TARGET:-deploy/bstein-dev-home-backend}"
|
||||
|
||||
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
|
||||
|
||||
cat "${SCRIPT_DIR}/test_user_cleanup.py" | kubectl -n "${NS}" exec -i "${TARGET}" -- python - "$@"
|
||||
|
||||
318
scripts/test_vaultwarden_user_cleanup.py
Executable file
318
scripts/test_vaultwarden_user_cleanup.py
Executable file
@ -0,0 +1,318 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Clean up Vaultwarden test users and invites (manual-only).
|
||||
|
||||
This script deletes Vaultwarden rows directly from the Postgres database. It is
|
||||
intended only for removing test fallout (e.g. e2e-*, test-*) and is deliberately
|
||||
conservative:
|
||||
|
||||
- Requires one or more explicit email prefixes (repeatable).
|
||||
- Dry-run by default; --apply requires an exact --confirm guard.
|
||||
- Refuses to delete any user with dependent data in Vaultwarden tables.
|
||||
- Supports a protected email allowlist to prevent catastrophic mistakes.
|
||||
|
||||
Example (dry-run):
|
||||
scripts/test_vaultwarden_user_cleanup.py --prefix e2e-
|
||||
|
||||
Example (apply):
|
||||
scripts/test_vaultwarden_user_cleanup.py --prefix e2e- --apply --confirm e2e-
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from typing import Iterable, Sequence
|
||||
|
||||
|
||||
_SAFE_PREFIX_RE = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9._-]{0,63}$")
|
||||
_UUID_RE = re.compile(r"^[0-9a-fA-F-]{32,36}$")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class VaultwardenUser:
|
||||
uuid: str
|
||||
email: str
|
||||
dependent_rows: int
|
||||
|
||||
|
||||
def _run(cmd: Sequence[str], *, input_bytes: bytes | None = None) -> str:
|
||||
proc = subprocess.run(
|
||||
list(cmd),
|
||||
input=input_bytes,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
check=False,
|
||||
)
|
||||
if proc.returncode != 0:
|
||||
stderr = proc.stderr.decode("utf-8", errors="replace").strip()
|
||||
raise RuntimeError(f"command failed ({proc.returncode}): {' '.join(cmd)}\n{stderr}")
|
||||
return proc.stdout.decode("utf-8", errors="replace")
|
||||
|
||||
|
||||
def _kubectl_first_pod(namespace: str) -> str:
|
||||
raw = _run(["kubectl", "-n", namespace, "get", "pods", "-o", "json"])
|
||||
data = json.loads(raw)
|
||||
items = data.get("items") or []
|
||||
if not isinstance(items, list) or not items:
|
||||
raise RuntimeError(f"no pods found in namespace {namespace}")
|
||||
name = items[0].get("metadata", {}).get("name")
|
||||
if not isinstance(name, str) or not name:
|
||||
raise RuntimeError(f"unexpected pod list in namespace {namespace}")
|
||||
return name
|
||||
|
||||
|
||||
def _psql(sql: str) -> str:
|
||||
pod = _kubectl_first_pod("postgres")
|
||||
return _run(
|
||||
[
|
||||
"kubectl",
|
||||
"-n",
|
||||
"postgres",
|
||||
"exec",
|
||||
"-i",
|
||||
pod,
|
||||
"--",
|
||||
"psql",
|
||||
"-U",
|
||||
"postgres",
|
||||
"-d",
|
||||
"vaultwarden",
|
||||
"-At",
|
||||
"-F",
|
||||
"\t",
|
||||
"-c",
|
||||
sql,
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def _validate_prefixes(prefixes: Iterable[str]) -> list[str]:
|
||||
cleaned: list[str] = []
|
||||
for prefix in prefixes:
|
||||
prefix = prefix.strip()
|
||||
if not prefix:
|
||||
continue
|
||||
if not _SAFE_PREFIX_RE.match(prefix):
|
||||
raise SystemExit(
|
||||
f"invalid prefix '{prefix}': must match {_SAFE_PREFIX_RE.pattern} (alnum plus ._-)"
|
||||
)
|
||||
if not prefix.endswith("-"):
|
||||
raise SystemExit(f"refusing prefix '{prefix}': must end with '-' for safety")
|
||||
cleaned.append(prefix)
|
||||
if not cleaned:
|
||||
raise SystemExit("at least one --prefix is required")
|
||||
return sorted(set(cleaned))
|
||||
|
||||
|
||||
def _parse_rows(tsv: str) -> list[list[str]]:
|
||||
rows: list[list[str]] = []
|
||||
for line in tsv.splitlines():
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
rows.append(line.split("\t"))
|
||||
return rows
|
||||
|
||||
|
||||
def _sql_or_email_prefixes(prefixes: list[str]) -> str:
|
||||
# prefixes validated to safe charset; safe to interpolate.
|
||||
clauses = [f"email LIKE '{p}%'" for p in prefixes]
|
||||
return " OR ".join(clauses) if clauses else "FALSE"
|
||||
|
||||
|
||||
def _sql_quote(value: str) -> str:
|
||||
return "'" + value.replace("'", "''") + "'"
|
||||
|
||||
|
||||
def _sql_text_array(values: Iterable[str]) -> str:
|
||||
items = ",".join(_sql_quote(v) for v in values)
|
||||
return f"ARRAY[{items}]::text[]"
|
||||
|
||||
|
||||
def _list_users(prefixes: list[str], protected: set[str]) -> list[VaultwardenUser]:
|
||||
clause = _sql_or_email_prefixes(prefixes)
|
||||
sql = f"""
|
||||
WITH candidates AS (
|
||||
SELECT uuid, email
|
||||
FROM users
|
||||
WHERE enabled
|
||||
AND ({clause})
|
||||
AND email <> ALL({_sql_text_array(sorted(protected))})
|
||||
)
|
||||
SELECT
|
||||
candidates.uuid,
|
||||
candidates.email,
|
||||
(
|
||||
(SELECT COUNT(*) FROM auth_requests WHERE user_uuid = candidates.uuid) +
|
||||
(SELECT COUNT(*) FROM ciphers WHERE user_uuid = candidates.uuid) +
|
||||
(SELECT COUNT(*) FROM devices WHERE user_uuid = candidates.uuid) +
|
||||
(SELECT COUNT(*) FROM emergency_access WHERE grantor_uuid = candidates.uuid OR grantee_uuid = candidates.uuid) +
|
||||
(SELECT COUNT(*) FROM favorites WHERE user_uuid = candidates.uuid) +
|
||||
(SELECT COUNT(*) FROM folders WHERE user_uuid = candidates.uuid) +
|
||||
(SELECT COUNT(*) FROM sends WHERE user_uuid = candidates.uuid) +
|
||||
(SELECT COUNT(*) FROM twofactor WHERE user_uuid = candidates.uuid) +
|
||||
(SELECT COUNT(*) FROM twofactor_incomplete WHERE user_uuid = candidates.uuid) +
|
||||
(SELECT COUNT(*) FROM users_collections WHERE user_uuid = candidates.uuid) +
|
||||
(SELECT COUNT(*) FROM users_organizations WHERE user_uuid = candidates.uuid)
|
||||
) AS dependent_rows
|
||||
FROM candidates
|
||||
ORDER BY candidates.email;
|
||||
"""
|
||||
out = _psql(sql)
|
||||
users: list[VaultwardenUser] = []
|
||||
for row in _parse_rows(out):
|
||||
if len(row) < 3:
|
||||
continue
|
||||
uuid, email, dep_raw = row[0].strip(), row[1].strip(), row[2].strip()
|
||||
if not uuid or not email:
|
||||
continue
|
||||
if not _UUID_RE.match(uuid):
|
||||
continue
|
||||
try:
|
||||
dep = int(dep_raw)
|
||||
except ValueError:
|
||||
dep = 0
|
||||
users.append(VaultwardenUser(uuid=uuid, email=email, dependent_rows=dep))
|
||||
return users
|
||||
|
||||
|
||||
def _list_invitations(prefixes: list[str], protected: set[str]) -> list[str]:
|
||||
clause = _sql_or_email_prefixes(prefixes)
|
||||
protected_clause = ""
|
||||
if protected:
|
||||
protected_clause = f"AND email <> ALL({_sql_text_array(sorted(protected))})"
|
||||
sql = f"SELECT email FROM invitations WHERE ({clause}) {protected_clause} ORDER BY email;"
|
||||
out = _psql(sql)
|
||||
invites: list[str] = []
|
||||
for row in _parse_rows(out):
|
||||
if not row:
|
||||
continue
|
||||
email = row[0].strip()
|
||||
if email:
|
||||
invites.append(email)
|
||||
return invites
|
||||
|
||||
|
||||
def _delete_invitations(emails: list[str]) -> int:
|
||||
if not emails:
|
||||
return 0
|
||||
email_list = ",".join(_sql_quote(e) for e in emails)
|
||||
sql = f"DELETE FROM invitations WHERE email IN ({email_list});"
|
||||
out = _psql(sql)
|
||||
match = re.search(r"DELETE\s+(\d+)", out)
|
||||
return int(match.group(1)) if match else 0
|
||||
|
||||
|
||||
def _delete_users(uuids: list[str]) -> int:
|
||||
if not uuids:
|
||||
return 0
|
||||
uuid_list = ",".join(_sql_quote(u) for u in uuids)
|
||||
sql = f"DELETE FROM users WHERE uuid IN ({uuid_list});"
|
||||
out = _psql(sql)
|
||||
match = re.search(r"DELETE\s+(\d+)", out)
|
||||
return int(match.group(1)) if match else 0
|
||||
|
||||
|
||||
def _parse_args(argv: list[str]) -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(
|
||||
prog="test_vaultwarden_user_cleanup",
|
||||
description="Manual-only cleanup for Vaultwarden test users/invites (DB-level).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--prefix",
|
||||
action="append",
|
||||
required=True,
|
||||
help="Email prefix to target (repeatable). Example: --prefix e2e-",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--apply",
|
||||
action="store_true",
|
||||
help="Apply deletions (default is dry-run). Requires --confirm.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--confirm",
|
||||
default="",
|
||||
help="Required when using --apply. Must exactly equal the comma-separated prefix list.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--protect-email",
|
||||
action="append",
|
||||
default=[],
|
||||
help="Vaultwarden emails that must never be deleted (repeatable).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
help="List matched emails (and invitation emails).",
|
||||
)
|
||||
return parser.parse_args(argv)
|
||||
|
||||
|
||||
def main(argv: list[str]) -> int:
|
||||
args = _parse_args(argv)
|
||||
prefixes = _validate_prefixes(args.prefix)
|
||||
expected_confirm = ",".join(prefixes)
|
||||
|
||||
protected = {e.strip() for e in args.protect_email if e.strip()}
|
||||
protected |= {
|
||||
"brad@bstein.dev",
|
||||
"edstein87@outlook.com",
|
||||
"indifox8@gmail.com",
|
||||
"mgs.stein@gmail.com",
|
||||
"patriot87@gmail.com",
|
||||
}
|
||||
|
||||
if args.apply and args.confirm != expected_confirm:
|
||||
print(
|
||||
f"error: refusing to apply without --confirm '{expected_confirm}' (got '{args.confirm}')",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return 2
|
||||
|
||||
users = _list_users(prefixes, protected=protected)
|
||||
invites = _list_invitations(prefixes, protected=protected)
|
||||
|
||||
print(f"prefixes: {expected_confirm}")
|
||||
print(f"mode: {'APPLY' if args.apply else 'DRY-RUN'}")
|
||||
if protected:
|
||||
print(f"protected emails: {', '.join(sorted(protected))}")
|
||||
print(f"vaultwarden users matched: {len(users)}")
|
||||
print(f"vaultwarden invitations matched: {len(invites)}")
|
||||
|
||||
if args.verbose:
|
||||
for user in users[: min(100, len(users))]:
|
||||
print(f" user: {user.email} (deps={user.dependent_rows})")
|
||||
if len(users) > 100:
|
||||
print(f" ... and {len(users) - 100} more users")
|
||||
for email in invites[: min(100, len(invites))]:
|
||||
print(f" invite: {email}")
|
||||
if len(invites) > 100:
|
||||
print(f" ... and {len(invites) - 100} more invitations")
|
||||
|
||||
unsafe = [u for u in users if u.dependent_rows > 0]
|
||||
if unsafe:
|
||||
print("refusing to delete users with dependent data:", file=sys.stderr)
|
||||
for user in unsafe[: min(50, len(unsafe))]:
|
||||
print(f" - {user.email} deps={user.dependent_rows}", file=sys.stderr)
|
||||
if len(unsafe) > 50:
|
||||
print(f" ... and {len(unsafe) - 50} more", file=sys.stderr)
|
||||
return 2
|
||||
|
||||
if not args.apply:
|
||||
print("dry-run complete (no changes made)")
|
||||
return 0
|
||||
|
||||
deleted_invites = _delete_invitations(invites)
|
||||
deleted_users = _delete_users([u.uuid for u in users])
|
||||
print(f"deleted vaultwarden invitations: {deleted_invites}")
|
||||
print(f"deleted vaultwarden users: {deleted_users}")
|
||||
print("done")
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main(sys.argv[1:]))
|
||||
15
scripts/test_vaultwarden_user_cleanup.sh
Executable file
15
scripts/test_vaultwarden_user_cleanup.sh
Executable file
@ -0,0 +1,15 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# Manual-only helper to clean Vaultwarden test users and invites from Postgres.
|
||||
#
|
||||
# Usage (dry-run):
|
||||
# scripts/test_vaultwarden_user_cleanup.sh --prefix e2e-
|
||||
#
|
||||
# Usage (apply):
|
||||
# scripts/test_vaultwarden_user_cleanup.sh --prefix e2e- --apply --confirm e2e-
|
||||
|
||||
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
|
||||
|
||||
python3 "${SCRIPT_DIR}/test_vaultwarden_user_cleanup.py" "$@"
|
||||
|
||||
@ -20,7 +20,13 @@ def load_sync_module(monkeypatch):
|
||||
}
|
||||
for k, v in env.items():
|
||||
monkeypatch.setenv(k, v)
|
||||
module_path = pathlib.Path(__file__).resolve().parents[1] / "mailu_sync.py"
|
||||
module_path = (
|
||||
pathlib.Path(__file__).resolve().parents[2]
|
||||
/ "services"
|
||||
/ "mailu"
|
||||
/ "scripts"
|
||||
/ "mailu_sync.py"
|
||||
)
|
||||
spec = importlib.util.spec_from_file_location("mailu_sync_testmod", module_path)
|
||||
module = importlib.util.module_from_spec(spec)
|
||||
assert spec.loader is not None
|
||||
@ -102,7 +108,8 @@ def test_kc_get_users_paginates(monkeypatch):
|
||||
sync.SESSION = _PagedSession()
|
||||
users = sync.kc_get_users("tok")
|
||||
assert [u["id"] for u in users] == ["u1", "u2"]
|
||||
assert sync.SESSION.calls == 2
|
||||
# Pagination stops when results < page size.
|
||||
assert sync.SESSION.calls == 1
|
||||
|
||||
|
||||
def test_ensure_mailu_user_skips_foreign_domain(monkeypatch):
|
||||
@ -119,6 +126,7 @@ def test_ensure_mailu_user_skips_foreign_domain(monkeypatch):
|
||||
|
||||
def test_ensure_mailu_user_upserts(monkeypatch):
|
||||
sync = load_sync_module(monkeypatch)
|
||||
monkeypatch.setattr(sync.bcrypt_sha256, "hash", lambda password: f"hash:{password}")
|
||||
captured = {}
|
||||
|
||||
class _Cursor:
|
||||
@ -134,6 +142,7 @@ def test_ensure_mailu_user_upserts(monkeypatch):
|
||||
|
||||
def test_main_generates_password_and_upserts(monkeypatch):
|
||||
sync = load_sync_module(monkeypatch)
|
||||
monkeypatch.setattr(sync.bcrypt_sha256, "hash", lambda password: f"hash:{password}")
|
||||
users = [
|
||||
{"id": "u1", "username": "user1", "email": "user1@example.com", "attributes": {}},
|
||||
{"id": "u2", "username": "user2", "email": "user2@example.com", "attributes": {"mailu_app_password": ["keepme"]}},
|
||||
@ -176,6 +185,6 @@ def test_main_generates_password_and_upserts(monkeypatch):
|
||||
|
||||
sync.main()
|
||||
|
||||
# Should attempt two inserts (third user skipped due to domain mismatch)
|
||||
assert len(updated) == 1 # only one missing attr was backfilled
|
||||
assert conns and len(conns[0]._cursor.executions) == 2
|
||||
# Always backfill mailu_email, even if Keycloak recovery email is external.
|
||||
assert len(updated) == 3
|
||||
assert conns and len(conns[0]._cursor.executions) == 3
|
||||
|
||||
105
services/ai-llm/deployment.yaml
Normal file
105
services/ai-llm/deployment.yaml
Normal file
@ -0,0 +1,105 @@
|
||||
# services/ai-llm/deployment.yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: ollama
|
||||
namespace: ai
|
||||
spec:
|
||||
replicas: 1
|
||||
revisionHistoryLimit: 2
|
||||
strategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxSurge: 0
|
||||
maxUnavailable: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: ollama
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: ollama
|
||||
annotations:
|
||||
ai.bstein.dev/model: qwen2.5-coder:7b-instruct-q4_0
|
||||
ai.bstein.dev/gpu: GPU pool (titan-20/21/22/24)
|
||||
spec:
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: kubernetes.io/hostname
|
||||
operator: In
|
||||
values:
|
||||
- titan-20
|
||||
- titan-21
|
||||
- titan-22
|
||||
- titan-24
|
||||
runtimeClassName: nvidia
|
||||
volumes:
|
||||
- name: models
|
||||
persistentVolumeClaim:
|
||||
claimName: ollama-models
|
||||
initContainers:
|
||||
- name: warm-model
|
||||
image: ollama/ollama:latest
|
||||
env:
|
||||
- name: OLLAMA_HOST
|
||||
value: 0.0.0.0
|
||||
- name: NVIDIA_VISIBLE_DEVICES
|
||||
value: all
|
||||
- name: NVIDIA_DRIVER_CAPABILITIES
|
||||
value: compute,utility
|
||||
- name: OLLAMA_MODELS
|
||||
value: /root/.ollama
|
||||
- name: OLLAMA_MODEL
|
||||
value: qwen2.5-coder:7b-instruct-q4_0
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
- |
|
||||
set -e
|
||||
ollama serve >/tmp/ollama.log 2>&1 &
|
||||
sleep 6
|
||||
ollama pull "${OLLAMA_MODEL}"
|
||||
pkill ollama || true
|
||||
volumeMounts:
|
||||
- name: models
|
||||
mountPath: /root/.ollama
|
||||
resources:
|
||||
requests:
|
||||
cpu: 250m
|
||||
memory: 1Gi
|
||||
nvidia.com/gpu.shared: 1
|
||||
limits:
|
||||
nvidia.com/gpu.shared: 1
|
||||
containers:
|
||||
- name: ollama
|
||||
image: ollama/ollama:latest
|
||||
imagePullPolicy: IfNotPresent
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 11434
|
||||
env:
|
||||
- name: OLLAMA_HOST
|
||||
value: 0.0.0.0
|
||||
- name: OLLAMA_KEEP_ALIVE
|
||||
value: 6h
|
||||
- name: OLLAMA_MODELS
|
||||
value: /root/.ollama
|
||||
- name: NVIDIA_VISIBLE_DEVICES
|
||||
value: all
|
||||
- name: NVIDIA_DRIVER_CAPABILITIES
|
||||
value: compute,utility
|
||||
volumeMounts:
|
||||
- name: models
|
||||
mountPath: /root/.ollama
|
||||
resources:
|
||||
requests:
|
||||
cpu: "2"
|
||||
memory: 8Gi
|
||||
nvidia.com/gpu.shared: 1
|
||||
limits:
|
||||
cpu: "4"
|
||||
memory: 12Gi
|
||||
nvidia.com/gpu.shared: 1
|
||||
9
services/ai-llm/kustomization.yaml
Normal file
9
services/ai-llm/kustomization.yaml
Normal file
@ -0,0 +1,9 @@
|
||||
# services/ai-llm/kustomization.yaml
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
namespace: ai
|
||||
resources:
|
||||
- namespace.yaml
|
||||
- pvc.yaml
|
||||
- deployment.yaml
|
||||
- service.yaml
|
||||
5
services/ai-llm/namespace.yaml
Normal file
5
services/ai-llm/namespace.yaml
Normal file
@ -0,0 +1,5 @@
|
||||
# services/ai-llm/namespace.yaml
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: ai
|
||||
13
services/ai-llm/pvc.yaml
Normal file
13
services/ai-llm/pvc.yaml
Normal file
@ -0,0 +1,13 @@
|
||||
# services/ai-llm/pvc.yaml
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: ollama-models
|
||||
namespace: ai
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 30Gi
|
||||
storageClassName: astreae
|
||||
14
services/ai-llm/service.yaml
Normal file
14
services/ai-llm/service.yaml
Normal file
@ -0,0 +1,14 @@
|
||||
# services/ai-llm/service.yaml
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: ollama
|
||||
namespace: ai
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
app: ollama
|
||||
ports:
|
||||
- name: http
|
||||
port: 11434
|
||||
targetPort: 11434
|
||||
@ -5,7 +5,7 @@ metadata:
|
||||
name: bstein-dev-home-backend
|
||||
namespace: bstein-dev-home
|
||||
spec:
|
||||
replicas: 2
|
||||
replicas: 1
|
||||
revisionHistoryLimit: 3
|
||||
selector:
|
||||
matchLabels:
|
||||
@ -15,6 +15,8 @@ spec:
|
||||
labels:
|
||||
app: bstein-dev-home-backend
|
||||
spec:
|
||||
automountServiceAccountToken: true
|
||||
serviceAccountName: bstein-dev-home
|
||||
nodeSelector:
|
||||
kubernetes.io/arch: arm64
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
@ -22,8 +24,73 @@ spec:
|
||||
- name: harbor-bstein-robot
|
||||
containers:
|
||||
- name: backend
|
||||
image: registry.bstein.dev/bstein/bstein-dev-home-backend:latest
|
||||
image: registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"}
|
||||
imagePullPolicy: Always
|
||||
command: ["gunicorn"]
|
||||
args:
|
||||
- -b
|
||||
- 0.0.0.0:8080
|
||||
- --workers
|
||||
- "2"
|
||||
- --timeout
|
||||
- "180"
|
||||
- app:app
|
||||
env:
|
||||
- name: AI_CHAT_API
|
||||
value: http://ollama.ai.svc.cluster.local:11434
|
||||
- name: AI_CHAT_MODEL
|
||||
value: qwen2.5-coder:7b-instruct-q4_0
|
||||
- name: AI_CHAT_TIMEOUT_SEC
|
||||
value: "60"
|
||||
- name: AI_NODE_NAME
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: spec.nodeName
|
||||
- name: AI_NODE_GPU_MAP
|
||||
value: |
|
||||
{"titan-20": "Jetson Xavier (edge GPU)", "titan-21": "Jetson Xavier (edge GPU)", "titan-22": "RTX 3050 8GB (local GPU)", "titan-24": "RTX 3080 8GB (local GPU)"}
|
||||
- name: KEYCLOAK_ENABLED
|
||||
value: "true"
|
||||
- name: KEYCLOAK_URL
|
||||
value: https://sso.bstein.dev
|
||||
- name: KEYCLOAK_REALM
|
||||
value: atlas
|
||||
- name: KEYCLOAK_CLIENT_ID
|
||||
value: bstein-dev-home
|
||||
- name: KEYCLOAK_ISSUER
|
||||
value: https://sso.bstein.dev/realms/atlas
|
||||
- name: KEYCLOAK_JWKS_URL
|
||||
value: http://keycloak.sso.svc.cluster.local/realms/atlas/protocol/openid-connect/certs
|
||||
- name: KEYCLOAK_ADMIN_URL
|
||||
value: http://keycloak.sso.svc.cluster.local
|
||||
- name: KEYCLOAK_ADMIN_REALM
|
||||
value: atlas
|
||||
- name: KEYCLOAK_ADMIN_CLIENT_ID
|
||||
value: bstein-dev-home-admin
|
||||
- name: KEYCLOAK_ADMIN_CLIENT_SECRET
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: bstein-dev-home-keycloak-admin
|
||||
key: client_secret
|
||||
- name: ACCOUNT_ALLOWED_GROUPS
|
||||
value: ""
|
||||
- name: PORTAL_DATABASE_URL
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: atlas-portal-db
|
||||
key: PORTAL_DATABASE_URL
|
||||
- name: HTTP_CHECK_TIMEOUT_SEC
|
||||
value: "2"
|
||||
- name: ACCESS_REQUEST_SUBMIT_RATE_LIMIT
|
||||
value: "30"
|
||||
- name: ACCESS_REQUEST_SUBMIT_RATE_WINDOW_SEC
|
||||
value: "3600"
|
||||
- name: ACCESS_REQUEST_STATUS_RATE_LIMIT
|
||||
value: "120"
|
||||
- name: ACCESS_REQUEST_STATUS_RATE_WINDOW_SEC
|
||||
value: "60"
|
||||
- name: ACCESS_REQUEST_INTERNAL_EMAIL_ALLOWLIST
|
||||
value: robotuser@bstein.dev
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 8080
|
||||
@ -33,16 +100,18 @@ spec:
|
||||
port: http
|
||||
initialDelaySeconds: 2
|
||||
periodSeconds: 5
|
||||
timeoutSeconds: 3
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /api/healthz
|
||||
port: http
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 3
|
||||
resources:
|
||||
requests:
|
||||
cpu: 50m
|
||||
memory: 64Mi
|
||||
cpu: 100m
|
||||
memory: 128Mi
|
||||
limits:
|
||||
cpu: 300m
|
||||
memory: 256Mi
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
|
||||
69
services/bstein-dev-home/chat-ai-gateway-deployment.yaml
Normal file
69
services/bstein-dev-home/chat-ai-gateway-deployment.yaml
Normal file
@ -0,0 +1,69 @@
|
||||
# services/bstein-dev-home/chat-ai-gateway-deployment.yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: chat-ai-gateway
|
||||
namespace: bstein-dev-home
|
||||
spec:
|
||||
replicas: 1
|
||||
revisionHistoryLimit: 2
|
||||
selector:
|
||||
matchLabels:
|
||||
app: chat-ai-gateway
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: chat-ai-gateway
|
||||
spec:
|
||||
nodeSelector:
|
||||
kubernetes.io/arch: arm64
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
containers:
|
||||
- name: gateway
|
||||
image: python:3.11-slim
|
||||
command: ["/bin/sh","-c"]
|
||||
args:
|
||||
- python /app/gateway.py
|
||||
env:
|
||||
- name: UPSTREAM_URL
|
||||
value: http://bstein-dev-home-backend/api/chat
|
||||
- name: CHAT_KEY_MATRIX
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: chat-ai-keys-runtime
|
||||
key: matrix
|
||||
- name: CHAT_KEY_HOMEPAGE
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: chat-ai-keys-runtime
|
||||
key: homepage
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 8080
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: http
|
||||
initialDelaySeconds: 2
|
||||
periodSeconds: 5
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: http
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
resources:
|
||||
requests:
|
||||
cpu: 20m
|
||||
memory: 64Mi
|
||||
limits:
|
||||
cpu: 200m
|
||||
memory: 256Mi
|
||||
volumeMounts:
|
||||
- name: code
|
||||
mountPath: /app/gateway.py
|
||||
subPath: gateway.py
|
||||
volumes:
|
||||
- name: code
|
||||
configMap:
|
||||
name: chat-ai-gateway
|
||||
13
services/bstein-dev-home/chat-ai-gateway-service.yaml
Normal file
13
services/bstein-dev-home/chat-ai-gateway-service.yaml
Normal file
@ -0,0 +1,13 @@
|
||||
# services/bstein-dev-home/chat-ai-gateway-service.yaml
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: chat-ai-gateway
|
||||
namespace: bstein-dev-home
|
||||
spec:
|
||||
selector:
|
||||
app: chat-ai-gateway
|
||||
ports:
|
||||
- name: http
|
||||
port: 80
|
||||
targetPort: 8080
|
||||
@ -5,7 +5,7 @@ metadata:
|
||||
name: bstein-dev-home-frontend
|
||||
namespace: bstein-dev-home
|
||||
spec:
|
||||
replicas: 2
|
||||
replicas: 1
|
||||
revisionHistoryLimit: 3
|
||||
selector:
|
||||
matchLabels:
|
||||
@ -22,7 +22,7 @@ spec:
|
||||
- name: harbor-bstein-robot
|
||||
containers:
|
||||
- name: frontend
|
||||
image: registry.bstein.dev/bstein/bstein-dev-home-frontend:latest
|
||||
image: registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"}
|
||||
imagePullPolicy: Always
|
||||
ports:
|
||||
- name: http
|
||||
|
||||
@ -11,7 +11,7 @@ metadata:
|
||||
cert-manager.io/cluster-issuer: letsencrypt
|
||||
spec:
|
||||
tls:
|
||||
- hosts: [ "bstein.dev" ]
|
||||
- hosts: [ "bstein.dev", "chat.ai.bstein.dev" ]
|
||||
secretName: bstein-dev-home-tls
|
||||
rules:
|
||||
- host: bstein.dev
|
||||
@ -29,3 +29,12 @@ spec:
|
||||
service:
|
||||
name: bstein-dev-home-frontend
|
||||
port: { number: 80 }
|
||||
- host: chat.ai.bstein.dev
|
||||
http:
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: chat-ai-gateway
|
||||
port: { number: 80 }
|
||||
|
||||
@ -5,13 +5,38 @@ namespace: bstein-dev-home
|
||||
resources:
|
||||
- namespace.yaml
|
||||
- image.yaml
|
||||
- rbac.yaml
|
||||
- portal-e2e-client-secret-sync-rbac.yaml
|
||||
- chat-ai-gateway-deployment.yaml
|
||||
- chat-ai-gateway-service.yaml
|
||||
- frontend-deployment.yaml
|
||||
- frontend-service.yaml
|
||||
- backend-deployment.yaml
|
||||
- backend-service.yaml
|
||||
- vaultwarden-cred-sync-cronjob.yaml
|
||||
- portal-onboarding-e2e-test-job.yaml
|
||||
- ingress.yaml
|
||||
images:
|
||||
- name: registry.bstein.dev/bstein/bstein-dev-home-frontend
|
||||
newTag: 0.1.1-0 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"}
|
||||
newTag: registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"}
|
||||
- name: registry.bstein.dev/bstein/bstein-dev-home-backend
|
||||
newTag: 0.1.1-0 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"}
|
||||
newTag: registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"}
|
||||
configMapGenerator:
|
||||
- name: chat-ai-gateway
|
||||
namespace: bstein-dev-home
|
||||
files:
|
||||
- gateway.py=scripts/gateway.py
|
||||
options:
|
||||
disableNameSuffixHash: true
|
||||
- name: vaultwarden-cred-sync-script
|
||||
namespace: bstein-dev-home
|
||||
files:
|
||||
- vaultwarden_cred_sync.py=scripts/vaultwarden_cred_sync.py
|
||||
options:
|
||||
disableNameSuffixHash: true
|
||||
- name: portal-onboarding-e2e-tests
|
||||
namespace: bstein-dev-home
|
||||
files:
|
||||
- test_portal_onboarding_flow.py=scripts/test_portal_onboarding_flow.py
|
||||
options:
|
||||
disableNameSuffixHash: true
|
||||
|
||||
@ -0,0 +1,24 @@
|
||||
# services/bstein-dev-home/portal-e2e-client-secret-sync-rbac.yaml
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
name: portal-e2e-client-secret-sync-target
|
||||
namespace: bstein-dev-home
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["secrets"]
|
||||
verbs: ["get", "create", "patch", "update"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: portal-e2e-client-secret-sync-target
|
||||
namespace: bstein-dev-home
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: portal-e2e-client-secret-sync
|
||||
namespace: sso
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: Role
|
||||
name: portal-e2e-client-secret-sync-target
|
||||
66
services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
Normal file
66
services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
Normal file
@ -0,0 +1,66 @@
|
||||
# services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: portal-onboarding-e2e-test-11
|
||||
namespace: bstein-dev-home
|
||||
spec:
|
||||
backoffLimit: 0
|
||||
template:
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
containers:
|
||||
- name: test
|
||||
image: python:3.11-slim
|
||||
env:
|
||||
- name: PORTAL_BASE_URL
|
||||
value: http://bstein-dev-home-backend.bstein-dev-home.svc.cluster.local
|
||||
- name: KEYCLOAK_ADMIN_URL
|
||||
value: https://sso.bstein.dev
|
||||
- name: KEYCLOAK_REALM
|
||||
value: atlas
|
||||
- name: KEYCLOAK_ADMIN_CLIENT_ID
|
||||
value: bstein-dev-home-admin
|
||||
- name: KEYCLOAK_ADMIN_CLIENT_SECRET
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: bstein-dev-home-keycloak-admin
|
||||
key: client_secret
|
||||
- name: PORTAL_E2E_CLIENT_ID
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: portal-e2e-client
|
||||
key: client_id
|
||||
- name: PORTAL_E2E_CLIENT_SECRET
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: portal-e2e-client
|
||||
key: client_secret
|
||||
- name: PORTAL_TARGET_CLIENT_ID
|
||||
value: bstein-dev-home
|
||||
- name: E2E_PORTAL_ADMIN_USERNAME
|
||||
value: bstein
|
||||
- name: E2E_USERNAME_PREFIX
|
||||
value: e2e-portal
|
||||
- name: E2E_CONTACT_EMAIL
|
||||
value: robotuser@bstein.dev
|
||||
- name: E2E_IMAP_KEYCLOAK_USERNAME
|
||||
value: robotuser
|
||||
- name: E2E_DEADLINE_SECONDS
|
||||
value: "600"
|
||||
- name: E2E_POLL_SECONDS
|
||||
value: "10"
|
||||
command: ["/bin/sh", "-c"]
|
||||
args:
|
||||
- |
|
||||
set -euo pipefail
|
||||
python /scripts/test_portal_onboarding_flow.py
|
||||
volumeMounts:
|
||||
- name: tests
|
||||
mountPath: /scripts
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: tests
|
||||
configMap:
|
||||
name: portal-onboarding-e2e-tests
|
||||
defaultMode: 0555
|
||||
108
services/bstein-dev-home/rbac.yaml
Normal file
108
services/bstein-dev-home/rbac.yaml
Normal file
@ -0,0 +1,108 @@
|
||||
# services/bstein-dev-home/rbac.yaml
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: bstein-dev-home
|
||||
namespace: bstein-dev-home
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: bstein-dev-home-ai-reader
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["pods"]
|
||||
verbs: ["get", "list", "watch"]
|
||||
    resourceNames: []  # NOTE(review): an empty resourceNames list imposes no restriction (all pods); populate or remove if scoping was intended
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: bstein-dev-home-ai-reader
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: bstein-dev-home-ai-reader
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: bstein-dev-home
|
||||
namespace: bstein-dev-home
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: bstein-dev-home-vaultwarden-admin-secret-reader
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["secrets"]
|
||||
verbs: ["get"]
|
||||
resourceNames: ["vaultwarden-admin"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: bstein-dev-home-vaultwarden-admin-secret-reader
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: bstein-dev-home-vaultwarden-admin-secret-reader
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: bstein-dev-home
|
||||
namespace: bstein-dev-home
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
name: bstein-dev-home-vaultwarden-admin-token-reader
|
||||
namespace: vaultwarden
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["secrets"]
|
||||
verbs: ["get"]
|
||||
resourceNames: ["vaultwarden-admin"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: bstein-dev-home-vaultwarden-admin-token-reader
|
||||
namespace: vaultwarden
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: Role
|
||||
name: bstein-dev-home-vaultwarden-admin-token-reader
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: bstein-dev-home
|
||||
namespace: bstein-dev-home
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
name: bstein-dev-home-nextcloud-mail-sync
|
||||
namespace: nextcloud
|
||||
rules:
|
||||
- apiGroups: ["batch"]
|
||||
resources: ["cronjobs"]
|
||||
verbs: ["get"]
|
||||
resourceNames: ["nextcloud-mail-sync"]
|
||||
- apiGroups: ["batch"]
|
||||
resources: ["jobs"]
|
||||
verbs: ["create", "get", "list", "watch"]
|
||||
- apiGroups: [""]
|
||||
resources: ["pods"]
|
||||
verbs: ["get", "list"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: bstein-dev-home-nextcloud-mail-sync
|
||||
namespace: nextcloud
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: Role
|
||||
name: bstein-dev-home-nextcloud-mail-sync
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: bstein-dev-home
|
||||
namespace: bstein-dev-home
|
||||
70
services/bstein-dev-home/scripts/gateway.py
Normal file
70
services/bstein-dev-home/scripts/gateway.py
Normal file
@ -0,0 +1,70 @@
|
||||
import json
|
||||
import os
|
||||
from http.server import BaseHTTPRequestHandler, HTTPServer
|
||||
from urllib import request, error
|
||||
|
||||
# Backend endpoint that authenticated chat POSTs are proxied to.
UPSTREAM = os.environ.get("UPSTREAM_URL", "http://bstein-dev-home-backend/api/chat")
# Per-caller API keys; blank values are dropped from the allow-list below.
KEY_MATRIX = os.environ.get("CHAT_KEY_MATRIX", "")
KEY_HOMEPAGE = os.environ.get("CHAT_KEY_HOMEPAGE", "")

# Set of accepted x-api-key values; when empty every POST is rejected (fail closed).
ALLOWED = {k for k in (KEY_MATRIX, KEY_HOMEPAGE) if k}
|
||||
|
||||
class Handler(BaseHTTPRequestHandler):
    """Tiny authenticating reverse proxy in front of the chat backend.

    GET /healthz (or /) answers a liveness probe; POST / requires a known
    ``x-api-key`` header and forwards the JSON body to ``UPSTREAM``.
    """

    def _send_json(self, code: int, payload: dict):
        """Serialize ``payload`` as JSON and send it with HTTP status ``code``."""
        body = json.dumps(payload).encode()
        self.send_response(code)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def do_GET(self):  # noqa: N802
        """Health endpoint only; everything else is 404."""
        if self.path in ("/healthz", "/"):
            return self._send_json(200, {"ok": True})
        return self._send_json(404, {"error": "not_found"})

    def do_POST(self):  # noqa: N802
        """Authenticate via x-api-key, then proxy the request body to UPSTREAM."""
        if self.path != "/":
            return self._send_json(404, {"error": "not_found"})

        # Fail closed: with no configured keys, ALLOWED is empty and every
        # request is rejected.
        key = self.headers.get("x-api-key", "")
        if not key or key not in ALLOWED:
            return self._send_json(401, {"error": "unauthorized"})

        length = int(self.headers.get("content-length", "0") or "0")
        raw = self.rfile.read(length) if length else b"{}"

        try:
            upstream_req = request.Request(
                UPSTREAM,
                data=raw,
                headers={"Content-Type": "application/json"},
                method="POST",
            )
            with request.urlopen(upstream_req, timeout=90) as resp:
                data = resp.read()
                self.send_response(resp.status)
                for k, v in resp.headers.items():
                    # urlopen already decoded the body and we re-frame it with
                    # an explicit Content-Length below, so framing/hop-by-hop
                    # headers must not be copied through. BUG FIX: the original
                    # forwarded Transfer-Encoding, which corrupts responses the
                    # upstream sent chunked.
                    if k.lower() in (
                        "content-length",
                        "transfer-encoding",
                        "connection",
                        "server",
                        "date",
                    ):
                        continue
                    self.send_header(k, v)
                self.send_header("Content-Length", str(len(data)))
                self.end_headers()
                self.wfile.write(data)
        except error.HTTPError as e:
            # Pass upstream HTTP errors through with their original status.
            data = e.read() if hasattr(e, "read") else b""
            self.send_response(e.code)
            self.send_header("Content-Type", "application/json")
            self.send_header("Content-Length", str(len(data)))
            self.end_headers()
            self.wfile.write(data)
        except Exception:
            # Network failures / timeouts / unreachable upstream: generic 502.
            return self._send_json(502, {"error": "bad_gateway"})
|
||||
|
||||
def main():
    """Serve the gateway on PORT (default 8080) until interrupted."""
    listen_port = int(os.environ.get("PORT", "8080"))
    server = HTTPServer(("0.0.0.0", listen_port), Handler)
    server.serve_forever()
|
||||
|
||||
# Run the gateway when executed as a script.
if __name__ == "__main__":
    main()
|
||||
428
services/bstein-dev-home/scripts/test_portal_onboarding_flow.py
Normal file
428
services/bstein-dev-home/scripts/test_portal_onboarding_flow.py
Normal file
@ -0,0 +1,428 @@
|
||||
#!/usr/bin/env python3
|
||||
import email
|
||||
import http.client
|
||||
import imaplib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import ssl
|
||||
import sys
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
|
||||
|
||||
def _env(name: str, default: str | None = None) -> str:
|
||||
value = os.environ.get(name, default)
|
||||
if value is None or value == "":
|
||||
raise SystemExit(f"missing required env var: {name}")
|
||||
return value
|
||||
|
||||
|
||||
def _post_json(url: str, payload: dict, timeout_s: int = 30) -> dict:
    """POST ``payload`` as JSON to ``url`` and return the parsed JSON reply.

    An empty response body yields {}. Any HTTP error aborts the process with
    the status code and response text.
    """
    encoded = json.dumps(payload).encode()
    req = urllib.request.Request(
        url,
        data=encoded,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout_s) as resp:
            text = resp.read().decode()
    except urllib.error.HTTPError as exc:
        raw = exc.read().decode(errors="replace")
        raise SystemExit(f"HTTP {exc.code} from {url}: {raw}")
    return json.loads(text) if text else {}
|
||||
|
||||
|
||||
def _post_form(url: str, data: dict[str, str], timeout_s: int = 30) -> dict:
    """POST ``data`` url-encoded to ``url`` and return the parsed JSON reply.

    An empty response body yields {}. Any HTTP error aborts the process with
    the status code and response text.
    """
    encoded = urllib.parse.urlencode(data).encode()
    req = urllib.request.Request(
        url,
        data=encoded,
        headers={"Content-Type": "application/x-www-form-urlencoded"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout_s) as resp:
            text = resp.read().decode()
    except urllib.error.HTTPError as exc:
        raw = exc.read().decode(errors="replace")
        raise SystemExit(f"HTTP {exc.code} from {url}: {raw}")
    return json.loads(text) if text else {}
|
||||
|
||||
|
||||
def _get_json(url: str, headers: dict[str, str] | None = None, timeout_s: int = 30) -> object:
    """GET ``url`` and return the parsed JSON body (None when empty).

    Any HTTP error aborts the process with the status code and response text.
    """
    req = urllib.request.Request(url, headers=headers or {}, method="GET")
    try:
        with urllib.request.urlopen(req, timeout=timeout_s) as resp:
            text = resp.read().decode()
    except urllib.error.HTTPError as exc:
        raw = exc.read().decode(errors="replace")
        raise SystemExit(f"HTTP {exc.code} from {url}: {raw}")
    return json.loads(text) if text else None
|
||||
|
||||
|
||||
def _request_json(
    method: str,
    url: str,
    token: str,
    payload: dict | None = None,
    timeout_s: int = 30,
) -> dict:
    """Send a bearer-authenticated request, optionally with a JSON body.

    Returns the parsed JSON reply ({} when empty); aborts the process on any
    HTTP error with the status code and response text.
    """
    headers = {"Authorization": f"Bearer {token}"}
    body = None
    if payload is not None:
        body = json.dumps(payload).encode()
        headers["Content-Type"] = "application/json"
    req = urllib.request.Request(url, data=body, headers=headers, method=method)
    try:
        with urllib.request.urlopen(req, timeout=timeout_s) as resp:
            text = resp.read().decode()
    except urllib.error.HTTPError as exc:
        raw = exc.read().decode(errors="replace")
        raise SystemExit(f"HTTP {exc.code} from {url}: {raw}")
    return json.loads(text) if text else {}
|
||||
|
||||
|
||||
def _keycloak_client_token(keycloak_base: str, realm: str, client_id: str, client_secret: str) -> str:
    """Obtain a client-credentials access token from Keycloak for ``client_id``."""
    token_url = f"{keycloak_base.rstrip('/')}/realms/{realm}/protocol/openid-connect/token"
    form = {
        "grant_type": "client_credentials",
        "client_id": client_id,
        "client_secret": client_secret,
    }
    response = _post_form(token_url, form, timeout_s=20)
    access_token = response.get("access_token")
    if not (isinstance(access_token, str) and access_token):
        raise SystemExit("keycloak token response missing access_token")
    return access_token
|
||||
|
||||
|
||||
def _keycloak_token_exchange(
    *,
    keycloak_base: str,
    realm: str,
    client_id: str,
    client_secret: str,
    subject_token: str,
    requested_subject: str,
    audience: str,
) -> str:
    """Exchange ``subject_token`` for a token impersonating ``requested_subject``.

    Uses the RFC 8693 token-exchange grant against the realm token endpoint and
    returns the resulting access token.
    """
    token_url = f"{keycloak_base.rstrip('/')}/realms/{realm}/protocol/openid-connect/token"
    form = {
        "grant_type": "urn:ietf:params:oauth:grant-type:token-exchange",
        "client_id": client_id,
        "client_secret": client_secret,
        "subject_token": subject_token,
        "requested_subject": requested_subject,
        "audience": audience,
    }
    response = _post_form(token_url, form, timeout_s=20)
    access_token = response.get("access_token")
    if not (isinstance(access_token, str) and access_token):
        raise SystemExit("keycloak token exchange response missing access_token")
    return access_token
|
||||
|
||||
|
||||
def _keycloak_find_user(keycloak_base: str, realm: str, token: str, username: str) -> dict | None:
    """Exact-match lookup of a realm user; None when absent or malformed."""
    query = urllib.parse.urlencode({"username": username, "exact": "true", "max": "1"})
    url = f"{keycloak_base.rstrip('/')}/admin/realms/{realm}/users?{query}"
    users = _get_json(url, headers={"Authorization": f"Bearer {token}"}, timeout_s=20)
    if isinstance(users, list) and users:
        first = users[0]
        if isinstance(first, dict):
            return first
    return None
|
||||
|
||||
|
||||
def _keycloak_get_user(keycloak_base: str, realm: str, token: str, user_id: str) -> dict:
    """Fetch the full admin-API representation of one user; exit on bad payload."""
    encoded_id = urllib.parse.quote(user_id, safe="")
    url = f"{keycloak_base.rstrip('/')}/admin/realms/{realm}/users/{encoded_id}"
    data = _get_json(url, headers={"Authorization": f"Bearer {token}"}, timeout_s=20)
    if not isinstance(data, dict):
        raise SystemExit("unexpected keycloak user payload")
    return data
|
||||
|
||||
|
||||
def _extract_attr(attributes: object, key: str) -> str:
|
||||
if not isinstance(attributes, dict):
|
||||
return ""
|
||||
value = attributes.get(key)
|
||||
if isinstance(value, list) and value and isinstance(value[0], str):
|
||||
return value[0]
|
||||
if isinstance(value, str):
|
||||
return value
|
||||
return ""
|
||||
|
||||
|
||||
def _imap_wait_for_verify_token(
    *,
    host: str,
    port: int,
    username: str,
    password: str,
    request_code: str,
    deadline_sec: int,
) -> str:
    """Poll an IMAP inbox until the verification mail for ``request_code`` arrives.

    Searches INBOX for messages containing ``request_code``, extracts the first
    URL containing a ``verify=`` query parameter from the text/plain body and
    returns that token. Raises SystemExit when ``deadline_sec`` elapses first.
    """
    # The internal mail host serves a non-public certificate, so verification
    # is disabled — but via the public API rather than the private
    # ssl._create_unverified_context() helper the original used.
    ssl_context = ssl.create_default_context()
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE
    deadline_at = time.monotonic() + deadline_sec

    with imaplib.IMAP4_SSL(host, port, ssl_context=ssl_context) as client:
        client.login(username, password)
        client.select("INBOX")

        while time.monotonic() < deadline_at:
            status, data = client.search(None, "TEXT", request_code)
            if status == "OK" and data and data[0]:
                ids = data[0].split()
                msg_id = ids[-1]  # newest matching message
                fetch_status, msg_data = client.fetch(msg_id, "(RFC822)")
                if fetch_status != "OK" or not msg_data:
                    time.sleep(2)
                    continue

                raw = msg_data[0][1] if isinstance(msg_data[0], tuple) and len(msg_data[0]) > 1 else None
                if not isinstance(raw, (bytes, bytearray)):
                    time.sleep(2)
                    continue

                message = email.message_from_bytes(raw)
                body = None
                if message.is_multipart():
                    # Prefer the first text/plain part.
                    for part in message.walk():
                        if part.get_content_type() == "text/plain":
                            payload = part.get_payload(decode=True)
                            if isinstance(payload, (bytes, bytearray)):
                                body = payload.decode(errors="replace")
                            break
                else:
                    payload = message.get_payload(decode=True)
                    if isinstance(payload, (bytes, bytearray)):
                        body = payload.decode(errors="replace")

                if not body:
                    time.sleep(2)
                    continue

                url = None
                for line in body.splitlines():
                    candidate = line.strip()
                    if "verify=" in candidate and candidate.startswith("http"):
                        url = candidate
                        break
                if not url:
                    # BUG FIX: the original raw string used "\\S", which in a
                    # raw string is a literal backslash + S and never matches;
                    # "\S" (any non-whitespace) is what was intended.
                    match = re.search(r"https?://\S+verify=\S+", body)
                    url = match.group(0) if match else None
                if not url:
                    time.sleep(2)
                    continue

                parsed = urllib.parse.urlparse(url)
                query = urllib.parse.parse_qs(parsed.query)
                token = query.get("verify", [""])[0]
                if isinstance(token, str) and token:
                    return token
            time.sleep(2)

    raise SystemExit("verification email not found before deadline")
|
||||
|
||||
|
||||
def main() -> int:
    """End-to-end portal onboarding flow check.

    Submits an access request, pulls the verification token from the robot
    mailbox over IMAP, verifies it, approves the request as the portal admin
    via Keycloak token exchange, polls until provisioning finishes, then
    asserts the resulting Keycloak user state. Returns 0 on success; any
    failure exits via SystemExit with a diagnostic message.
    """
    # --- configuration from environment -----------------------------------
    portal_base = _env("PORTAL_BASE_URL").rstrip("/")

    keycloak_base = _env("KEYCLOAK_ADMIN_URL").rstrip("/")
    realm = _env("KEYCLOAK_REALM", "atlas")
    kc_admin_client_id = _env("KEYCLOAK_ADMIN_CLIENT_ID")
    kc_admin_client_secret = _env("KEYCLOAK_ADMIN_CLIENT_SECRET")
    portal_e2e_client_id = _env("PORTAL_E2E_CLIENT_ID")
    portal_e2e_client_secret = _env("PORTAL_E2E_CLIENT_SECRET")
    portal_target_client_id = os.environ.get("PORTAL_TARGET_CLIENT_ID", "bstein-dev-home").strip() or "bstein-dev-home"
    portal_admin_username = os.environ.get("E2E_PORTAL_ADMIN_USERNAME", "bstein").strip() or "bstein"

    contact_email = os.environ.get("E2E_CONTACT_EMAIL", "robotuser@bstein.dev").strip()
    if not contact_email:
        raise SystemExit("E2E_CONTACT_EMAIL must not be empty")

    imap_host = os.environ.get("E2E_IMAP_HOST", "mailu-front.mailu-mailserver.svc.cluster.local").strip()
    imap_port = int(os.environ.get("E2E_IMAP_PORT", "993"))
    imap_keycloak_username = os.environ.get("E2E_IMAP_KEYCLOAK_USERNAME", "robotuser").strip()
    imap_wait_sec = int(os.environ.get("E2E_IMAP_WAIT_SECONDS", "90"))

    # --- resolve the robot mailbox credentials from Keycloak attributes ----
    try:
        token = _keycloak_client_token(keycloak_base, realm, kc_admin_client_id, kc_admin_client_secret)
    except SystemExit as exc:
        raise SystemExit(f"failed to fetch keycloak token for admin client {kc_admin_client_id!r}: {exc}")
    mailbox_user = _keycloak_find_user(keycloak_base, realm, token, imap_keycloak_username)
    if not mailbox_user:
        raise SystemExit(f"unable to locate Keycloak mailbox user {imap_keycloak_username!r}")
    mailbox_user_id = mailbox_user.get("id")
    if not isinstance(mailbox_user_id, str) or not mailbox_user_id:
        raise SystemExit("mailbox user missing id")

    mailbox_full = _keycloak_get_user(keycloak_base, realm, token, mailbox_user_id)
    mailbox_attrs = mailbox_full.get("attributes")
    mailu_email = _extract_attr(mailbox_attrs, "mailu_email")
    if not mailu_email:
        mailu_email = contact_email
    mailu_password = _extract_attr(mailbox_attrs, "mailu_app_password")
    if not mailu_password:
        raise SystemExit(f"Keycloak user {imap_keycloak_username!r} missing mailu_app_password attribute")

    # --- submit an access request with a unique, timestamped username ------
    username_prefix = os.environ.get("E2E_USERNAME_PREFIX", "e2e-user")
    now = int(time.time())
    username = f"{username_prefix}-{now}"

    submit_url = f"{portal_base}/api/access/request"
    submit_payload = {"username": username, "email": contact_email, "note": "portal onboarding e2e"}
    submit = None
    # Retry transient connection failures up to 5 times.
    for attempt in range(1, 6):
        try:
            submit = _post_json(submit_url, submit_payload, timeout_s=20)
            break
        except (http.client.RemoteDisconnected, TimeoutError, urllib.error.URLError) as exc:
            if attempt == 5:
                raise SystemExit(f"portal submit failed after {attempt} attempts: {exc}")
            time.sleep(2)
    if not isinstance(submit, dict):
        raise SystemExit("portal submit did not return json")

    request_code = submit.get("request_code")
    if not isinstance(request_code, str) or not request_code:
        raise SystemExit(f"request submit did not return request_code: {submit}")

    # --- email verification: fetch token via IMAP, post it back ------------
    verify_token = _imap_wait_for_verify_token(
        host=imap_host,
        port=imap_port,
        username=mailu_email,
        password=mailu_password,
        request_code=request_code,
        deadline_sec=imap_wait_sec,
    )
    verify_resp = _post_json(
        f"{portal_base}/api/access/request/verify",
        {"request_code": request_code, "token": verify_token},
        timeout_s=30,
    )
    if not isinstance(verify_resp, dict) or verify_resp.get("ok") is not True:
        raise SystemExit(f"unexpected verify response: {verify_resp}")

    # --- approve the request as the portal admin via token exchange --------
    portal_admin = _keycloak_find_user(keycloak_base, realm, token, portal_admin_username)
    if not portal_admin:
        raise SystemExit(f"unable to locate portal admin user {portal_admin_username!r} via Keycloak admin API")
    portal_admin_user_id = portal_admin.get("id")
    if not isinstance(portal_admin_user_id, str) or not portal_admin_user_id:
        raise SystemExit("portal admin user missing id")

    try:
        e2e_subject_token = _keycloak_client_token(keycloak_base, realm, portal_e2e_client_id, portal_e2e_client_secret)
    except SystemExit as exc:
        raise SystemExit(f"failed to fetch keycloak token for E2E client {portal_e2e_client_id!r}: {exc}")
    try:
        portal_bearer = _keycloak_token_exchange(
            keycloak_base=keycloak_base,
            realm=realm,
            client_id=portal_e2e_client_id,
            client_secret=portal_e2e_client_secret,
            subject_token=e2e_subject_token,
            requested_subject=portal_admin_user_id,
            audience=portal_target_client_id,
        )
    except SystemExit as exc:
        raise SystemExit(f"failed to exchange token for portal approval as {portal_admin_username!r}: {exc}")

    approve_url = f"{portal_base}/api/admin/access/requests/{urllib.parse.quote(username, safe='')}/approve"
    approve_timeout_s = int(os.environ.get("E2E_APPROVE_TIMEOUT_SECONDS", "180"))
    approve_attempts = int(os.environ.get("E2E_APPROVE_ATTEMPTS", "3"))
    approve_resp = None
    approve_error = None
    for attempt in range(1, approve_attempts + 1):
        try:
            approve_resp = _request_json("POST", approve_url, portal_bearer, payload=None, timeout_s=approve_timeout_s)
            approve_error = None
            break
        except (http.client.RemoteDisconnected, TimeoutError, urllib.error.URLError) as exc:
            approve_error = str(exc)
            if attempt == approve_attempts:
                break
            time.sleep(3)
    # A missing approval response is tolerated: approval may have landed even
    # though the connection dropped, so we fall through to status polling.
    if approve_resp is None:
        print(
            "WARNING: portal approval request did not return a response; "
            f"continuing to poll status (last_error={approve_error})"
        )
    elif not isinstance(approve_resp, dict) or approve_resp.get("ok") is not True:
        raise SystemExit(f"unexpected approval response: {approve_resp}")

    # --- poll until provisioning reaches a terminal state -------------------
    status_url = f"{portal_base}/api/access/request/status"
    deadline_s = int(os.environ.get("E2E_DEADLINE_SECONDS", "600"))
    interval_s = int(os.environ.get("E2E_POLL_SECONDS", "10"))
    deadline_at = time.monotonic() + deadline_s

    last_status = None
    last_error = None
    while True:
        try:
            status_payload = _post_json(status_url, {"request_code": request_code}, timeout_s=60)
            last_error = None
        except (http.client.RemoteDisconnected, TimeoutError, urllib.error.URLError) as exc:
            last_error = str(exc)
            if time.monotonic() >= deadline_at:
                raise SystemExit(f"timed out waiting for provisioning to finish (last error={last_error})")
            time.sleep(interval_s)
            continue
        status = status_payload.get("status")
        if isinstance(status, str):
            last_status = status

        if status in ("awaiting_onboarding", "ready"):
            break
        if status in ("denied", "unknown"):
            raise SystemExit(f"request transitioned to unexpected terminal status: {status_payload}")
        if time.monotonic() >= deadline_at:
            suffix = f" (last error={last_error})" if last_error else ""
            raise SystemExit(f"timed out waiting for provisioning to finish (last status={last_status}){suffix}")
        time.sleep(interval_s)

    # Refresh admin token (it may expire during the provisioning wait).
    token = _keycloak_client_token(keycloak_base, realm, kc_admin_client_id, kc_admin_client_secret)

    # --- assert the provisioned Keycloak user looks right -------------------
    user = _keycloak_find_user(keycloak_base, realm, token, username)
    if not user:
        raise SystemExit("expected Keycloak user was not created")
    user_id = user.get("id")
    if not isinstance(user_id, str) or not user_id:
        raise SystemExit("created user missing id")

    full = _keycloak_get_user(keycloak_base, realm, token, user_id)
    required_actions = full.get("requiredActions") or []
    required: set[str] = set()
    if isinstance(required_actions, list):
        required = {a for a in required_actions if isinstance(a, str)}

    unexpected = sorted(required.intersection({"UPDATE_PASSWORD", "VERIFY_EMAIL", "CONFIGURE_TOTP"}))
    if unexpected:
        raise SystemExit(
            "Keycloak user should not require actions at first login "
            f"(Vaultwarden-first onboarding): unexpected requiredActions={unexpected} full={sorted(required)}"
        )

    email_verified = full.get("emailVerified")
    if email_verified is not True:
        raise SystemExit(f"Keycloak user should have emailVerified=true: emailVerified={email_verified!r}")

    kc_email = full.get("email")
    if isinstance(kc_email, str) and contact_email and kc_email != contact_email:
        raise SystemExit(f"Keycloak user email mismatch: expected {contact_email!r} got {kc_email!r}")

    print(f"PASS: onboarding provisioning completed for {request_code} ({username})")
    return 0
|
||||
|
||||
|
||||
# Propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
193
services/bstein-dev-home/scripts/vaultwarden_cred_sync.py
Normal file
193
services/bstein-dev-home/scripts/vaultwarden_cred_sync.py
Normal file
@ -0,0 +1,193 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
import time
|
||||
from typing import Any, Iterable
|
||||
|
||||
import httpx
|
||||
|
||||
from atlas_portal import settings
|
||||
from atlas_portal.keycloak import admin_client
|
||||
from atlas_portal.vaultwarden import invite_user
|
||||
|
||||
|
||||
# Keycloak user-attribute keys this sync reads/writes to stay idempotent.
VAULTWARDEN_EMAIL_ATTR = "vaultwarden_email"  # invite address actually used
VAULTWARDEN_STATUS_ATTR = "vaultwarden_status"  # e.g. "invited" / "already_present"
VAULTWARDEN_SYNCED_AT_ATTR = "vaultwarden_synced_at"  # UTC timestamp of last sync
|
||||
|
||||
|
||||
def _iter_keycloak_users(page_size: int = 200) -> Iterable[dict[str, Any]]:
    """Yield every user in the realm via the Keycloak admin API, page by page.

    Raises RuntimeError when the admin client is not configured.
    """
    client = admin_client()
    if not client.ready():
        raise RuntimeError("keycloak admin client not configured")

    url = f"{settings.KEYCLOAK_ADMIN_URL}/admin/realms/{settings.KEYCLOAK_REALM}/users"
    offset = 0
    while True:
        # Request the full (non-brief) representation: the idempotency
        # attributes (vaultwarden_status / vaultwarden_email) may be omitted
        # from Keycloak's default brief representation.
        headers = client.headers()
        query = {"first": str(offset), "max": str(page_size), "briefRepresentation": "false"}
        with httpx.Client(timeout=settings.HTTP_CHECK_TIMEOUT_SEC) as http:
            resp = http.get(url, params=query, headers=headers)
            resp.raise_for_status()
            page = resp.json()

        if not isinstance(page, list) or not page:
            return

        for entry in page:
            if isinstance(entry, dict):
                yield entry

        # A short page means we've reached the end of the realm.
        if len(page) < page_size:
            return
        offset += page_size
|
||||
|
||||
|
||||
def _extract_attr(attrs: Any, key: str) -> str:
|
||||
if not isinstance(attrs, dict):
|
||||
return ""
|
||||
raw = attrs.get(key)
|
||||
if isinstance(raw, list):
|
||||
for item in raw:
|
||||
if isinstance(item, str) and item.strip():
|
||||
return item.strip()
|
||||
return ""
|
||||
if isinstance(raw, str) and raw.strip():
|
||||
return raw.strip()
|
||||
return ""
|
||||
|
||||
|
||||
def _vaultwarden_email_for_user(user: dict[str, Any]) -> str:
    """Choose the Vaultwarden invite address for a Keycloak user, or "".

    Preference order: explicit vaultwarden_email attribute, then mailu_email,
    then the Keycloak email when it belongs to the Mailu domain. Returns ""
    when no trustworthy address exists.
    """
    raw_username = user.get("username")
    username = raw_username.strip() if isinstance(raw_username, str) else ""
    if not username:
        return ""

    attrs = user.get("attributes")
    for attr_key in (VAULTWARDEN_EMAIL_ATTR, "mailu_email"):
        candidate = _extract_attr(attrs, attr_key)
        if candidate:
            return candidate

    raw_email = user.get("email")
    email = raw_email.strip() if isinstance(raw_email, str) else ""
    if email and email.lower().endswith(f"@{settings.MAILU_DOMAIN.lower()}"):
        return email

    # Don't guess an internal mailbox address until Mailu sync has run and
    # stored mailu_email — that would spam Vaultwarden invites that can never
    # be delivered (unknown recipient).
    return ""
|
||||
|
||||
|
||||
def _set_user_attribute_if_missing(username: str, user: dict[str, Any], key: str, value: str) -> None:
    """Write ``key`` = ``value`` on the user unless blank or already present."""
    cleaned = (value or "").strip()
    if not cleaned:
        return
    if _extract_attr(user.get("attributes"), key):
        return
    admin_client().set_user_attribute(username, key, cleaned)
|
||||
|
||||
|
||||
def _set_user_attribute(username: str, key: str, value: str) -> None:
    """Unconditionally write ``key`` = ``value`` on the user; blanks are no-ops."""
    cleaned = (value or "").strip()
    if not cleaned:
        return
    admin_client().set_user_attribute(username, key, cleaned)
|
||||
|
||||
|
||||
def main() -> int:
    """Invite every eligible Keycloak user to Vaultwarden, idempotently.

    Iterates realm users, skips blank/disabled/service accounts and users
    already marked invited/present, invites the rest, and records the outcome
    back onto the user's Keycloak attributes. Returns 0 on full success,
    2 when any invite failed.
    """
    processed = 0
    created = 0
    skipped = 0
    failures = 0

    for user in _iter_keycloak_users():
        username = (user.get("username") if isinstance(user.get("username"), str) else "") or ""
        username = username.strip()
        if not username:
            skipped += 1
            continue

        # Disabled accounts are skipped; missing/True "enabled" is treated as active.
        enabled = user.get("enabled")
        if enabled is False:
            skipped += 1
            continue

        # Service accounts never get a Vaultwarden mailbox.
        if user.get("serviceAccountClientId") or username.startswith("service-account-"):
            skipped += 1
            continue

        # Fetch the full user payload so we can reliably read attributes (and skip re-invites).
        user_id = (user.get("id") if isinstance(user.get("id"), str) else "") or ""
        user_id = user_id.strip()
        full_user = user
        if user_id:
            try:
                full_user = admin_client().get_user(user_id)
            except Exception:
                # Best-effort: fall back to the (possibly brief) listing payload.
                full_user = user

        current_status = _extract_attr(full_user.get("attributes"), VAULTWARDEN_STATUS_ATTR)
        current_synced_at = _extract_attr(full_user.get("attributes"), VAULTWARDEN_SYNCED_AT_ATTR)
        email = _vaultwarden_email_for_user(full_user)
        if not email:
            print(f"skip {username}: missing email", file=sys.stderr)
            skipped += 1
            continue

        # Record the chosen invite address once; failures here must not block the invite.
        try:
            _set_user_attribute_if_missing(username, full_user, VAULTWARDEN_EMAIL_ATTR, email)
        except Exception:
            pass

        # If we've already successfully invited or confirmed presence, do not re-invite on every cron run.
        # Vaultwarden returns 409 for "already exists", which is idempotent but noisy and can trigger rate limits.
        if current_status in {"invited", "already_present"}:
            if not current_synced_at:
                # Backfill the sync timestamp for users recorded before it existed.
                try:
                    _set_user_attribute(
                        username,
                        VAULTWARDEN_SYNCED_AT_ATTR,
                        time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
                    )
                except Exception:
                    pass
            skipped += 1
            continue

        processed += 1
        result = invite_user(email)
        if result.ok:
            created += 1
            print(f"ok {username}: {result.status}")
            # Persist outcome for idempotency; attribute writes are best-effort.
            try:
                _set_user_attribute(username, VAULTWARDEN_STATUS_ATTR, result.status)
                _set_user_attribute(username, VAULTWARDEN_SYNCED_AT_ATTR, time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()))
            except Exception:
                pass
        else:
            failures += 1
            print(f"err {username}: {result.status} {result.detail}", file=sys.stderr)
            # Record the failure status too, so operators can inspect it in Keycloak.
            try:
                _set_user_attribute(username, VAULTWARDEN_STATUS_ATTR, result.status)
                _set_user_attribute(username, VAULTWARDEN_SYNCED_AT_ATTR, time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()))
            except Exception:
                pass

    print(
        f"done processed={processed} created_or_present={created} skipped={skipped} failures={failures}",
        file=sys.stderr,
    )
    return 0 if failures == 0 else 2
|
||||
|
||||
|
||||
# Exit with main()'s status (0 = success, 2 = at least one invite failed).
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
59
services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml
Normal file
59
services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml
Normal file
@ -0,0 +1,59 @@
|
||||
# services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml
|
||||
apiVersion: batch/v1
|
||||
kind: CronJob
|
||||
metadata:
|
||||
name: vaultwarden-cred-sync
|
||||
namespace: bstein-dev-home
|
||||
spec:
|
||||
schedule: "*/15 * * * *"
|
||||
concurrencyPolicy: Forbid
|
||||
successfulJobsHistoryLimit: 1
|
||||
failedJobsHistoryLimit: 3
|
||||
jobTemplate:
|
||||
spec:
|
||||
backoffLimit: 0
|
||||
template:
|
||||
spec:
|
||||
serviceAccountName: bstein-dev-home
|
||||
restartPolicy: Never
|
||||
nodeSelector:
|
||||
kubernetes.io/arch: arm64
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
imagePullSecrets:
|
||||
- name: harbor-bstein-robot
|
||||
containers:
|
||||
- name: sync
|
||||
image: registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"}
|
||||
imagePullPolicy: Always
|
||||
command:
|
||||
- python
|
||||
- /scripts/vaultwarden_cred_sync.py
|
||||
env:
|
||||
- name: PYTHONPATH
|
||||
value: /app
|
||||
- name: KEYCLOAK_ENABLED
|
||||
value: "true"
|
||||
- name: KEYCLOAK_REALM
|
||||
value: atlas
|
||||
- name: KEYCLOAK_ADMIN_URL
|
||||
value: http://keycloak.sso.svc.cluster.local
|
||||
- name: KEYCLOAK_ADMIN_REALM
|
||||
value: atlas
|
||||
- name: KEYCLOAK_ADMIN_CLIENT_ID
|
||||
value: bstein-dev-home-admin
|
||||
- name: KEYCLOAK_ADMIN_CLIENT_SECRET
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: bstein-dev-home-keycloak-admin
|
||||
key: client_secret
|
||||
- name: HTTP_CHECK_TIMEOUT_SEC
|
||||
value: "20"
|
||||
volumeMounts:
|
||||
- name: vaultwarden-cred-sync-script
|
||||
mountPath: /scripts
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: vaultwarden-cred-sync-script
|
||||
configMap:
|
||||
name: vaultwarden-cred-sync-script
|
||||
defaultMode: 0555
|
||||
@ -1,31 +0,0 @@
|
||||
# services/ci-demo/deployment.yaml
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: ci-demo
|
||||
namespace: ci-demo
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: ci-demo
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: ci-demo
|
||||
spec:
|
||||
nodeSelector:
|
||||
hardware: rpi4
|
||||
containers:
|
||||
- name: ci-demo
|
||||
image: registry.bstein.dev/infra/ci-demo:latest
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 8080
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /
|
||||
port: http
|
||||
initialDelaySeconds: 2
|
||||
periodSeconds: 5
|
||||
|
||||
@ -1,24 +0,0 @@
|
||||
# services/ci-demo/image.yaml
|
||||
apiVersion: image.toolkit.fluxcd.io/v1
|
||||
kind: ImageRepository
|
||||
metadata:
|
||||
name: ci-demo
|
||||
namespace: flux-system
|
||||
spec:
|
||||
image: registry.bstein.dev/infra/ci-demo
|
||||
interval: 1m0s
|
||||
---
|
||||
apiVersion: image.toolkit.fluxcd.io/v1
|
||||
kind: ImagePolicy
|
||||
metadata:
|
||||
name: ci-demo
|
||||
namespace: flux-system
|
||||
spec:
|
||||
imageRepositoryRef:
|
||||
name: ci-demo
|
||||
filterTags:
|
||||
pattern: '^v(?P<version>0\.0\.0-\d+)$'
|
||||
extract: '$version'
|
||||
policy:
|
||||
semver:
|
||||
range: ">=0.0.0-0"
|
||||
@ -1,11 +0,0 @@
|
||||
# services/ci-demo/kustomization.yaml
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- namespace.yaml
|
||||
- image.yaml
|
||||
- deployment.yaml
|
||||
- service.yaml
|
||||
images:
|
||||
- name: registry.bstein.dev/infra/ci-demo
|
||||
newTag: registry.bstein.dev/infra/ci-demo:v0.0.0-3 # {"$imagepolicy": "flux-system:ci-demo"}
|
||||
@ -1,6 +0,0 @@
|
||||
# services/ci-demo/namespace.yaml
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: ci-demo
|
||||
|
||||
31
services/comms/NOTES.md
Normal file
31
services/comms/NOTES.md
Normal file
@ -0,0 +1,31 @@
|
||||
# services/comms/NOTES.md
|
||||
|
||||
Purpose: Matrix + Element + LiveKit stack for Othrys (live.bstein.dev).
|
||||
|
||||
Core flow
|
||||
- Matrix Authentication Service (MAS) handles login/SSO and issues Matrix access tokens.
|
||||
- Synapse is the homeserver; MAS fronts login, Synapse serves client/server APIs.
|
||||
- Element Web provides the main UI; Element Call embeds LiveKit for group video.
|
||||
- LiveKit handles SFU media; Coturn provides TURN for NAT traversal.
|
||||
- matrix-guest-register provisions MAS guest accounts and performs MAS password login to mint device-bound guest tokens (no Keycloak).
|
||||
|
||||
Operational jobs
|
||||
- mas-db-ensure-job: ensures MAS database role/database + secret in comms.
|
||||
- comms-secrets-ensure-job: creates runtime secrets (TURN, LiveKit, Synapse, atlasbot).
|
||||
- synapse-signingkey-ensure-job: ensures Synapse signing key secret.
|
||||
- synapse-seeder-admin-ensure-job: ensures Synapse admin user exists.
|
||||
- synapse-user-seed-job: seeds atlasbot + othrys-seeder users/passwords.
|
||||
- mas-local-users-ensure-job: ensures MAS local users exist (seeder/bot).
|
||||
- seed-othrys-room: (suspended) creates Othrys + joins locals.
|
||||
- reset-othrys-room: suspended CronJob for a manual room reset + pin invite.
|
||||
- pin-othrys-invite: (suspended) pin invite message if missing.
|
||||
- guest-name-randomizer: renames numeric/guest users to adj-noun names.
|
||||
- bstein-force-leave: one-off room leave cleanup.
|
||||
|
||||
Manual re-runs
|
||||
- Unsuspend a CronJob only when needed; re-suspend after completion.
|
||||
|
||||
Ports
|
||||
- Traefik (HTTPS) via LB on 192.168.22.9.
|
||||
- Coturn LB on 192.168.22.5 (3478/5349 + UDP range).
|
||||
- LiveKit LB on 192.168.22.6 (7880/7881/7882/7883).
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user