feature/sso-hardening #9

Merged
bstein merged 685 commits from feature/sso-hardening into main 2026-01-13 20:23:26 +00:00
325 changed files with 37644 additions and 1317 deletions

6
.gitignore vendored
View File

@ -1,2 +1,8 @@
*.md *.md
!README.md !README.md
!knowledge/**/*.md
!services/comms/knowledge/**/*.md
__pycache__/
*.py[cod]
.pytest_cache
.venv

View File

@ -5,8 +5,9 @@ resources:
- ../../services/crypto - ../../services/crypto
- ../../services/gitea - ../../services/gitea
- ../../services/jellyfin - ../../services/jellyfin
- ../../services/jitsi - ../../services/comms
- ../../services/monitoring - ../../services/monitoring
- ../../services/logging
- ../../services/pegasus - ../../services/pegasus
- ../../services/vault - ../../services/vault
- ../../services/bstein-dev-home - ../../services/bstein-dev-home

View File

@ -0,0 +1,23 @@
# clusters/atlas/flux-system/applications/ai-llm/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: ai-llm
namespace: flux-system
spec:
interval: 10m
path: ./services/ai-llm
targetNamespace: ai
prune: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
wait: true
healthChecks:
- apiVersion: apps/v1
kind: Deployment
name: ollama
namespace: ai
dependsOn:
- name: core

View File

@ -1,26 +0,0 @@
# clusters/atlas/flux-system/applications/ci-demo/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: ci-demo
namespace: flux-system
spec:
interval: 1m0s
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
git:
checkout:
ref:
branch: feature/ci-gitops
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(ci-demo): apply image updates"
push:
branch: feature/ci-gitops
update:
strategy: Setters
path: services/ci-demo

View File

@ -0,0 +1,17 @@
# clusters/atlas/flux-system/applications/communication/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: comms
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: flux-system
path: ./services/comms
targetNamespace: comms
timeout: 2m
dependsOn:
- name: traefik

View File

@ -15,5 +15,6 @@ spec:
namespace: flux-system namespace: flux-system
dependsOn: dependsOn:
- name: core - name: core
- name: openldap
wait: true wait: true
timeout: 5m timeout: 5m

View File

@ -16,8 +16,12 @@ spec:
- name: helm - name: helm
- name: traefik - name: traefik
healthChecks: healthChecks:
- apiVersion: helm.toolkit.fluxcd.io/v2 - apiVersion: apps/v1
kind: HelmRelease kind: Deployment
name: jenkins
namespace: jenkins
- apiVersion: v1
kind: Service
name: jenkins name: jenkins
namespace: jenkins namespace: jenkins
wait: false wait: false

View File

@ -4,7 +4,8 @@ kind: Kustomization
resources: resources:
- gitea/kustomization.yaml - gitea/kustomization.yaml
- vault/kustomization.yaml - vault/kustomization.yaml
- jitsi/kustomization.yaml - vaultwarden/kustomization.yaml
- comms/kustomization.yaml
- crypto/kustomization.yaml - crypto/kustomization.yaml
- monerod/kustomization.yaml - monerod/kustomization.yaml
- pegasus/kustomization.yaml - pegasus/kustomization.yaml
@ -16,9 +17,14 @@ resources:
- jellyfin/kustomization.yaml - jellyfin/kustomization.yaml
- xmr-miner/kustomization.yaml - xmr-miner/kustomization.yaml
- sui-metrics/kustomization.yaml - sui-metrics/kustomization.yaml
- openldap/kustomization.yaml
- keycloak/kustomization.yaml - keycloak/kustomization.yaml
- oauth2-proxy/kustomization.yaml - oauth2-proxy/kustomization.yaml
- mailu/kustomization.yaml - mailu/kustomization.yaml
- jenkins/kustomization.yaml - jenkins/kustomization.yaml
- ci-demo/kustomization.yaml - ai-llm/kustomization.yaml
- ci-demo/image-automation.yaml - nextcloud/kustomization.yaml
- nextcloud-mail-sync/kustomization.yaml
- postgres/kustomization.yaml
- outline/kustomization.yaml
- planka/kustomization.yaml

View File

@ -0,0 +1,17 @@
# clusters/atlas/flux-system/applications/nextcloud-mail-sync/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: nextcloud-mail-sync
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: flux-system
path: ./services/nextcloud-mail-sync
targetNamespace: nextcloud
timeout: 2m
dependsOn:
- name: keycloak

View File

@ -0,0 +1,16 @@
# clusters/atlas/flux-system/applications/nextcloud/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1beta2
kind: Kustomization
metadata:
name: nextcloud
namespace: flux-system
spec:
interval: 10m
path: ./services/nextcloud
targetNamespace: nextcloud
prune: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
wait: true

View File

@ -1,18 +1,18 @@
# clusters/atlas/flux-system/applications/jitsi/kustomization.yaml # clusters/atlas/flux-system/applications/openldap/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1 apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization kind: Kustomization
metadata: metadata:
name: jitsi name: openldap
namespace: flux-system namespace: flux-system
spec: spec:
interval: 10m interval: 10m
path: ./services/jitsi
targetNamespace: jitsi
prune: true prune: true
sourceRef: sourceRef:
kind: GitRepository kind: GitRepository
name: flux-system name: flux-system
namespace: flux-system namespace: flux-system
path: ./services/openldap
targetNamespace: sso
dependsOn: dependsOn:
- name: core - name: core
wait: true wait: true

View File

@ -0,0 +1,28 @@
# clusters/atlas/flux-system/applications/outline/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: outline
namespace: flux-system
spec:
interval: 10m
path: ./services/outline
prune: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: outline
dependsOn:
- name: keycloak
- name: mailu
- name: traefik
healthChecks:
- apiVersion: apps/v1
kind: Deployment
name: outline
namespace: outline
- apiVersion: v1
kind: Service
name: outline
namespace: outline
wait: false

View File

@ -0,0 +1,28 @@
# clusters/atlas/flux-system/applications/planka/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: planka
namespace: flux-system
spec:
interval: 10m
path: ./services/planka
prune: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: planka
dependsOn:
- name: keycloak
- name: mailu
- name: traefik
healthChecks:
- apiVersion: apps/v1
kind: Deployment
name: planka
namespace: planka
- apiVersion: v1
kind: Service
name: planka
namespace: planka
wait: false

View File

@ -0,0 +1,24 @@
# clusters/atlas/flux-system/applications/postgres/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: postgres
namespace: flux-system
spec:
interval: 10m
path: ./services/postgres
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: postgres
dependsOn:
- name: vault
- name: vault-csi
healthChecks:
- apiVersion: apps/v1
kind: StatefulSet
name: postgres
namespace: postgres
wait: true

View File

@ -0,0 +1,20 @@
# clusters/atlas/flux-system/applications/vaultwarden/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: vaultwarden
namespace: flux-system
spec:
interval: 10m
suspend: false
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
path: ./services/vaultwarden
targetNamespace: vaultwarden
prune: true
wait: true
dependsOn:
- name: helm
- name: traefik

View File

@ -8,7 +8,7 @@ metadata:
spec: spec:
interval: 1m0s interval: 1m0s
ref: ref:
branch: main branch: feature/sso-hardening
secretRef: secretRef:
name: flux-system-gitea name: flux-system-gitea
url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git

View File

@ -4,7 +4,11 @@ kind: Kustomization
resources: resources:
- core/kustomization.yaml - core/kustomization.yaml
- helm/kustomization.yaml - helm/kustomization.yaml
- metallb/kustomization.yaml
- traefik/kustomization.yaml - traefik/kustomization.yaml
- gitops-ui/kustomization.yaml - gitops-ui/kustomization.yaml
- monitoring/kustomization.yaml - monitoring/kustomization.yaml
- logging/kustomization.yaml
- maintenance/kustomization.yaml
- longhorn-ui/kustomization.yaml - longhorn-ui/kustomization.yaml
- ../platform/vault-csi/kustomization.yaml

View File

@ -0,0 +1,14 @@
# clusters/atlas/flux-system/platform/logging/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: logging
namespace: flux-system
spec:
interval: 10m
path: ./services/logging
prune: true
sourceRef:
kind: GitRepository
name: flux-system
wait: false

View File

@ -1,17 +1,14 @@
# clusters/atlas/flux-system/applications/ci-demo/kustomization.yaml # clusters/atlas/flux-system/platform/maintenance/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1 apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization kind: Kustomization
metadata: metadata:
name: ci-demo name: maintenance
namespace: flux-system namespace: flux-system
spec: spec:
interval: 10m interval: 10m
path: ./services/ci-demo path: ./services/maintenance
prune: true prune: true
sourceRef: sourceRef:
kind: GitRepository kind: GitRepository
name: flux-system name: flux-system
namespace: flux-system
dependsOn:
- name: core
wait: false wait: false

View File

@ -0,0 +1,16 @@
# clusters/atlas/flux-system/platform/metallb/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: metallb
namespace: flux-system
spec:
interval: 30m
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
path: ./infrastructure/metallb
prune: true
wait: true
targetNamespace: metallb-system

View File

@ -15,4 +15,5 @@ spec:
namespace: flux-system namespace: flux-system
dependsOn: dependsOn:
- name: core - name: core
- name: metallb
wait: true wait: true

View File

@ -0,0 +1,16 @@
# clusters/atlas/flux-system/platform/vault-csi/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: vault-csi
namespace: flux-system
spec:
interval: 30m
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
path: ./infrastructure/vault-csi
prune: true
wait: true
targetNamespace: kube-system

View File

@ -5,3 +5,4 @@ resources:
- ../../../infrastructure/modules/base - ../../../infrastructure/modules/base
- ../../../infrastructure/modules/profiles/atlas-ha - ../../../infrastructure/modules/profiles/atlas-ha
- ../../../infrastructure/sources/cert-manager/letsencrypt.yaml - ../../../infrastructure/sources/cert-manager/letsencrypt.yaml
- ../../../infrastructure/metallb

View File

@ -0,0 +1,16 @@
FROM --platform=$BUILDPLATFORM opensearchproject/data-prepper:2.8.0 AS source
FROM --platform=$TARGETPLATFORM eclipse-temurin:17-jre
ENV DATA_PREPPER_PATH=/usr/share/data-prepper
RUN useradd -u 10001 -M -U -d / -s /usr/sbin/nologin data_prepper \
&& mkdir -p /var/log/data-prepper
COPY --from=source /usr/share/data-prepper /usr/share/data-prepper
RUN chown -R 10001:10001 /usr/share/data-prepper /var/log/data-prepper
USER 10001
WORKDIR /usr/share/data-prepper
CMD ["bin/data-prepper"]

View File

@ -1,5 +1,18 @@
# hosts/roles/titan_jh/tasks/main.yaml # hosts/roles/titan_jh/tasks/main.yaml
--- ---
- name: Install node exporter
ansible.builtin.package:
name: prometheus-node-exporter
state: present
tags: ['jumphost', 'monitoring']
- name: Enable node exporter
ansible.builtin.service:
name: prometheus-node-exporter
enabled: true
state: started
tags: ['jumphost', 'monitoring']
- name: Placeholder for jumphost hardening - name: Placeholder for jumphost hardening
ansible.builtin.debug: ansible.builtin.debug:
msg: "Harden SSH, manage bastion tooling, and configure audit logging here." msg: "Harden SSH, manage bastion tooling, and configure audit logging here."

View File

@ -0,0 +1,20 @@
# infrastructure/metallb/ippool.yaml
apiVersion: metallb.io/v1beta1
kind: IPAddressPool
metadata:
name: communication-pool
namespace: metallb-system
spec:
addresses:
- 192.168.22.4-192.168.22.6
- 192.168.22.9-192.168.22.9
autoAssign: true
---
apiVersion: metallb.io/v1beta1
kind: L2Advertisement
metadata:
name: communication-adv
namespace: metallb-system
spec:
ipAddressPools:
- communication-pool

View File

@ -0,0 +1,10 @@
# infrastructure/metallb/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- metallb-rendered.yaml
- ippool.yaml
patchesStrategicMerge:
- patches/node-placement.yaml
- patches/speaker-loglevel.yaml

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,5 @@
# infrastructure/metallb/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: metallb-system

View File

@ -0,0 +1,27 @@
# infrastructure/metallb/patches/node-placement.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: metallb-controller
namespace: metallb-system
spec:
template:
spec:
containers:
- name: controller
args:
- --port=7472
- --log-level=info
- --webhook-mode=enabled
- --tls-min-version=VersionTLS12
- --lb-class=metallb
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi4
- rpi5

View File

@ -0,0 +1,15 @@
# infrastructure/metallb/patches/speaker-loglevel.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: metallb-speaker
namespace: metallb-system
spec:
template:
spec:
containers:
- name: speaker
args:
- --port=7472
- --log-level=info
- --lb-class=metallb

View File

@ -2,6 +2,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1 apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization kind: Kustomization
resources: resources:
- ../components/device-plugin-config
- ../components/device-plugin-jetson - ../components/device-plugin-jetson
- ../components/device-plugin-minipc - ../components/device-plugin-minipc
- ../components/device-plugin-tethys - ../components/device-plugin-tethys

View File

@ -0,0 +1,15 @@
# infrastructure/modules/profiles/components/device-plugin-config/configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: nvidia-device-plugin-config
namespace: kube-system
data:
config.yaml: |
version: v1
sharing:
timeSlicing:
renameByDefault: true
resources:
- name: nvidia.com/gpu
replicas: 4

View File

@ -0,0 +1,5 @@
# infrastructure/modules/profiles/components/device-plugin-config/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- configmap.yaml

View File

@ -30,7 +30,8 @@ spec:
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
args: args:
- "--fail-on-init-error=false" - "--fail-on-init-error=false"
- "--device-list-strategy=envvar,cdi" - "--device-list-strategy=envvar"
- "--config-file=/config/config.yaml"
securityContext: securityContext:
privileged: true privileged: true
env: env:
@ -41,7 +42,12 @@ spec:
volumeMounts: volumeMounts:
- name: device-plugin - name: device-plugin
mountPath: /var/lib/kubelet/device-plugins mountPath: /var/lib/kubelet/device-plugins
- name: config
mountPath: /config
volumes: volumes:
- name: device-plugin - name: device-plugin
hostPath: hostPath:
path: /var/lib/kubelet/device-plugins path: /var/lib/kubelet/device-plugins
- name: config
configMap:
name: nvidia-device-plugin-config

View File

@ -32,6 +32,7 @@ spec:
- "--fail-on-init-error=false" - "--fail-on-init-error=false"
- "--device-list-strategy=envvar" - "--device-list-strategy=envvar"
- "--mig-strategy=none" - "--mig-strategy=none"
- "--config-file=/config/config.yaml"
securityContext: securityContext:
privileged: true privileged: true
env: env:
@ -42,7 +43,12 @@ spec:
volumeMounts: volumeMounts:
- name: device-plugin - name: device-plugin
mountPath: /var/lib/kubelet/device-plugins mountPath: /var/lib/kubelet/device-plugins
- name: config
mountPath: /config
volumes: volumes:
- name: device-plugin - name: device-plugin
hostPath: hostPath:
path: /var/lib/kubelet/device-plugins path: /var/lib/kubelet/device-plugins
- name: config
configMap:
name: nvidia-device-plugin-config

View File

@ -33,6 +33,7 @@ spec:
- "--fail-on-init-error=false" - "--fail-on-init-error=false"
- "--device-list-strategy=envvar" - "--device-list-strategy=envvar"
- "--mig-strategy=none" - "--mig-strategy=none"
- "--config-file=/config/config.yaml"
securityContext: securityContext:
privileged: true privileged: true
env: env:
@ -43,7 +44,12 @@ spec:
volumeMounts: volumeMounts:
- name: device-plugin - name: device-plugin
mountPath: /var/lib/kubelet/device-plugins mountPath: /var/lib/kubelet/device-plugins
- name: config
mountPath: /config
volumes: volumes:
- name: device-plugin - name: device-plugin
hostPath: hostPath:
path: /var/lib/kubelet/device-plugins path: /var/lib/kubelet/device-plugins
- name: config
configMap:
name: nvidia-device-plugin-config

View File

@ -2,4 +2,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1 apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization kind: Kustomization
resources: resources:
- ../components/device-plugin-config
- ../components/device-plugin-tethys - ../components/device-plugin-tethys

View File

@ -0,0 +1,9 @@
# infrastructure/sources/helm/fluent-bit.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: fluent
namespace: flux-system
spec:
interval: 1h
url: https://fluent.github.io/helm-charts

View File

@ -2,11 +2,15 @@
apiVersion: kustomize.config.k8s.io/v1beta1 apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization kind: Kustomization
resources: resources:
- fluent-bit.yaml
- grafana.yaml - grafana.yaml
- hashicorp.yaml - hashicorp.yaml
- jetstack.yaml - jetstack.yaml
- jenkins.yaml - jenkins.yaml
- mailu.yaml - mailu.yaml
- opentelemetry.yaml
- opensearch.yaml
- harbor.yaml - harbor.yaml
- prometheus.yaml - prometheus.yaml
- victoria-metrics.yaml - victoria-metrics.yaml
- secrets-store-csi.yaml

View File

@ -0,0 +1,9 @@
# infrastructure/sources/helm/opensearch.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: opensearch
namespace: flux-system
spec:
interval: 1h
url: https://opensearch-project.github.io/helm-charts

View File

@ -0,0 +1,9 @@
# infrastructure/sources/helm/opentelemetry.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: opentelemetry
namespace: flux-system
spec:
interval: 1h
url: https://open-telemetry.github.io/opentelemetry-helm-charts

View File

@ -0,0 +1,9 @@
# infrastructure/sources/helm/secrets-store-csi.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: secrets-store-csi-driver
namespace: flux-system
spec:
interval: 1h
url: https://kubernetes-sigs.github.io/secrets-store-csi-driver/charts

View File

@ -71,9 +71,10 @@ rules:
- tlsoptions - tlsoptions
- tlsstores - tlsstores
- serverstransports - serverstransports
- serverstransporttcps
- traefikservices - traefikservices
- middlewaretcps
verbs: verbs:
- get - get
- list - list
- watch - watch

View File

@ -10,3 +10,4 @@ resources:
- clusterrole.yaml - clusterrole.yaml
- clusterrolebinding.yaml - clusterrolebinding.yaml
- service.yaml - service.yaml
- traefik-service-lb.yaml

View File

@ -0,0 +1,24 @@
# infrastructure/traefik/traefik-service-lb.yaml
apiVersion: v1
kind: Service
metadata:
name: traefik
namespace: kube-system
annotations:
metallb.universe.tf/address-pool: communication-pool
spec:
type: LoadBalancer
loadBalancerClass: metallb
loadBalancerIP: 192.168.22.9
ports:
- name: web
port: 80
targetPort: web
protocol: TCP
- name: websecure
port: 443
targetPort: websecure
protocol: TCP
selector:
app.kubernetes.io/instance: traefik-kube-system
app.kubernetes.io/name: traefik

View File

@ -0,0 +1,6 @@
# infrastructure/vault-csi/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- secrets-store-csi-driver.yaml
- vault-csi-provider.yaml

View File

@ -0,0 +1,20 @@
# infrastructure/vault-csi/secrets-store-csi-driver.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: secrets-store-csi-driver
namespace: kube-system
spec:
interval: 15m
chart:
spec:
chart: secrets-store-csi-driver
version: "~1.3.0"
sourceRef:
kind: HelmRepository
name: secrets-store-csi-driver
namespace: flux-system
values:
syncSecret:
enabled: true
enableSecretRotation: false

View File

@ -0,0 +1,111 @@
# infrastructure/vault-csi/vault-csi-provider.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: vault-csi-provider
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: vault-csi-provider-clusterrole
rules:
- apiGroups: [""]
resources: ["serviceaccounts/token"]
verbs: ["create"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: vault-csi-provider-clusterrolebinding
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: vault-csi-provider-clusterrole
subjects:
- kind: ServiceAccount
name: vault-csi-provider
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: vault-csi-provider-role
namespace: kube-system
rules:
- apiGroups: [""]
resources: ["secrets"]
verbs: ["get"]
resourceNames: ["vault-csi-provider-hmac-key"]
- apiGroups: [""]
resources: ["secrets"]
verbs: ["create"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: vault-csi-provider-rolebinding
namespace: kube-system
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: vault-csi-provider-role
subjects:
- kind: ServiceAccount
name: vault-csi-provider
namespace: kube-system
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: vault-csi-provider
namespace: kube-system
labels: { app.kubernetes.io/name: vault-csi-provider }
spec:
updateStrategy:
type: RollingUpdate
selector:
matchLabels: { app.kubernetes.io/name: vault-csi-provider }
template:
metadata:
labels: { app.kubernetes.io/name: vault-csi-provider }
spec:
serviceAccountName: vault-csi-provider
containers:
- name: provider-vault-installer
image: hashicorp/vault-csi-provider:1.7.0
imagePullPolicy: IfNotPresent
args:
- -endpoint=/provider/vault.sock
- -log-level=info
resources:
requests: { cpu: 50m, memory: 100Mi }
limits: { cpu: 50m, memory: 100Mi }
volumeMounts:
- { name: providervol, mountPath: "/provider" }
livenessProbe:
httpGet:
path: "/health/ready"
port: 8080
scheme: "HTTP"
failureThreshold: 2
initialDelaySeconds: 5
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 3
readinessProbe:
httpGet:
path: "/health/ready"
port: 8080
scheme: "HTTP"
failureThreshold: 2
initialDelaySeconds: 5
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 3
volumes:
- name: providervol
hostPath:
path: "/var/run/secrets-store-csi-providers"
nodeSelector:
kubernetes.io/os: linux

22
knowledge/INDEX.md Normal file
View File

@ -0,0 +1,22 @@
Atlas Knowledge Base (KB)
This folder is the source-of-truth “memory” for Atlas/Titan assistants (and for humans). It is designed to be:
- Accurate (grounded in GitOps + read-only cluster tools)
- Maintainable (small docs + deterministic generators)
- Safe (no secrets; refer to Secret/Vault paths by name only)
Layout
- `knowledge/runbooks/`: human-written docs (short, chunkable Markdown).
- `knowledge/catalog/`: generated machine-readable facts (YAML/JSON).
- `knowledge/diagrams/`: generated Mermaid diagrams (`.mmd`) derived from the catalog.
Regeneration
- Update manifests/docs, then regenerate generated artifacts:
- `python scripts/knowledge_render_atlas.py --write`
Authoring rules
- Never include secret values. Prefer `secretRef` names or Vault paths like `kv/atlas/...`.
- Prefer stable identifiers: Kubernetes `namespace/name`, DNS hostnames, Flux kustomization paths.
- Keep each runbook small; one topic per file; use headings.
- When in doubt, link to the exact file path in this repo that configures the behavior.

View File

@ -0,0 +1,8 @@
{
"counts": {
"helmrelease_host_hints": 7,
"http_endpoints": 35,
"services": 44,
"workloads": 49
}
}

2771
knowledge/catalog/atlas.json Normal file

File diff suppressed because it is too large Load Diff

1786
knowledge/catalog/atlas.yaml Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,89 @@
[
{
"path": "runbooks/ci-gitea-jenkins.md",
"title": "CI: Gitea \u2192 Jenkins pipeline",
"tags": [
"atlas",
"ci",
"gitea",
"jenkins"
],
"entrypoints": [
"scm.bstein.dev",
"ci.bstein.dev"
],
"source_paths": [
"services/gitea",
"services/jenkins",
"scripts/jenkins_cred_sync.sh",
"scripts/gitea_cred_sync.sh"
],
"body": "# CI: Gitea \u2192 Jenkins pipeline\n\n## What this is\nAtlas uses Gitea for source control and Jenkins for CI. Authentication is via Keycloak (SSO).\n\n## Where it is configured\n- Gitea manifests: `services/gitea/`\n- Jenkins manifests: `services/jenkins/`\n- Credential sync helpers: `scripts/gitea_cred_sync.sh`, `scripts/jenkins_cred_sync.sh`\n\n## What users do (typical flow)\n- Create a repo in Gitea.\n- Create/update a Jenkins job/pipeline that can fetch the repo.\n- Configure a webhook (or SCM polling) so pushes trigger builds.\n\n## Troubleshooting (common)\n- \u201cWebhook not firing\u201d: confirm ingress host, webhook URL, and Jenkins job is reachable.\n- \u201cAuth denied cloning\u201d: confirm Keycloak group membership and that Jenkins has a valid token/credential configured."
},
{
"path": "runbooks/comms-verify.md",
"title": "Othrys verification checklist",
"tags": [
"comms",
"matrix",
"element",
"livekit"
],
"entrypoints": [
"https://live.bstein.dev",
"https://matrix.live.bstein.dev"
],
"source_paths": [],
"body": "1) Guest join:\n- Open a private window and visit:\n `https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join`\n- Confirm the guest join flow works and the displayname becomes `<word>-<word>`.\n\n2) Keycloak login:\n- Log in from `https://live.bstein.dev` and confirm MAS -> Keycloak -> Element redirect.\n\n3) Video rooms:\n- Start an Element Call room and confirm audio/video with a second account.\n- Check that guests can read public rooms but cannot start calls.\n\n4) Well-known:\n- `https://live.bstein.dev/.well-known/matrix/client` returns JSON.\n- `https://matrix.live.bstein.dev/.well-known/matrix/client` returns JSON.\n\n5) TURN reachability:\n- Confirm `turn.live.bstein.dev:3478` and `turns:5349` are reachable from WAN."
},
{
"path": "runbooks/kb-authoring.md",
"title": "KB authoring: what to write (and what not to)",
"tags": [
"atlas",
"kb",
"runbooks"
],
"entrypoints": [],
"source_paths": [
"knowledge/runbooks",
"scripts/knowledge_render_atlas.py"
],
"body": "# KB authoring: what to write (and what not to)\n\n## The goal\nGive Atlas assistants enough grounded, Atlas-specific context to answer \u201chow do I\u2026?\u201d questions without guessing.\n\n## What to capture (high value)\n- User workflows: \u201cclick here, set X, expected result\u201d\n- Operator workflows: \u201cedit these files, reconcile this kustomization, verify with these commands\u201d\n- Wiring: \u201cthis host routes to this service; this service depends on Postgres/Vault/etc\u201d\n- Failure modes: exact error messages + the 2\u20135 checks that usually resolve them\n- Permissions: Keycloak groups/roles and what they unlock\n\n## What to avoid (low value / fluff)\n- Generic Kubernetes explanations (link to upstream docs instead)\n- Copy-pasting large manifests (prefer file paths + small snippets)\n- Anything that will drift quickly (render it from GitOps instead)\n- Any secret values (reference Secret/Vault locations by name only)\n\n## Document pattern (recommended)\nEach runbook should answer:\n- \u201cWhat is this?\u201d\n- \u201cWhat do users do?\u201d\n- \u201cWhat do operators change (where in Git)?\u201d\n- \u201cHow do we verify it works?\u201d\n- \u201cWhat breaks and how to debug it?\u201d"
},
{
"path": "runbooks/observability.md",
"title": "Observability: Grafana + VictoriaMetrics (how to query safely)",
"tags": [
"atlas",
"monitoring",
"grafana",
"victoriametrics"
],
"entrypoints": [
"metrics.bstein.dev",
"alerts.bstein.dev"
],
"source_paths": [
"services/monitoring"
],
"body": "# Observability: Grafana + VictoriaMetrics (how to query safely)\n\n## Where it is configured\n- `services/monitoring/helmrelease.yaml` (Grafana + Alertmanager + VM values)\n- `services/monitoring/grafana-dashboard-*.yaml` (dashboards and their PromQL)\n\n## Using metrics as a \u201ctool\u201d for Atlas assistants\nThe safest pattern is: map a small set of intents \u2192 fixed PromQL queries, then summarize results.\n\nExamples (intents)\n- \u201cIs the cluster healthy?\u201d \u2192 node readiness + pod restart rate\n- \u201cWhy is Element Call failing?\u201d \u2192 LiveKit/coturn pod restarts + synapse errors + ingress 5xx\n- \u201cIs Jenkins slow?\u201d \u2192 pod CPU/memory + HTTP latency metrics (if exported)\n\n## Why dashboards are not the KB\nDashboards are great references, but the assistant should query VictoriaMetrics directly for live answers and keep the\nKB focused on wiring, runbooks, and stable conventions."
},
{
"path": "runbooks/template.md",
"title": "<short title>",
"tags": [
"atlas",
"<service>",
"<topic>"
],
"entrypoints": [
"<hostnames if relevant>"
],
"source_paths": [
"services/<svc>",
"clusters/atlas/<...>"
],
"body": "# <Short title>\n\n## What this is\n\n## For users (how to)\n\n## For operators (where configured)\n\n## Troubleshooting (symptoms \u2192 checks)"
}
]

View File

@ -0,0 +1,189 @@
flowchart LR
host_auth_bstein_dev["auth.bstein.dev"]
svc_sso_oauth2_proxy["sso/oauth2-proxy (Service)"]
host_auth_bstein_dev --> svc_sso_oauth2_proxy
wl_sso_oauth2_proxy["sso/oauth2-proxy (Deployment)"]
svc_sso_oauth2_proxy --> wl_sso_oauth2_proxy
host_bstein_dev["bstein.dev"]
svc_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Service)"]
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_frontend
wl_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Deployment)"]
svc_bstein_dev_home_bstein_dev_home_frontend --> wl_bstein_dev_home_bstein_dev_home_frontend
svc_comms_matrix_wellknown["comms/matrix-wellknown (Service)"]
host_bstein_dev --> svc_comms_matrix_wellknown
wl_comms_matrix_wellknown["comms/matrix-wellknown (Deployment)"]
svc_comms_matrix_wellknown --> wl_comms_matrix_wellknown
svc_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Service)"]
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend
wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"]
svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend
host_call_live_bstein_dev["call.live.bstein.dev"]
svc_comms_element_call["comms/element-call (Service)"]
host_call_live_bstein_dev --> svc_comms_element_call
wl_comms_element_call["comms/element-call (Deployment)"]
svc_comms_element_call --> wl_comms_element_call
host_chat_ai_bstein_dev["chat.ai.bstein.dev"]
svc_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Service)"]
host_chat_ai_bstein_dev --> svc_bstein_dev_home_chat_ai_gateway
wl_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Deployment)"]
svc_bstein_dev_home_chat_ai_gateway --> wl_bstein_dev_home_chat_ai_gateway
host_ci_bstein_dev["ci.bstein.dev"]
svc_jenkins_jenkins["jenkins/jenkins (Service)"]
host_ci_bstein_dev --> svc_jenkins_jenkins
wl_jenkins_jenkins["jenkins/jenkins (Deployment)"]
svc_jenkins_jenkins --> wl_jenkins_jenkins
host_cloud_bstein_dev["cloud.bstein.dev"]
svc_nextcloud_nextcloud["nextcloud/nextcloud (Service)"]
host_cloud_bstein_dev --> svc_nextcloud_nextcloud
wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"]
svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud
host_kit_live_bstein_dev["kit.live.bstein.dev"]
svc_comms_livekit_token_service["comms/livekit-token-service (Service)"]
host_kit_live_bstein_dev --> svc_comms_livekit_token_service
wl_comms_livekit_token_service["comms/livekit-token-service (Deployment)"]
svc_comms_livekit_token_service --> wl_comms_livekit_token_service
svc_comms_livekit["comms/livekit (Service)"]
host_kit_live_bstein_dev --> svc_comms_livekit
wl_comms_livekit["comms/livekit (Deployment)"]
svc_comms_livekit --> wl_comms_livekit
host_live_bstein_dev["live.bstein.dev"]
svc_comms_othrys_element_element_web["comms/othrys-element-element-web (Service)"]
host_live_bstein_dev --> svc_comms_othrys_element_element_web
wl_comms_othrys_element_element_web["comms/othrys-element-element-web (Deployment)"]
svc_comms_othrys_element_element_web --> wl_comms_othrys_element_element_web
host_live_bstein_dev --> svc_comms_matrix_wellknown
svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"]
host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
wl_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Deployment)"]
svc_comms_othrys_synapse_matrix_synapse --> wl_comms_othrys_synapse_matrix_synapse
host_longhorn_bstein_dev["longhorn.bstein.dev"]
svc_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Service)"]
host_longhorn_bstein_dev --> svc_longhorn_system_oauth2_proxy_longhorn
wl_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Deployment)"]
svc_longhorn_system_oauth2_proxy_longhorn --> wl_longhorn_system_oauth2_proxy_longhorn
host_mail_bstein_dev["mail.bstein.dev"]
svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"]
host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front
host_matrix_live_bstein_dev["matrix.live.bstein.dev"]
svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown
host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register
wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
host_monero_bstein_dev["monero.bstein.dev"]
svc_crypto_monerod["crypto/monerod (Service)"]
host_monero_bstein_dev --> svc_crypto_monerod
wl_crypto_monerod["crypto/monerod (Deployment)"]
svc_crypto_monerod --> wl_crypto_monerod
host_office_bstein_dev["office.bstein.dev"]
svc_nextcloud_collabora["nextcloud/collabora (Service)"]
host_office_bstein_dev --> svc_nextcloud_collabora
wl_nextcloud_collabora["nextcloud/collabora (Deployment)"]
svc_nextcloud_collabora --> wl_nextcloud_collabora
host_pegasus_bstein_dev["pegasus.bstein.dev"]
svc_jellyfin_pegasus["jellyfin/pegasus (Service)"]
host_pegasus_bstein_dev --> svc_jellyfin_pegasus
wl_jellyfin_pegasus["jellyfin/pegasus (Deployment)"]
svc_jellyfin_pegasus --> wl_jellyfin_pegasus
host_scm_bstein_dev["scm.bstein.dev"]
svc_gitea_gitea["gitea/gitea (Service)"]
host_scm_bstein_dev --> svc_gitea_gitea
wl_gitea_gitea["gitea/gitea (Deployment)"]
svc_gitea_gitea --> wl_gitea_gitea
host_secret_bstein_dev["secret.bstein.dev"]
svc_vault_vault["vault/vault (Service)"]
host_secret_bstein_dev --> svc_vault_vault
wl_vault_vault["vault/vault (StatefulSet)"]
svc_vault_vault --> wl_vault_vault
host_sso_bstein_dev["sso.bstein.dev"]
svc_sso_keycloak["sso/keycloak (Service)"]
host_sso_bstein_dev --> svc_sso_keycloak
wl_sso_keycloak["sso/keycloak (Deployment)"]
svc_sso_keycloak --> wl_sso_keycloak
host_stream_bstein_dev["stream.bstein.dev"]
svc_jellyfin_jellyfin["jellyfin/jellyfin (Service)"]
host_stream_bstein_dev --> svc_jellyfin_jellyfin
wl_jellyfin_jellyfin["jellyfin/jellyfin (Deployment)"]
svc_jellyfin_jellyfin --> wl_jellyfin_jellyfin
host_vault_bstein_dev["vault.bstein.dev"]
svc_vaultwarden_vaultwarden_service["vaultwarden/vaultwarden-service (Service)"]
host_vault_bstein_dev --> svc_vaultwarden_vaultwarden_service
wl_vaultwarden_vaultwarden["vaultwarden/vaultwarden (Deployment)"]
svc_vaultwarden_vaultwarden_service --> wl_vaultwarden_vaultwarden
subgraph bstein_dev_home[bstein-dev-home]
svc_bstein_dev_home_bstein_dev_home_frontend
wl_bstein_dev_home_bstein_dev_home_frontend
svc_bstein_dev_home_bstein_dev_home_backend
wl_bstein_dev_home_bstein_dev_home_backend
svc_bstein_dev_home_chat_ai_gateway
wl_bstein_dev_home_chat_ai_gateway
end
subgraph comms[comms]
svc_comms_matrix_wellknown
wl_comms_matrix_wellknown
svc_comms_element_call
wl_comms_element_call
svc_comms_livekit_token_service
wl_comms_livekit_token_service
svc_comms_livekit
wl_comms_livekit
svc_comms_othrys_element_element_web
wl_comms_othrys_element_element_web
svc_comms_othrys_synapse_matrix_synapse
wl_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service
svc_comms_matrix_guest_register
wl_comms_matrix_guest_register
end
subgraph crypto[crypto]
svc_crypto_monerod
wl_crypto_monerod
end
subgraph gitea[gitea]
svc_gitea_gitea
wl_gitea_gitea
end
subgraph jellyfin[jellyfin]
svc_jellyfin_pegasus
wl_jellyfin_pegasus
svc_jellyfin_jellyfin
wl_jellyfin_jellyfin
end
subgraph jenkins[jenkins]
svc_jenkins_jenkins
wl_jenkins_jenkins
end
subgraph longhorn_system[longhorn-system]
svc_longhorn_system_oauth2_proxy_longhorn
wl_longhorn_system_oauth2_proxy_longhorn
end
subgraph mailu_mailserver[mailu-mailserver]
svc_mailu_mailserver_mailu_front
end
subgraph nextcloud[nextcloud]
svc_nextcloud_nextcloud
wl_nextcloud_nextcloud
svc_nextcloud_collabora
wl_nextcloud_collabora
end
subgraph sso[sso]
svc_sso_oauth2_proxy
wl_sso_oauth2_proxy
svc_sso_keycloak
wl_sso_keycloak
end
subgraph vault[vault]
svc_vault_vault
wl_vault_vault
end
subgraph vaultwarden[vaultwarden]
svc_vaultwarden_vaultwarden_service
wl_vaultwarden_vaultwarden
end

26
knowledge/metis.md Normal file
View File

@ -0,0 +1,26 @@
# Metis (node recovery)
## Node classes (current map)
- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)
- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)
- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks)
- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent)
- amd64 agents: titan-22/24 (Debian 13, k3s agent)
- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, future titan-20/21 (when added), plus any newcomers.
## Longhorn disk UUIDs (critical nodes)
- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4)
- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4)
- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4)
- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4)
## Metis repo (~/Development/metis)
- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`).
- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints).
- `AGENTS.md` in repo is untracked and holds raw notes.
## Next implementation steps
- Add per-class golden image refs and checksums (Harbor or file://) when ready.
- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths.
- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection.
- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited.

View File

@ -0,0 +1,27 @@
---
title: "CI: Gitea → Jenkins pipeline"
tags: ["atlas", "ci", "gitea", "jenkins"]
owners: ["brad"]
entrypoints: ["scm.bstein.dev", "ci.bstein.dev"]
source_paths: ["services/gitea", "services/jenkins", "scripts/jenkins_cred_sync.sh", "scripts/gitea_cred_sync.sh"]
---
# CI: Gitea → Jenkins pipeline
## What this is
Atlas uses Gitea for source control and Jenkins for CI. Authentication is via Keycloak (SSO).
## Where it is configured
- Gitea manifests: `services/gitea/`
- Jenkins manifests: `services/jenkins/`
- Credential sync helpers: `scripts/gitea_cred_sync.sh`, `scripts/jenkins_cred_sync.sh`
## What users do (typical flow)
- Create a repo in Gitea.
- Create/update a Jenkins job/pipeline that can fetch the repo.
- Configure a webhook (or SCM polling) so pushes trigger builds.
## Troubleshooting (common)
- “Webhook not firing”: confirm ingress host, webhook URL, and Jenkins job is reachable.
- “Auth denied cloning”: confirm Keycloak group membership and that Jenkins has a valid token/credential configured.

View File

@ -0,0 +1,30 @@
---
title: Othrys verification checklist
tags:
- comms
- matrix
- element
- livekit
entrypoints:
- https://live.bstein.dev
- https://matrix.live.bstein.dev
---
1) Guest join:
- Open a private window and visit:
`https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join`
- Confirm the guest join flow works and the displayname becomes `<word>-<word>`.
2) Keycloak login:
- Log in from `https://live.bstein.dev` and confirm MAS -> Keycloak -> Element redirect.
3) Video rooms:
- Start an Element Call room and confirm audio/video with a second account.
- Check that guests can read public rooms but cannot start calls.
4) Well-known:
- `https://live.bstein.dev/.well-known/matrix/client` returns JSON.
- `https://matrix.live.bstein.dev/.well-known/matrix/client` returns JSON.
5) TURN reachability:
- Confirm `turn.live.bstein.dev:3478` and `turns:5349` are reachable from WAN.

View File

@ -0,0 +1,34 @@
---
title: "KB authoring: what to write (and what not to)"
tags: ["atlas", "kb", "runbooks"]
owners: ["brad"]
entrypoints: []
source_paths: ["knowledge/runbooks", "scripts/knowledge_render_atlas.py"]
---
# KB authoring: what to write (and what not to)
## The goal
Give Atlas assistants enough grounded, Atlas-specific context to answer “how do I…?” questions without guessing.
## What to capture (high value)
- User workflows: “click here, set X, expected result”
- Operator workflows: “edit these files, reconcile this kustomization, verify with these commands”
- Wiring: “this host routes to this service; this service depends on Postgres/Vault/etc”
- Failure modes: exact error messages + the 2–5 checks that usually resolve them
- Permissions: Keycloak groups/roles and what they unlock
## What to avoid (low value / fluff)
- Generic Kubernetes explanations (link to upstream docs instead)
- Copy-pasting large manifests (prefer file paths + small snippets)
- Anything that will drift quickly (render it from GitOps instead)
- Any secret values (reference Secret/Vault locations by name only)
## Document pattern (recommended)
Each runbook should answer:
- “What is this?”
- “What do users do?”
- “What do operators change (where in Git)?”
- “How do we verify it works?”
- “What breaks and how to debug it?”

View File

@ -0,0 +1,26 @@
---
title: "Observability: Grafana + VictoriaMetrics (how to query safely)"
tags: ["atlas", "monitoring", "grafana", "victoriametrics"]
owners: ["brad"]
entrypoints: ["metrics.bstein.dev", "alerts.bstein.dev"]
source_paths: ["services/monitoring"]
---
# Observability: Grafana + VictoriaMetrics (how to query safely)
## Where it is configured
- `services/monitoring/helmrelease.yaml` (Grafana + Alertmanager + VM values)
- `services/monitoring/grafana-dashboard-*.yaml` (dashboards and their PromQL)
## Using metrics as a “tool” for Atlas assistants
The safest pattern is: map a small set of intents → fixed PromQL queries, then summarize results.
Examples (intents)
- “Is the cluster healthy?” → node readiness + pod restart rate
- “Why is Element Call failing?” → LiveKit/coturn pod restarts + synapse errors + ingress 5xx
- “Is Jenkins slow?” → pod CPU/memory + HTTP latency metrics (if exported)
## Why dashboards are not the KB
Dashboards are great references, but the assistant should query VictoriaMetrics directly for live answers and keep the
KB focused on wiring, runbooks, and stable conventions.

View File

@ -0,0 +1,18 @@
---
title: "<short title>"
tags: ["atlas", "<service>", "<topic>"]
owners: ["brad"]
entrypoints: ["<hostnames if relevant>"]
source_paths: ["services/<svc>", "clusters/atlas/<...>"]
---
# <Short title>
## What this is
## For users (how to)
## For operators (where configured)
## Troubleshooting (symptoms → checks)

View File

@ -0,0 +1,73 @@
# Metis (node recovery)
## Node classes (current map)
- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)
- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)
- rpi4 Armbian longhorn: titan-13/15/17/19 (Armbian 6.6.x, k3s agent, longhorn disks)
- rpi4 Armbian standard: titan-12/14/18 (Armbian 6.6.x, k3s agent)
- amd64 agents: titan-22/24 (Debian 13, k3s agent)
- External/non-cluster: tethys, titan-db, titan-jh, oceanus/titan-23, plus any newcomers.
### Jetson nodes (titan-20/21)
- Ubuntu 20.04.6 (Focal), kernel 5.10.104-tegra, CRI containerd 2.0.5-k3s2, arch arm64.
- Storage: NVMe 232G at / (ext4); onboard mmc partitions present but root on NVMe; 1.9T sda present (unused).
- k3s agent with drop-in 99-nofile.conf.
## Longhorn disk UUIDs (critical nodes)
- titan-13: /mnt/astreae UUID=6031fa8b-f28c-45c3-b7bc-6133300e07c6 (ext4); /mnt/asteria UUID=cbd4989d-62b5-4741-8b2a-28fdae259cae (ext4)
- titan-15: /mnt/astreae UUID=f3362f14-5822-449f-944b-ac570b5cd615 (ext4); /mnt/asteria UUID=9c5316e6-f847-4884-b502-11f2d0d15d6f (ext4)
- titan-17: /mnt/astreae UUID=1fecdade-08b0-49cb-9ae3-be6c188b0a96 (ext4); /mnt/asteria UUID=2fe9f613-d372-47ca-b84f-82084e4edda0 (ext4)
- titan-19: /mnt/astreae UUID=4890abb9-dda2-4f4f-9c0f-081ee82849cf (ext4); /mnt/asteria UUID=2b4ea28d-b0e6-4fa3-841b-cd7067ae9153 (ext4)
## Metis repo (~/Development/metis)
- CLI skeleton in Go (`cmd/metis`), inventory loader (`pkg/inventory`), plan builder (`pkg/plan`).
- `inventory.example.yaml` shows expected schema (classes + per-node overlay, Longhorn disks, labels, taints).
- `AGENTS.md` in repo is untracked and holds raw notes.
## Next implementation steps
- Add per-class golden image refs and checksums (Harbor or file://) when ready.
- Implement burn execution: download with checksum, write via dd/etcher-equivalent, mount boot/root to inject hostname/IP/k3s tokens/labels/taints, journald/GC drop-ins, and Longhorn fstab entries. Add Windows writer (diskpart + wmic) and Linux writer (dd + sgdisk) paths.
- Add Keycloak/SSH bootstrap: ensure ssh user, authorized keys, and k3s token/URL injection for agents; control-plane restore path with etcd snapshot selection.
- Add per-host inventory entries for tethys, titan-db, titan-jh, oceanus/titan-23, future 20/21 once audited.
## Node OS/Kernel/CRI snapshot (Jan 2026)
- titan-04: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-05: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-06: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-07: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-08: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-09: Ubuntu 24.04.3 LTS, kernel 6.8.0-1031-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-0a: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-0b: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-0c: Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-10: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-11: Ubuntu 24.04.3 LTS, kernel 6.8.0-1039-raspi, CRI containerd://2.0.5-k3s2, arch arm64
- titan-12: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
- titan-13: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
- titan-14: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
- titan-15: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
- titan-17: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
- titan-18: Armbian 24.11.1 noble, kernel 6.6.60-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
- titan-19: Armbian 25.2.1 noble, kernel 6.6.63-current-bcm2711, CRI containerd://1.7.23-k3s2, arch arm64
- titan-20: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64
- titan-21: Ubuntu 20.04.6 LTS, kernel 5.10.104-tegra, CRI containerd://2.0.5-k3s2, arch arm64
- titan-22: Debian 13 (trixie), kernel 6.12.41+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64
- titan-24: Debian 13 (trixie), kernel 6.12.57+deb13-amd64, CRI containerd://2.0.5-k3s2, arch amd64
### External hosts
- titan-db: Ubuntu 24.10, kernel 6.11.0-1015-raspi, root on /dev/sda2 ext4 (465G), boot vfat /dev/sda1; PostgreSQL service enabled.
- titan-jh: Arch Linux ARM (rolling), kernel 6.18.4-2-rpi, NVMe root ext4 238G (/), boot vfat 512M; ~495 packages installed (pacman -Q).
- titan-23/oceanus: TODO audit (future).
### Control plane Pis (titan-0a/0b/0c)
- Ubuntu 24.04.1 LTS, kernel 6.8.0-1038-raspi, containerd 2.0.5-k3s2.
- Storage: 477G SSD root (/dev/sda2 ext4), /boot/firmware vfat (/dev/sda1). fstab uses LABEL=writable and LABEL=system-boot.
- k3s server (control-plane taint expected); etcd snapshots not yet cataloged (TODO).
## k3s versions
- rpi5 workers/control-plane: k3s v1.33.3+k3s1 (crictl v1.31.0-k3s2)
- rpi4 nodes: k3s v1.31.5+k3s1 (crictl v1.31.0-k3s2)
- Jetson titan-20/21: k3s v1.33.3+k3s1 (per node info), crictl v1.31.0-k3s2

5
scripts/comms_sync_kb.sh Executable file
View File

@ -0,0 +1,5 @@
#!/usr/bin/env bash
# Sync the Atlas knowledge base: render it once into the default output
# location and once into the comms service bundle so both copies match.
set -euo pipefail

renderer=scripts/knowledge_render_atlas.py

python "$renderer" --write
python "$renderer" --write --out services/comms/knowledge

View File

@ -9,6 +9,7 @@ Usage:
import argparse import argparse
import json import json
import textwrap import textwrap
import urllib.parse
from pathlib import Path from pathlib import Path
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@ -45,12 +46,14 @@ PERCENT_THRESHOLDS = {
], ],
} }
NAMESPACE_CPU_WINDOW = "1m"
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Cluster metadata # Cluster metadata
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
CONTROL_PLANE_NODES = ["titan-0a", "titan-0b", "titan-0c"] CONTROL_PLANE_NODES = ["titan-0a", "titan-0b", "titan-0c"]
CONTROL_DEPENDENCIES = ["titan-db"] CONTROL_DEPENDENCIES = ["titan-db", "titan-jh"]
CONTROL_ALL = CONTROL_PLANE_NODES + CONTROL_DEPENDENCIES CONTROL_ALL = CONTROL_PLANE_NODES + CONTROL_DEPENDENCIES
WORKER_NODES = [ WORKER_NODES = [
"titan-04", "titan-04",
@ -61,11 +64,12 @@ WORKER_NODES = [
"titan-09", "titan-09",
"titan-10", "titan-10",
"titan-11", "titan-11",
"titan-20",
"titan-21",
"titan-12", "titan-12",
"titan-13", "titan-13",
"titan-14", "titan-14",
"titan-15", "titan-15",
"titan-16",
"titan-17", "titan-17",
"titan-18", "titan-18",
"titan-19", "titan-19",
@ -80,7 +84,22 @@ CONTROL_TOTAL = len(CONTROL_PLANE_NODES)
WORKER_TOTAL = len(WORKER_NODES) WORKER_TOTAL = len(WORKER_NODES)
CONTROL_SUFFIX = f"/{CONTROL_TOTAL}" CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
WORKER_SUFFIX = f"/{WORKER_TOTAL}" WORKER_SUFFIX = f"/{WORKER_TOTAL}"
CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system" # Namespaces considered infrastructure (excluded from workload counts)
INFRA_NAMESPACES = [
"kube-system",
"longhorn-system",
"metallb-system",
"monitoring",
"logging",
"cert-manager",
"flux-system",
"traefik",
"maintenance",
"postgres",
]
INFRA_REGEX = f"^({'|'.join(INFRA_NAMESPACES)})$"
# Namespaces allowed on control plane without counting as workloads
CP_ALLOWED_NS = INFRA_REGEX
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]" LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4] GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4]
CONTROL_WORKLOADS_EXPR = ( CONTROL_WORKLOADS_EXPR = (
@ -170,22 +189,48 @@ def node_io_expr(scope=""):
return scoped_node_expr(base, scope) return scoped_node_expr(base, scope)
def namespace_selector(scope_var):
    """Base PromQL label selector for container-level metrics.

    Filters out empty namespace/pod/container labels and the pause
    container ("POD"), then appends the caller-supplied scope matcher.
    """
    base = 'namespace!="",pod!="",container!="",container!="POD"'
    return base + "," + scope_var
def namespace_gpu_selector(scope_var):
    """PromQL label selector for GPU (DCGM) metrics plus the scope matcher.

    DCGM series carry no container label, so only namespace/pod are filtered.
    """
    return ",".join(['namespace!=""', 'pod!=""', scope_var])
def namespace_cpu_raw(scope_var):
    """Per-namespace CPU usage (cores) rated over NAMESPACE_CPU_WINDOW."""
    selector = namespace_selector(scope_var)
    return (
        f"sum(rate(container_cpu_usage_seconds_total{{{selector}}}"
        f"[{NAMESPACE_CPU_WINDOW}])) by (namespace)"
    )
def namespace_ram_raw(scope_var):
    """Per-namespace working-set memory (bytes), grouped by namespace."""
    selector = namespace_selector(scope_var)
    return "sum(container_memory_working_set_bytes{%s}) by (namespace)" % selector
def namespace_gpu_usage_instant(scope_var):
    """Instantaneous per-namespace GPU utilization from DCGM exporter series."""
    selector = namespace_gpu_selector(scope_var)
    return "sum(DCGM_FI_DEV_GPU_UTIL{" + selector + "}) by (namespace)"
def namespace_share_expr(resource_expr): def namespace_share_expr(resource_expr):
selected = f"( {resource_expr} ) and on(namespace) ( {NAMESPACE_TOP_FILTER} )" total = f"clamp_min(sum( {resource_expr} ), 1)"
total = f"clamp_min(sum( {selected} ), 1)" return f"100 * ( {resource_expr} ) / {total}"
return f"100 * ( {selected} ) / {total}"
def namespace_cpu_share_expr(): def namespace_cpu_share_expr(scope_var):
return namespace_share_expr(NAMESPACE_CPU_RAW) return namespace_share_expr(namespace_cpu_raw(scope_var))
def namespace_ram_share_expr(): def namespace_ram_share_expr(scope_var):
return namespace_share_expr(NAMESPACE_RAM_RAW) return namespace_share_expr(namespace_ram_raw(scope_var))
def namespace_gpu_share_expr(): def namespace_gpu_share_expr(scope_var):
return namespace_share_expr(NAMESPACE_GPU_RAW) usage = namespace_gpu_usage_instant(scope_var)
total = f"(sum({usage}) or on() vector(0))"
share = f"100 * ({usage}) / clamp_min({total}, 1)"
idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)"
return f"({share}) or ({idle})"
PROBLEM_PODS_EXPR = ( PROBLEM_PODS_EXPR = (
@ -270,46 +315,12 @@ STUCK_TABLE_EXPR = (
")" ")"
) )
NAMESPACE_CPU_RAW = ( NAMESPACE_SCOPE_WORKLOAD = f'namespace!~"{INFRA_REGEX}"'
'sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)' NAMESPACE_SCOPE_ALL = 'namespace=~".*"'
) NAMESPACE_SCOPE_INFRA = f'namespace=~"{INFRA_REGEX}"'
NAMESPACE_RAM_RAW = ( NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"]
'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)'
)
GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"] GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
GPU_NODE_REGEX = "|".join(GPU_NODES) GPU_NODE_REGEX = "|".join(GPU_NODES)
NAMESPACE_GPU_ALLOC = (
'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}'
' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
)
NAMESPACE_GPU_USAGE_SHARE = (
'sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))'
)
NAMESPACE_GPU_USAGE_INSTANT = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)'
NAMESPACE_GPU_RAW = (
"("
+ NAMESPACE_GPU_USAGE_SHARE
+ ") or on(namespace) ("
+ NAMESPACE_CPU_RAW
+ " * 0)"
)
NAMESPACE_GPU_WEIGHT = (
"("
+ NAMESPACE_GPU_ALLOC
+ ") or on(namespace) ("
+ NAMESPACE_CPU_RAW
+ " * 0)"
)
NAMESPACE_ACTIVITY_SCORE = (
"( "
+ NAMESPACE_CPU_RAW
+ " ) + ("
+ NAMESPACE_RAM_RAW
+ " / 1e9) + ("
+ NAMESPACE_GPU_WEIGHT
+ " * 100)"
)
NAMESPACE_TOP_FILTER = "(topk(10, " + NAMESPACE_ACTIVITY_SCORE + ") >= bool 0)"
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
TRAEFIK_NET_INGRESS = ( TRAEFIK_NET_INGRESS = (
'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))' 'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
@ -560,9 +571,9 @@ def table_panel(
return panel return panel
def pie_panel(panel_id, title, expr, grid): def pie_panel(panel_id, title, expr, grid, *, links=None, description=None):
"""Return a pie chart panel with readable namespace labels.""" """Return a pie chart panel with readable namespace labels."""
return { panel = {
"id": panel_id, "id": panel_id,
"type": "piechart", "type": "piechart",
"title": title, "title": title,
@ -586,6 +597,71 @@ def pie_panel(panel_id, title, expr, grid):
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
}, },
} }
if links:
panel["links"] = links
if description:
panel["description"] = description
return panel
def namespace_scope_variable(var_name, label):
    """Build a hidden Grafana "custom" template variable for namespace scoping.

    Parameters:
        var_name: template variable name (referenced as ``$<var_name>`` in
            panel expressions).
        label: human-readable label for the variable in the dashboard UI.

    Returns:
        A Grafana templating-list entry offering three namespace scopes
        (workload-only, all, infrastructure-only), defaulting to workload-only.
    """
    options = [
        {
            "text": "workload namespaces only",
            "value": NAMESPACE_SCOPE_WORKLOAD,
            "selected": True,
        },
        {"text": "all namespaces", "value": NAMESPACE_SCOPE_ALL, "selected": False},
        {
            "text": "infrastructure namespaces only",
            "value": NAMESPACE_SCOPE_INFRA,
            "selected": False,
        },
    ]
    # Grafana "custom" variables encode their options as "text : value" pairs
    # joined by commas. Derive the query from `options` instead of repeating
    # each text/value literal, so the two can never drift apart.
    query = ",".join(f"{opt['text']} : {opt['value']}" for opt in options)
    return {
        "name": var_name,
        "label": label,
        "type": "custom",
        "query": query,
        "current": {"text": options[0]["text"], "value": options[0]["value"], "selected": True},
        "options": options,
        "hide": 2,  # hidden from the variable bar; value switched via panel links
        "multi": False,
        "includeAll": False,
        "refresh": 1,
        "sort": 0,
        "skipUrlSync": False,
    }
def namespace_scope_links(var_name):
    """Panel links that switch ``var_name`` between the three namespace scopes.

    Each link sets only the named scope variable; every other variable in
    NAMESPACE_SCOPE_VARS is kept bound to its current dashboard value via
    the ``${var}`` placeholder.
    """

    def with_value(value):
        # URL-encode the chosen scope expression for this variable only.
        encoded = urllib.parse.quote(value, safe="")
        params = [
            f"var-{other}={encoded}" if other == var_name else f"var-{other}=${{{other}}}"
            for other in NAMESPACE_SCOPE_VARS
        ]
        return "?" + "&".join(params)

    return [
        {"title": "Workload namespaces only", "url": with_value(NAMESPACE_SCOPE_WORKLOAD), "targetBlank": False},
        {"title": "All namespaces", "url": with_value(NAMESPACE_SCOPE_ALL), "targetBlank": False},
        {
            "title": "Infrastructure namespaces only",
            "url": with_value(NAMESPACE_SCOPE_INFRA),
            "targetBlank": False,
        },
    ]
def bargauge_panel( def bargauge_panel(
@ -857,6 +933,115 @@ def build_overview():
) )
) )
mail_bounce_rate_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 5},
{"color": "orange", "value": 8},
{"color": "red", "value": 10},
],
}
mail_limit_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 70},
{"color": "orange", "value": 85},
{"color": "red", "value": 95},
],
}
mail_success_thresholds = {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": 90},
{"color": "yellow", "value": 95},
{"color": "green", "value": 98},
],
}
panels.append(
stat_panel(
30,
"Mail Sent (1d)",
'max(postmark_outbound_sent{window="1d"})',
{"h": 2, "w": 6, "x": 0, "y": 8},
unit="none",
links=link_to("atlas-mail"),
)
)
panels.append(
{
"id": 31,
"type": "stat",
"title": "Mail Bounces (1d)",
"datasource": PROM_DS,
"gridPos": {"h": 2, "w": 6, "x": 12, "y": 8},
"targets": [
{
"expr": 'max(postmark_outbound_bounce_rate{window="1d"})',
"refId": "A",
"legendFormat": "Rate",
},
{
"expr": 'max(postmark_outbound_bounced{window="1d"})',
"refId": "B",
"legendFormat": "Count",
},
],
"fieldConfig": {
"defaults": {
"color": {"mode": "thresholds"},
"custom": {"displayMode": "auto"},
"thresholds": mail_bounce_rate_thresholds,
"unit": "none",
},
"overrides": [
{
"matcher": {"id": "byName", "options": "Rate"},
"properties": [{"id": "unit", "value": "percent"}],
},
{
"matcher": {"id": "byName", "options": "Count"},
"properties": [{"id": "unit", "value": "none"}],
},
],
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
"textMode": "name_and_value",
},
"links": link_to("atlas-mail"),
}
)
panels.append(
stat_panel(
32,
"Mail Success Rate (1d)",
'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)',
{"h": 2, "w": 6, "x": 6, "y": 8},
unit="percent",
thresholds=mail_success_thresholds,
decimals=1,
links=link_to("atlas-mail"),
)
)
panels.append(
stat_panel(
33,
"Mail Limit Used (30d)",
"max(postmark_sending_limit_used_percent)",
{"h": 2, "w": 6, "x": 18, "y": 8},
unit="percent",
thresholds=mail_limit_thresholds,
decimals=1,
links=link_to("atlas-mail"),
)
)
storage_panels = [ storage_panels = [
(23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"), (23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"),
(24, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), "percent"), (24, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), "percent"),
@ -876,28 +1061,38 @@ def build_overview():
) )
) )
cpu_scope = "$namespace_scope_cpu"
gpu_scope = "$namespace_scope_gpu"
ram_scope = "$namespace_scope_ram"
panels.append( panels.append(
pie_panel( pie_panel(
11, 11,
"Namespace CPU Share", "Namespace CPU Share",
namespace_cpu_share_expr(), namespace_cpu_share_expr(cpu_scope),
{"h": 9, "w": 8, "x": 0, "y": 16}, {"h": 9, "w": 8, "x": 0, "y": 16},
links=namespace_scope_links("namespace_scope_cpu"),
description="Values are normalized within the selected scope; use panel links to switch scope.",
) )
) )
panels.append( panels.append(
pie_panel( pie_panel(
12, 12,
"Namespace GPU Share", "Namespace GPU Share",
namespace_gpu_share_expr(), namespace_gpu_share_expr(gpu_scope),
{"h": 9, "w": 8, "x": 8, "y": 16}, {"h": 9, "w": 8, "x": 8, "y": 16},
links=namespace_scope_links("namespace_scope_gpu"),
description="Values are normalized within the selected scope; use panel links to switch scope.",
) )
) )
panels.append( panels.append(
pie_panel( pie_panel(
13, 13,
"Namespace RAM Share", "Namespace RAM Share",
namespace_ram_share_expr(), namespace_ram_share_expr(ram_scope),
{"h": 9, "w": 8, "x": 16, "y": 16}, {"h": 9, "w": 8, "x": 16, "y": 16},
links=namespace_scope_links("namespace_scope_ram"),
description="Values are normalized within the selected scope; use panel links to switch scope.",
) )
) )
@ -1052,7 +1247,6 @@ def build_overview():
links=link_to("atlas-storage"), links=link_to("atlas-storage"),
) )
) )
return { return {
"uid": "atlas-overview", "uid": "atlas-overview",
"title": "Atlas Overview", "title": "Atlas Overview",
@ -1063,7 +1257,13 @@ def build_overview():
"schemaVersion": 39, "schemaVersion": 39,
"style": "dark", "style": "dark",
"tags": ["atlas", "overview"], "tags": ["atlas", "overview"],
"templating": {"list": []}, "templating": {
"list": [
namespace_scope_variable("namespace_scope_cpu", "CPU namespace filter"),
namespace_scope_variable("namespace_scope_gpu", "GPU namespace filter"),
namespace_scope_variable("namespace_scope_ram", "RAM namespace filter"),
]
},
"time": {"from": "now-1h", "to": "now"}, "time": {"from": "now-1h", "to": "now"},
"refresh": "1m", "refresh": "1m",
"links": [], "links": [],
@ -1513,6 +1713,33 @@ def build_storage_dashboard():
time_from="90d", time_from="90d",
) )
) )
panels.append(
stat_panel(
30,
"Maintenance Sweepers Ready",
'kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"} * 100',
{"h": 4, "w": 12, "x": 0, "y": 44},
unit="percent",
thresholds=PERCENT_THRESHOLDS,
)
)
panels.append(
stat_panel(
31,
"Maintenance Cron Freshness (s)",
'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob=~"image-sweeper|grafana-smtp-sync"})',
{"h": 4, "w": 12, "x": 12, "y": 44},
unit="s",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 3600},
{"color": "red", "value": 10800},
],
},
)
)
return { return {
"uid": "atlas-storage", "uid": "atlas-storage",
"title": "Atlas Storage", "title": "Atlas Storage",
@ -1702,21 +1929,231 @@ def build_network_dashboard():
} }
def build_mail_dashboard():
    """Assemble the private Atlas Mail dashboard (Postmark delivery metrics)."""
    # Bounce percentage: colours tighten as the rate climbs.
    bounce_rate_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "green", "value": None},
            {"color": "yellow", "value": 5},
            {"color": "orange", "value": 8},
            {"color": "red", "value": 10},
        ],
    }
    # Consumption of the 30-day sending quota.
    limit_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "green", "value": None},
            {"color": "yellow", "value": 70},
            {"color": "orange", "value": 85},
            {"color": "red", "value": 95},
        ],
    }
    # Delivery success: stays red until the rate clears 90/95/98 percent.
    success_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "red", "value": None},
            {"color": "orange", "value": 90},
            {"color": "yellow", "value": 95},
            {"color": "green", "value": 98},
        ],
    }
    # Leading single-stat tiles: (id, title, PromQL, gridPos, extra kwargs).
    leading_stats = [
        (
            1,
            "Sent (1d)",
            'max(postmark_outbound_sent{window="1d"})',
            {"h": 4, "w": 6, "x": 0, "y": 0},
            {"decimals": 0},
        ),
        (
            2,
            "Sent (7d)",
            'max(postmark_outbound_sent{window="7d"})',
            {"h": 4, "w": 6, "x": 6, "y": 0},
            {"decimals": 0},
        ),
    ]
    panels = [
        stat_panel(pid, title, expr, pos, **extra)
        for pid, title, expr, pos, extra in leading_stats
    ]
    # Bounce tile is hand-built: it mixes a percent target (rate) and a plain
    # count target, so per-series unit overrides are required.
    panels.append(
        {
            "id": 3,
            "type": "stat",
            "title": "Mail Bounces (1d)",
            "datasource": PROM_DS,
            "gridPos": {"h": 4, "w": 6, "x": 12, "y": 0},
            "targets": [
                {
                    "expr": 'max(postmark_outbound_bounce_rate{window="1d"})',
                    "refId": "A",
                    "legendFormat": "Rate",
                },
                {
                    "expr": 'max(postmark_outbound_bounced{window="1d"})',
                    "refId": "B",
                    "legendFormat": "Count",
                },
            ],
            "fieldConfig": {
                "defaults": {
                    "color": {"mode": "thresholds"},
                    "custom": {"displayMode": "auto"},
                    "thresholds": bounce_rate_thresholds,
                    "unit": "none",
                },
                "overrides": [
                    {
                        "matcher": {"id": "byName", "options": "Rate"},
                        "properties": [{"id": "unit", "value": "percent"}],
                    },
                    {
                        "matcher": {"id": "byName", "options": "Count"},
                        "properties": [{"id": "unit", "value": "none"}],
                    },
                ],
            },
            "options": {
                "colorMode": "value",
                "graphMode": "area",
                "justifyMode": "center",
                "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
                "textMode": "name_and_value",
            },
        }
    )
    # Remaining single-stat tiles.
    trailing_stats = [
        (
            4,
            "Success Rate (1d)",
            'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)',
            {"h": 4, "w": 6, "x": 18, "y": 0},
            {"unit": "percent", "thresholds": success_thresholds, "decimals": 1},
        ),
        (
            5,
            "Limit Used (30d)",
            "max(postmark_sending_limit_used_percent)",
            {"h": 4, "w": 6, "x": 0, "y": 4},
            {"thresholds": limit_thresholds, "unit": "percent", "decimals": 1},
        ),
        (
            6,
            "Send Limit (30d)",
            "max(postmark_sending_limit)",
            {"h": 4, "w": 6, "x": 6, "y": 4},
            {"decimals": 0},
        ),
        (
            7,
            "Last Success",
            "max(postmark_last_success_timestamp_seconds)",
            {"h": 4, "w": 6, "x": 12, "y": 4},
            {"unit": "dateTimeAsIso", "decimals": 0},
        ),
        (
            8,
            "Exporter Errors",
            "sum(postmark_request_errors_total)",
            {"h": 4, "w": 6, "x": 18, "y": 4},
            {"decimals": 0},
        ),
    ]
    panels.extend(
        stat_panel(pid, title, expr, pos, **extra)
        for pid, title, expr, pos, extra in trailing_stats
    )
    # Trend rows comparing the 1d and 7d windows side by side.
    trend_rows = [
        (
            13,
            "Bounce Rate (1d vs 7d)",
            "max by (window) (postmark_outbound_bounce_rate)",
            {"h": 8, "w": 12, "x": 0, "y": 12},
            "percent",
        ),
        (
            14,
            "Bounced (1d vs 7d)",
            "max by (window) (postmark_outbound_bounced)",
            {"h": 8, "w": 12, "x": 12, "y": 12},
            "none",
        ),
        (
            15,
            "Sent (1d vs 7d)",
            "max by (window) (postmark_outbound_sent)",
            {"h": 8, "w": 12, "x": 0, "y": 20},
            "none",
        ),
    ]
    for pid, title, expr, pos, unit in trend_rows:
        panels.append(
            timeseries_panel(
                pid,
                title,
                expr,
                pos,
                unit=unit,
                legend="{{window}}",
                legend_display="table",
                legend_placement="right",
            )
        )
    panels.append(
        timeseries_panel(
            16,
            "Exporter Errors",
            "sum(postmark_request_errors_total)",
            {"h": 8, "w": 12, "x": 12, "y": 20},
            unit="none",
        )
    )
    return {
        "uid": "atlas-mail",
        "title": "Atlas Mail",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-30d", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "mail"],
    }
def build_gpu_dashboard(): def build_gpu_dashboard():
panels = [] panels = []
gpu_scope = "$namespace_scope_gpu"
panels.append( panels.append(
pie_panel( pie_panel(
1, 1,
"Namespace GPU Share", "Namespace GPU Share",
namespace_gpu_share_expr(), namespace_gpu_share_expr(gpu_scope),
{"h": 8, "w": 12, "x": 0, "y": 0}, {"h": 8, "w": 12, "x": 0, "y": 0},
links=namespace_scope_links("namespace_scope_gpu"),
description="Values are normalized within the selected scope; use panel links to switch scope.",
) )
) )
panels.append( panels.append(
timeseries_panel( timeseries_panel(
2, 2,
"GPU Util by Namespace", "GPU Util by Namespace",
NAMESPACE_GPU_USAGE_INSTANT, namespace_gpu_usage_instant(gpu_scope),
{"h": 8, "w": 12, "x": 12, "y": 0}, {"h": 8, "w": 12, "x": 12, "y": 0},
unit="percent", unit="percent",
legend="{{namespace}}", legend="{{namespace}}",
@ -1757,6 +2194,13 @@ def build_gpu_dashboard():
"schemaVersion": 39, "schemaVersion": 39,
"style": "dark", "style": "dark",
"tags": ["atlas", "gpu"], "tags": ["atlas", "gpu"],
"templating": {
"list": [
namespace_scope_variable("namespace_scope_cpu", "CPU namespace filter"),
namespace_scope_variable("namespace_scope_gpu", "GPU namespace filter"),
namespace_scope_variable("namespace_scope_ram", "RAM namespace filter"),
]
},
} }
@ -1781,6 +2225,10 @@ DASHBOARDS = {
"builder": build_network_dashboard, "builder": build_network_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-network.yaml", "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-network.yaml",
}, },
"atlas-mail": {
"builder": build_mail_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-mail.yaml",
},
"atlas-gpu": { "atlas-gpu": {
"builder": build_gpu_dashboard, "builder": build_gpu_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gpu.yaml", "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gpu.yaml",

445
scripts/dashboards_render_logs.py Executable file
View File

@ -0,0 +1,445 @@
#!/usr/bin/env python3
"""Generate OpenSearch Dashboards saved objects and render them into ConfigMaps.
Usage:
scripts/dashboards_render_logs.py --build # rebuild NDJSON + ConfigMap
scripts/dashboards_render_logs.py # re-render ConfigMap from NDJSON
"""
from __future__ import annotations
import argparse
import json
import textwrap
from dataclasses import dataclass
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]
DASHBOARD_DIR = ROOT / "services" / "logging" / "dashboards"
NDJSON_PATH = DASHBOARD_DIR / "logs.ndjson"
CONFIG_PATH = ROOT / "services" / "logging" / "opensearch-dashboards-objects.yaml"
CONFIG_TEMPLATE = textwrap.dedent(
"""# {relative_path}
# Generated by scripts/dashboards_render_logs.py --build
apiVersion: v1
kind: ConfigMap
metadata:
name: opensearch-dashboards-objects
namespace: logging
data:
objects.ndjson: |
{payload}
"""
)
# Saved-object version stamped on every panel (OpenSearch Dashboards' Kibana-7.10 lineage).
DASHBOARD_VERSION = "7.10.0"
# Dashboard grid width in columns; full-width panels span all of them.
GRID_COLUMNS = 48
# Panel heights (grid rows) by panel role.
H_CHART = 10
H_ERRORS = 8
H_TABLE = 16
H_SEARCH = 18
# Terms tables: number of buckets requested and rows shown per page.
TABLE_SIZE = 15
TABLE_PER_PAGE = 15
# Wildcard substrings treated as "error-ish" when building error queries.
ERROR_TERMS = ("*error*", "*exception*", "*fail*")
@dataclass(frozen=True)
class AppSpec:
    """Declarative spec for one app's dashboard set of saved objects."""

    # URL-safe identifier; becomes the saved-object id prefix ("logs-<slug>").
    slug: str
    # Human-readable title used in visualization and dashboard names.
    title: str
    # KQL filter selecting this app's log documents.
    query: str
    # Index-pattern saved-object id the searches/visualizations reference.
    index_id: str = "kube-logs"
    # "kube" (container logs) or "journald" (host logs) — selects column set.
    kind: str = "kube"
def error_query(base: str | None = None) -> str:
    """Return a KQL clause matching error-ish lines, optionally ANDed with *base*."""
    clause = " or ".join(
        f'(log : "{term}" or message : "{term}")' for term in ERROR_TERMS
    )
    return f"({base}) and ({clause})" if base else f"({clause})"
def json_line(obj: dict) -> str:
    """Serialize *obj* as compact single-line JSON (NDJSON friendly)."""
    compact_separators = (",", ":")
    return json.dumps(obj, separators=compact_separators)
def search_source(query: str) -> dict:
    """Build the searchSourceJSON payload shared by every saved object."""
    return dict(
        query={"language": "kuery", "query": query},
        filter=[],
        indexRefName="kibanaSavedObjectMeta.searchSourceJSON.index",
    )
def index_pattern(object_id: str, title: str, time_field: str = "@timestamp") -> dict:
    """Index-pattern saved object with a time field for date histograms."""
    attributes = {"title": title, "timeFieldName": time_field}
    return {"type": "index-pattern", "id": object_id, "attributes": attributes}
def histogram_vis(object_id: str, title: str, query: str, index_id: str) -> dict:
    """Date-histogram visualization (doc count over @timestamp) for *query*."""
    aggs = [
        {"id": "1", "enabled": True, "type": "count", "schema": "metric"},
        {
            "id": "2",
            "enabled": True,
            "type": "date_histogram",
            "schema": "segment",
            "params": {"field": "@timestamp", "interval": "auto", "min_doc_count": 1},
        },
    ]
    vis_state = {
        "title": title,
        "type": "histogram",
        "aggs": aggs,
        "params": {"addTooltip": True, "addLegend": False, "scale": "linear", "interpolate": "linear"},
    }
    compact = (",", ":")
    reference = {
        "name": "kibanaSavedObjectMeta.searchSourceJSON.index",
        "type": "index-pattern",
        "id": index_id,
    }
    return {
        "type": "visualization",
        "id": object_id,
        "attributes": {
            "title": title,
            "visState": json.dumps(vis_state, separators=compact),
            "uiStateJSON": "{}",
            "description": "",
            "version": 1,
            "kibanaSavedObjectMeta": {
                "searchSourceJSON": json.dumps(search_source(query), separators=compact)
            },
        },
        "references": [reference],
    }
def table_vis(object_id: str, title: str, field: str, query: str, index_id: str) -> dict:
    """Terms-table visualization: doc counts bucketed by *field*, sorted desc."""
    aggs = [
        {"id": "1", "enabled": True, "type": "count", "schema": "metric"},
        {
            "id": "2",
            "enabled": True,
            "type": "terms",
            "schema": "bucket",
            "params": {"field": field, "size": TABLE_SIZE, "order": "desc", "orderBy": "1"},
        },
    ]
    vis_state = {
        "title": title,
        "type": "table",
        "aggs": aggs,
        "params": {
            "perPage": TABLE_PER_PAGE,
            "showPartialRows": False,
            "showMetricsAtAllLevels": False,
            "sort": {"columnIndex": 1, "direction": "desc"},
        },
    }
    compact = (",", ":")
    reference = {
        "name": "kibanaSavedObjectMeta.searchSourceJSON.index",
        "type": "index-pattern",
        "id": index_id,
    }
    return {
        "type": "visualization",
        "id": object_id,
        "attributes": {
            "title": title,
            "visState": json.dumps(vis_state, separators=compact),
            "uiStateJSON": "{}",
            "description": "",
            "version": 1,
            "kibanaSavedObjectMeta": {
                "searchSourceJSON": json.dumps(search_source(query), separators=compact)
            },
        },
        "references": [reference],
    }
def search_object(object_id: str, title: str, columns: list[str], query: str, index_id: str) -> dict:
    """Saved search showing *columns*, newest first, filtered by *query*."""
    attributes = {
        "title": title,
        "description": "",
        "columns": columns,
        "sort": [["@timestamp", "desc"]],
        "kibanaSavedObjectMeta": {
            "searchSourceJSON": json.dumps(search_source(query), separators=(",", ":"))
        },
    }
    reference = {
        "name": "kibanaSavedObjectMeta.searchSourceJSON.index",
        "type": "index-pattern",
        "id": index_id,
    }
    return {"type": "search", "id": object_id, "attributes": attributes, "references": [reference]}
def grid(x: int, y: int, w: int, h: int, i: int) -> dict:
    """Grid-data cell for a dashboard panel; the panel index is stringified."""
    cell = {"x": x, "y": y, "w": w, "h": h}
    cell["i"] = str(i)
    return cell
def panel(panel_id: str, panel_type: str, grid_data: dict, index: int) -> dict:
    """Dashboard panel entry referencing the saved object *panel_id*."""
    entry = {
        "panelIndex": str(index),
        "gridData": grid_data,
        "id": panel_id,
        "type": panel_type,
    }
    entry["version"] = DASHBOARD_VERSION
    entry["embeddableConfig"] = {}
    return entry
def full_width_panels(specs: list[tuple[str, str, int]]) -> list[dict]:
    """Stack panels vertically, each spanning the full grid width."""
    stacked: list[dict] = []
    offset = 0
    for idx, (panel_id, panel_type, height) in enumerate(specs, start=1):
        cell = grid(0, offset, GRID_COLUMNS, height, idx)
        stacked.append(panel(panel_id, panel_type, cell, idx))
        offset += height
    return stacked
def dashboard_object(object_id: str, title: str, panels: list[dict]) -> dict:
    """Dashboard saved object embedding *panels* (serialized as compact JSON)."""
    compact = (",", ":")
    options = {"useMargins": True, "hidePanelTitles": False}
    # The dashboard itself applies no filter; panels carry their own queries.
    empty_search = {"query": {"language": "kuery", "query": ""}, "filter": []}
    attributes = {
        "title": title,
        "description": "",
        "hits": 0,
        "panelsJSON": json.dumps(panels, separators=compact),
        "optionsJSON": json.dumps(options, separators=compact),
        "version": 1,
        "timeRestore": False,
        "kibanaSavedObjectMeta": {"searchSourceJSON": json.dumps(empty_search)},
    }
    return {"type": "dashboard", "id": object_id, "attributes": attributes}
def app_dashboard_objects(app: AppSpec) -> list[dict]:
    """Build one app's saved objects: histograms, terms tables, searches, dashboard.

    Object order matches the legacy layout: volume, errors, terms table(s),
    recent logs, recent errors, then the dashboard that stitches them together.
    """
    prefix = f"logs-{app.slug}"
    if app.kind == "journald":
        # Host logs: journald field names, single per-unit table.
        columns = ["@timestamp", "_HOSTNAME", "_SYSTEMD_UNIT", "MESSAGE"]
        tables = [(f"{prefix}-top-units", "Top units", "_SYSTEMD_UNIT.keyword")]
    else:
        # Container logs: kubernetes metadata, per-pod and per-container tables.
        columns = ["@timestamp", "kubernetes.pod_name", "kubernetes.container_name", "log", "message"]
        tables = [
            (f"{prefix}-top-pods", "Top pods", "kubernetes.pod_name.keyword"),
            (f"{prefix}-top-containers", "Top containers", "kubernetes.container_name.keyword"),
        ]
    objects = [
        histogram_vis(f"{prefix}-volume", f"{app.title} logs", app.query, app.index_id),
        histogram_vis(f"{prefix}-errors", f"{app.title} errors", error_query(app.query), app.index_id),
    ]
    objects.extend(
        table_vis(table_id, table_title, field, app.query, app.index_id)
        for table_id, table_title, field in tables
    )
    objects.append(search_object(f"{prefix}-recent", "Recent logs", columns, app.query, app.index_id))
    objects.append(
        search_object(
            f"{prefix}-recent-errors",
            "Recent errors",
            columns,
            error_query(app.query),
            app.index_id,
        )
    )
    # Panel layout mirrors the object order above.
    layout = [
        (f"{prefix}-volume", "visualization", H_CHART),
        (f"{prefix}-errors", "visualization", H_ERRORS),
    ]
    layout.extend((table_id, "visualization", H_TABLE) for table_id, _, _ in tables)
    layout.append((f"{prefix}-recent", "search", H_SEARCH))
    layout.append((f"{prefix}-recent-errors", "search", H_SEARCH))
    objects.append(dashboard_object(prefix, f"{app.title} Logs", full_width_panels(layout)))
    return objects
def overview_objects() -> list[dict]:
    """Cluster-wide saved objects for the Atlas Logs Overview dashboard."""
    idx = "kube-logs"
    # Terms tables: (object id, title, bucket field, KQL query).
    tables = [
        ("logs-overview-top-ns", "Top namespaces", "kubernetes.namespace_name.keyword", "*"),
        ("logs-overview-top-error-ns", "Top error namespaces", "kubernetes.namespace_name.keyword", error_query()),
        ("logs-overview-top-pods", "Top pods", "kubernetes.pod_name.keyword", "*"),
        ("logs-overview-top-nodes", "Top nodes", "kubernetes.node_name.keyword", "*"),
    ]
    objects = [
        histogram_vis("logs-overview-volume", "Logs per minute", "*", idx),
        histogram_vis("logs-overview-errors", "Errors per minute", error_query(), idx),
    ]
    objects.extend(
        table_vis(object_id, title, field, query, idx)
        for object_id, title, field, query in tables
    )
    objects.append(
        search_object(
            "logs-overview-recent-errors",
            "Recent errors",
            ["@timestamp", "kubernetes.namespace_name", "kubernetes.pod_name", "log", "message"],
            error_query(),
            idx,
        )
    )
    layout = [
        ("logs-overview-volume", "visualization", H_CHART),
        ("logs-overview-errors", "visualization", H_ERRORS),
        ("logs-overview-top-ns", "visualization", H_TABLE),
        ("logs-overview-top-error-ns", "visualization", H_TABLE),
        ("logs-overview-top-pods", "visualization", H_TABLE),
        ("logs-overview-top-nodes", "visualization", H_TABLE),
        ("logs-overview-recent-errors", "search", H_SEARCH),
    ]
    objects.append(dashboard_object("logs-overview", "Atlas Logs Overview", full_width_panels(layout)))
    return objects
def build_objects() -> list[dict]:
    """Assemble every saved object: index patterns, overview, per-app dashboards."""
    objects = [
        index_pattern("kube-logs", "kube-*"),
        index_pattern("journald-logs", "journald-*"),
    ]
    objects.extend(overview_objects())
    # One dashboard set per app. Queries are KQL; several apps share a
    # namespace (e.g. "jellyfin", "comms") and are split by label/container.
    apps = [
        AppSpec("bstein-dev-home", "bstein-dev-home", 'kubernetes.namespace_name: "bstein-dev-home"'),
        AppSpec(
            "pegasus",
            "pegasus",
            'kubernetes.namespace_name: "jellyfin" and kubernetes.labels.app: "pegasus"',
        ),
        AppSpec(
            "jellyfin",
            "jellyfin",
            'kubernetes.namespace_name: "jellyfin" and kubernetes.labels.app: "jellyfin"',
        ),
        AppSpec("vaultwarden", "vaultwarden", 'kubernetes.namespace_name: "vaultwarden"'),
        AppSpec("mailu", "mailu", 'kubernetes.namespace_name: "mailu-mailserver"'),
        AppSpec("nextcloud", "nextcloud", 'kubernetes.namespace_name: "nextcloud"'),
        AppSpec("gitea", "gitea", 'kubernetes.namespace_name: "gitea"'),
        AppSpec("jenkins", "jenkins", 'kubernetes.namespace_name: "jenkins"'),
        AppSpec("harbor", "harbor", 'kubernetes.namespace_name: "harbor"'),
        AppSpec("vault", "vault", 'kubernetes.namespace_name: "vault"'),
        AppSpec("keycloak", "keycloak", 'kubernetes.namespace_name: "sso"'),
        AppSpec("flux-system", "flux-system", 'kubernetes.namespace_name: "flux-system"'),
        AppSpec("comms", "comms", 'kubernetes.namespace_name: "comms"'),
        AppSpec(
            "element-web",
            "element-web",
            'kubernetes.namespace_name: "comms" and kubernetes.container_name: "element-web"',
        ),
        AppSpec(
            "element-call",
            "element-call",
            'kubernetes.namespace_name: "comms" and kubernetes.labels.app: "element-call"',
        ),
        AppSpec(
            "matrix-synapse",
            "matrix-synapse",
            'kubernetes.namespace_name: "comms" and kubernetes.container_name: "synapse"',
        ),
        AppSpec(
            "livekit",
            "livekit",
            'kubernetes.namespace_name: "comms" and kubernetes.labels.app: "livekit"',
        ),
        AppSpec(
            "coturn",
            "coturn",
            'kubernetes.namespace_name: "comms" and kubernetes.labels.app: "coturn"',
        ),
        # Host-level journald logs from the jump host, not a kube workload.
        AppSpec("lesavka", "lesavka", '_HOSTNAME: "titan-jh"', index_id="journald-logs", kind="journald"),
    ]
    for app in apps:
        objects.extend(app_dashboard_objects(app))
    return objects
def write_ndjson(objects: list[dict], path: Path) -> None:
    """Write saved objects as newline-delimited JSON, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    lines = [json_line(obj) for obj in objects]
    path.write_text("\n".join(lines) + "\n")
def render_configmap(ndjson_path: Path, output_path: Path) -> None:
    """Re-render the ConfigMap wrapper around the committed NDJSON payload."""
    raw_lines = ndjson_path.read_text().splitlines()
    # Indent each NDJSON line so it nests under the block scalar in the template.
    payload = "\n".join(" " + line for line in raw_lines)
    relative = output_path.relative_to(ROOT)
    output_path.write_text(CONFIG_TEMPLATE.format(relative_path=relative, payload=payload))
def main() -> None:
    """CLI entry point: optionally rebuild the NDJSON, then re-render the ConfigMap."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--build", action="store_true", help="Regenerate saved object NDJSON and ConfigMap")
    options = parser.parse_args()
    if options.build:
        write_ndjson(build_objects(), NDJSON_PATH)
    if not NDJSON_PATH.exists():
        raise SystemExit(f"Missing NDJSON file: {NDJSON_PATH}. Run with --build first.")
    render_configmap(NDJSON_PATH, CONFIG_PATH)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,554 @@
#!/usr/bin/env python3
"""Render Atlas knowledge artifacts from Flux + kustomize manifests.
Outputs (committed to git for stable diffs + RAG):
- knowledge/catalog/*.yaml
- knowledge/diagrams/*.mmd
This is intentionally conservative:
- never includes Secret objects
- never includes secret values
- keeps output deterministic (sorted)
"""
from __future__ import annotations
import argparse
import json
import re
import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Iterable
import yaml
# Repository root (two levels up from scripts/<this file>).
REPO_ROOT = Path(__file__).resolve().parents[1]
# Kinds that are cluster-scoped, so they never receive a defaulted namespace.
CLUSTER_SCOPED_KINDS = {
    "Namespace",
    "Node",
    "CustomResourceDefinition",
    "ClusterRole",
    "ClusterRoleBinding",
    "StorageClass",
    "PersistentVolume",
    "MutatingWebhookConfiguration",
    "ValidatingWebhookConfiguration",
    "APIService",
}
# Only these kinds are harvested into the knowledge catalog.
INCLUDED_KINDS = {
    "Namespace",
    "Deployment",
    "StatefulSet",
    "DaemonSet",
    "Service",
    "Ingress",
    "IngressRoute",  # traefik
    "HelmRelease",  # only to harvest ingress hostnames from values
}
def _run(cmd: list[str], *, cwd: Path) -> str:
res = subprocess.run(cmd, cwd=cwd, capture_output=True, text=True, check=False)
if res.returncode != 0:
raise RuntimeError(
f"Command failed ({res.returncode}): {' '.join(cmd)}\n{res.stderr.strip()}"
)
return res.stdout
def kustomize_build(path: Path) -> str:
    """Render *path* with kustomize, preferring ``kubectl kustomize``.

    Fallback order: plain ``kubectl kustomize``; the same invocation with load
    restrictions disabled (only when the error message indicates a restriction
    violation); finally the standalone ``kustomize`` binary, also unrestricted.
    Raises RuntimeError (via _run) if the last attempt fails too.
    """
    rel = path.relative_to(REPO_ROOT)
    try:
        return _run(["kubectl", "kustomize", str(rel)], cwd=REPO_ROOT)
    except Exception as e:
        msg = str(e)
        if "is not in or below" in msg:
            # Repo uses configMapGenerators that reference ../../scripts/*.py.
            # Kustomize load restriction must be disabled for a full render.
            try:
                return _run(
                    ["kubectl", "kustomize", "--load-restrictor=LoadRestrictionsNone", str(rel)],
                    cwd=REPO_ROOT,
                )
            except Exception:
                # Fall through to the standalone binary below.
                pass
        return _run(["kustomize", "build", "--load-restrictor=LoadRestrictionsNone", str(rel)], cwd=REPO_ROOT)
def _iter_docs(raw_yaml: str) -> Iterable[dict[str, Any]]:
    """Yield mapping documents from multi-doc YAML, flattening v1 ``List`` items."""
    for doc in yaml.safe_load_all(raw_yaml):
        if not isinstance(doc, dict):
            continue
        if doc.get("kind") == "List" and isinstance(doc.get("items"), list):
            # Unwrap v1 List objects into their member documents.
            yield from (item for item in doc["items"] if isinstance(item, dict))
        elif doc.get("kind"):
            yield doc
def _meta(doc: dict[str, Any]) -> tuple[str, str | None]:
md = doc.get("metadata") or {}
name = md.get("name") or ""
namespace = md.get("namespace")
return name, namespace
def _is_namespaced(doc: dict[str, Any]) -> bool:
    """True unless the manifest kind is a known cluster-scoped kind."""
    return (doc.get("kind") or "") not in CLUSTER_SCOPED_KINDS
@dataclass(frozen=True)
class FluxKustomization:
    """One Flux Kustomization CR: where it renders from and its default namespace."""

    # metadata.name of the Kustomization object.
    name: str
    # spec.path relative to the repo root (leading "./" stripped).
    path: str
    # spec.targetNamespace, applied to namespaced docs that lack a namespace.
    target_namespace: str | None
def find_flux_kustomizations() -> list[FluxKustomization]:
    """Find Flux Kustomization CRs under clusters/atlas/flux-system."""
    root = REPO_ROOT / "clusters" / "atlas" / "flux-system"
    items: list[FluxKustomization] = []
    for file in sorted(root.rglob("*.yaml")):
        raw = file.read_text()
        for doc in _iter_docs(raw):
            # Only Flux's Kustomization CRs, not kustomize.config.k8s.io files.
            if doc.get("kind") != "Kustomization":
                continue
            api = str(doc.get("apiVersion") or "")
            if not api.startswith("kustomize.toolkit.fluxcd.io/"):
                continue
            name, _ = _meta(doc)
            spec = doc.get("spec") or {}
            path = spec.get("path")
            # Skip entries without a usable spec.path.
            if not isinstance(path, str) or not path.strip():
                continue
            items.append(
                FluxKustomization(
                    name=name,
                    # NOTE(review): lstrip("./") strips any leading run of '.' and
                    # '/' characters, not just a "./" prefix — fine for paths like
                    # "./services/x"; confirm no spec.path starts with "../".
                    path=path.strip().lstrip("./"),
                    target_namespace=spec.get("targetNamespace"),
                )
            )
    # Deterministic output: sorted by Kustomization name.
    return sorted(items, key=lambda k: k.name)
def _safe_string_scan_for_hosts(value: Any) -> set[str]:
"""Best-effort host scan from HelmRelease values without chart rendering."""
hosts: set[str] = set()
if isinstance(value, str):
for m in re.finditer(r"(?i)([a-z0-9-]+(?:\.[a-z0-9-]+)+)", value):
host = m.group(1).lower()
if host.endswith("bstein.dev"):
hosts.add(host)
return hosts
if isinstance(value, list):
for item in value:
hosts |= _safe_string_scan_for_hosts(item)
return hosts
if isinstance(value, dict):
for item in value.values():
hosts |= _safe_string_scan_for_hosts(item)
return hosts
return hosts
def _service_ports(svc: dict[str, Any]) -> list[dict[str, Any]]:
spec = svc.get("spec") or {}
out: list[dict[str, Any]] = []
for p in spec.get("ports") or []:
if not isinstance(p, dict):
continue
out.append(
{
"name": p.get("name"),
"port": p.get("port"),
"targetPort": p.get("targetPort"),
"protocol": p.get("protocol", "TCP"),
}
)
return out
def _workload_labels(doc: dict[str, Any]) -> dict[str, str]:
tpl = (doc.get("spec") or {}).get("template") or {}
md = tpl.get("metadata") or {}
labels = md.get("labels") or {}
return {str(k): str(v) for k, v in labels.items()} if isinstance(labels, dict) else {}
def _service_selector(doc: dict[str, Any]) -> dict[str, str]:
spec = doc.get("spec") or {}
sel = spec.get("selector") or {}
return {str(k): str(v) for k, v in sel.items()} if isinstance(sel, dict) else {}
def _selector_matches(selector: dict[str, str], labels: dict[str, str]) -> bool:
if not selector:
return False
return all(labels.get(k) == v for k, v in selector.items())
def _sanitize_node_id(text: str) -> str:
return re.sub(r"[^a-zA-Z0-9_]", "_", text)
def extract_catalog(
    rendered: list[tuple[FluxKustomization, list[dict[str, Any]]]],
) -> tuple[dict[str, Any], dict[str, Any], str]:
    """Build knowledge catalog + mermaid diagram from rendered docs.

    Returns (catalog, summary, diagram): the catalog dict (sources, workloads,
    services, HTTP endpoints, HelmRelease host hints), a counts-only summary,
    and a mermaid ``flowchart LR`` string mapping host -> service -> workload.
    All outputs are deterministically sorted.
    """
    # Index workloads and services for mapping.
    workloads: dict[tuple[str, str], dict[str, Any]] = {}
    services: dict[tuple[str, str], dict[str, Any]] = {}
    ingresses: list[dict[str, Any]] = []
    ingressroutes: list[dict[str, Any]] = []
    helmrelease_hosts: dict[str, list[str]] = {}
    for src, docs in rendered:
        for doc in docs:
            kind = doc.get("kind")
            if kind not in INCLUDED_KINDS:
                continue
            # NOTE(review): defensive only — "Secret" is never in INCLUDED_KINDS,
            # so this guard is unreachable; kept as belt-and-braces.
            if kind == "Secret":
                continue
            name, namespace = _meta(doc)
            # Apply the Flux targetNamespace default to namespaced docs that
            # do not declare their own namespace (shallow copy before mutating).
            if _is_namespaced(doc) and not namespace and src.target_namespace:
                namespace = src.target_namespace
                doc = dict(doc)
                doc.setdefault("metadata", {})["namespace"] = namespace
            if kind in ("Deployment", "StatefulSet", "DaemonSet"):
                workloads[(namespace or "", name)] = {
                    "kind": kind,
                    "namespace": namespace or "",
                    "name": name,
                    "labels": _workload_labels(doc),
                    "serviceAccountName": ((doc.get("spec") or {}).get("template") or {})
                    .get("spec", {})
                    .get("serviceAccountName"),
                    "nodeSelector": ((doc.get("spec") or {}).get("template") or {})
                    .get("spec", {})
                    .get("nodeSelector", {}),
                    # Unique, sorted container images from the pod template.
                    "images": sorted(
                        {
                            c.get("image")
                            for c in (
                                (((doc.get("spec") or {}).get("template") or {}).get("spec") or {}).get(
                                    "containers"
                                )
                                or []
                            )
                            if isinstance(c, dict) and c.get("image")
                        }
                    ),
                }
            elif kind == "Service":
                services[(namespace or "", name)] = {
                    "namespace": namespace or "",
                    "name": name,
                    "type": (doc.get("spec") or {}).get("type", "ClusterIP"),
                    "selector": _service_selector(doc),
                    "ports": _service_ports(doc),
                }
            elif kind == "Ingress":
                ingresses.append({"source": src.name, "doc": doc})
            elif kind == "IngressRoute":
                ingressroutes.append({"source": src.name, "doc": doc})
            elif kind == "HelmRelease":
                # Hostnames are only scanned from values, never chart-rendered.
                spec = doc.get("spec") or {}
                vals = spec.get("values") or {}
                hosts = sorted(_safe_string_scan_for_hosts(vals))
                if hosts:
                    helmrelease_hosts[f"{src.name}:{namespace or ''}/{name}"] = hosts
    # Map services to workloads.
    service_to_workloads: dict[tuple[str, str], list[dict[str, str]]] = {}
    for (ns, svc_name), svc in services.items():
        selector = svc.get("selector") or {}
        matches: list[dict[str, str]] = []
        for (w_ns, w_name), w in workloads.items():
            # Selectors only match workloads in the same namespace.
            if w_ns != ns:
                continue
            if _selector_matches(selector, w.get("labels") or {}):
                matches.append({"kind": w["kind"], "name": w_name})
        service_to_workloads[(ns, svc_name)] = sorted(matches, key=lambda m: (m["kind"], m["name"]))
    # Extract HTTP endpoints.
    endpoints: list[dict[str, Any]] = []
    def add_endpoint(
        *,
        host: str,
        path: str,
        namespace: str,
        service: str,
        port: Any,
        source: str,
        kind: str,
        obj_name: str,
    ):
        # Closure appending one host/path -> backend record to `endpoints`.
        wk = service_to_workloads.get((namespace, service), [])
        endpoints.append(
            {
                "host": host,
                "path": path,
                "backend": {
                    "namespace": namespace,
                    "service": service,
                    "port": port,
                    "workloads": wk,
                },
                "via": {"kind": kind, "name": obj_name, "source": source},
            }
        )
    # networking.k8s.io Ingress rules.
    for item in ingresses:
        doc = item["doc"]
        source = item["source"]
        name, namespace = _meta(doc)
        namespace = namespace or ""
        spec = doc.get("spec") or {}
        for rule in spec.get("rules") or []:
            if not isinstance(rule, dict):
                continue
            host = (rule.get("host") or "").strip()
            http = rule.get("http") or {}
            for p in http.get("paths") or []:
                if not isinstance(p, dict):
                    continue
                backend = (p.get("backend") or {}).get("service") or {}
                svc_name = backend.get("name")
                # Port may be numeric or named; prefer the number.
                svc_port = (backend.get("port") or {}).get("number") or (backend.get("port") or {}).get("name")
                if not host or not svc_name:
                    continue
                add_endpoint(
                    host=host,
                    path=p.get("path") or "/",
                    namespace=namespace,
                    service=svc_name,
                    port=svc_port,
                    source=source,
                    kind="Ingress",
                    obj_name=name,
                )
    # Traefik IngressRoute: hosts/paths are embedded in the match expression.
    host_re = re.compile(r"Host\(`([^`]+)`\)")
    pathprefix_re = re.compile(r"PathPrefix\(`([^`]+)`\)")
    for item in ingressroutes:
        doc = item["doc"]
        source = item["source"]
        name, namespace = _meta(doc)
        namespace = namespace or ""
        spec = doc.get("spec") or {}
        for route in spec.get("routes") or []:
            if not isinstance(route, dict):
                continue
            match = route.get("match") or ""
            hosts = host_re.findall(match)
            # No PathPrefix in the match means the route covers "/".
            pathprefixes = pathprefix_re.findall(match) or ["/"]
            for svc in route.get("services") or []:
                if not isinstance(svc, dict):
                    continue
                svc_name = svc.get("name")
                svc_port = svc.get("port")
                if not svc_name:
                    continue
                # Cartesian product: every host x every path prefix.
                for host in hosts:
                    for pp in pathprefixes:
                        add_endpoint(
                            host=host,
                            path=pp,
                            namespace=namespace,
                            service=svc_name,
                            port=svc_port,
                            source=source,
                            kind="IngressRoute",
                            obj_name=name,
                        )
    # Deterministic ordering for stable diffs.
    endpoints = sorted(
        endpoints,
        key=lambda e: (
            e["host"],
            e["path"],
            e["backend"]["namespace"],
            e["backend"]["service"],
        ),
    )
    catalog = {
        "cluster": "atlas",
        "sources": [
            {"name": k.name, "path": k.path, "targetNamespace": k.target_namespace}
            for k, _ in rendered
        ],
        "workloads": sorted(
            list(workloads.values()),
            key=lambda w: (w["namespace"], w["kind"], w["name"]),
        ),
        "services": sorted(
            list(services.values()),
            key=lambda s: (s["namespace"], s["name"]),
        ),
        "http_endpoints": endpoints,
        "helmrelease_host_hints": {k: v for k, v in sorted(helmrelease_hosts.items())},
    }
    # Mermaid diagram: host -> service -> workload (grouped by namespace).
    ns_nodes: dict[str, list[str]] = {}
    lines: list[str] = ["flowchart LR"]
    edges: set[tuple[str, str]] = set()
    def ensure_ns_node(ns: str, node_id: str):
        # Register node_id under its namespace exactly once (order preserved).
        ns_nodes.setdefault(ns, [])
        if node_id not in ns_nodes[ns]:
            ns_nodes[ns].append(node_id)
    host_nodes: dict[str, str] = {}
    for ep in endpoints:
        host = ep["host"]
        host_id = host_nodes.get(host)
        if not host_id:
            # First sighting of this host: declare its node.
            host_id = f"host_{_sanitize_node_id(host)}"
            host_nodes[host] = host_id
            lines.append(f' {host_id}["{host}"]')
        ns = ep["backend"]["namespace"]
        svc = ep["backend"]["service"]
        svc_id = f"svc_{_sanitize_node_id(ns)}_{_sanitize_node_id(svc)}"
        if svc_id not in ns_nodes.get(ns, []):
            lines.append(f' {svc_id}["{ns}/{svc} (Service)"]')
        ensure_ns_node(ns, svc_id)
        # Deduplicate edges; each pair is drawn once.
        if (host_id, svc_id) not in edges:
            edges.add((host_id, svc_id))
            lines.append(f" {host_id} --> {svc_id}")
        for w in ep["backend"]["workloads"]:
            w_id = f"wl_{_sanitize_node_id(ns)}_{_sanitize_node_id(w['name'])}"
            if w_id not in ns_nodes.get(ns, []):
                lines.append(f' {w_id}["{ns}/{w["name"]} ({w["kind"]})"]')
            ensure_ns_node(ns, w_id)
            if (svc_id, w_id) not in edges:
                edges.add((svc_id, w_id))
                lines.append(f" {svc_id} --> {w_id}")
    # Wrap namespace subgraphs at the end for stability (sorted namespaces).
    if ns_nodes:
        lines.append("")
        for ns in sorted(ns_nodes.keys()):
            lines.append(f" subgraph { _sanitize_node_id(ns) }[{ns}]")
            for node_id in ns_nodes[ns]:
                lines.append(f" {node_id}")
            lines.append(" end")
    diagram = "\n".join(lines).rstrip() + "\n"
    summary = {
        "counts": {
            "workloads": len(workloads),
            "services": len(services),
            "http_endpoints": len(endpoints),
            "helmrelease_host_hints": sum(len(v) for v in helmrelease_hosts.values()),
        }
    }
    return catalog, summary, diagram
def main() -> int:
    """Render the Atlas knowledge catalog, diagrams, and runbook JSON.

    Without --write, prints the summary counts JSON and exits. With --write,
    emits catalog YAML/JSON, a summary, a Mermaid diagram, and runbooks.json
    under the --out directory.

    Returns a process exit code (0 on success, 2 when no Flux Kustomizations
    are found).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", default="knowledge", help="Output base directory (default: knowledge/)")
    ap.add_argument(
        "--write",
        action="store_true",
        help="Write generated files (otherwise just print a summary).",
    )
    args = ap.parse_args()
    out_dir = REPO_ROOT / args.out
    flux = find_flux_kustomizations()
    if not flux:
        print("No Flux Kustomizations found under clusters/atlas/flux-system.", file=sys.stderr)
        return 2
    rendered: list[tuple[FluxKustomization, list[dict[str, Any]]]] = []
    for k in flux:
        path = REPO_ROOT / k.path
        # Skip Kustomizations whose path does not exist in the working tree.
        if not path.exists():
            continue
        raw = kustomize_build(path)
        # Secrets are excluded from the rendered catalog.
        docs = [d for d in _iter_docs(raw) if d.get("kind") != "Secret"]
        rendered.append((k, docs))
    # Stable ordering keeps generated output diffable across runs.
    rendered = sorted(rendered, key=lambda item: item[0].name)
    catalog, summary, diagram = extract_catalog(rendered)
    if not args.write:
        print(json.dumps(summary, indent=2, sort_keys=True))
        return 0
    (out_dir / "catalog").mkdir(parents=True, exist_ok=True)
    (out_dir / "diagrams").mkdir(parents=True, exist_ok=True)
    catalog_path = out_dir / "catalog" / "atlas.yaml"
    catalog_json_path = out_dir / "catalog" / "atlas.json"
    summary_path = out_dir / "catalog" / "atlas-summary.json"
    diagram_path = out_dir / "diagrams" / "atlas-http.mmd"
    runbooks_json_path = out_dir / "catalog" / "runbooks.json"
    catalog_path.write_text(
        "# Generated by scripts/knowledge_render_atlas.py (do not edit by hand)\n"
        + yaml.safe_dump(catalog, sort_keys=False),
        encoding="utf-8",
    )
    catalog_json_path.write_text(json.dumps(catalog, indent=2, sort_keys=False) + "\n", encoding="utf-8")
    summary_path.write_text(json.dumps(summary, indent=2, sort_keys=True) + "\n", encoding="utf-8")
    diagram_path.write_text(diagram, encoding="utf-8")
    # Render runbooks into JSON for lightweight, dependency-free consumption in-cluster.
    runbooks_dir = out_dir / "runbooks"
    runbooks: list[dict[str, Any]] = []
    if runbooks_dir.exists():
        for md_file in sorted(runbooks_dir.glob("*.md")):
            raw = md_file.read_text(encoding="utf-8")
            fm: dict[str, Any] = {}
            body = raw
            # Parse optional YAML frontmatter delimited by --- markers; on any
            # parse failure, fall back to treating the whole file as body.
            if raw.startswith("---\n"):
                try:
                    _, rest = raw.split("---\n", 1)
                    fm_raw, body = rest.split("\n---\n", 1)
                    fm = yaml.safe_load(fm_raw) or {}
                except Exception:
                    fm = {}
                    body = raw
            runbooks.append(
                {
                    "path": str(md_file.relative_to(out_dir)),
                    "title": fm.get("title") or md_file.stem,
                    "tags": fm.get("tags") or [],
                    "entrypoints": fm.get("entrypoints") or [],
                    "source_paths": fm.get("source_paths") or [],
                    "body": body.strip(),
                }
            )
    runbooks_json_path.write_text(json.dumps(runbooks, indent=2, sort_keys=False) + "\n", encoding="utf-8")
    print(f"Wrote {catalog_path.relative_to(REPO_ROOT)}")
    print(f"Wrote {catalog_json_path.relative_to(REPO_ROOT)}")
    print(f"Wrote {summary_path.relative_to(REPO_ROOT)}")
    print(f"Wrote {diagram_path.relative_to(REPO_ROOT)}")
    print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}")
    return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@ -0,0 +1,313 @@
#!/usr/bin/env python3
"""Generate OpenSearch Observability seed objects and render them into ConfigMaps.
Usage:
scripts/logging_render_observability.py --build # rebuild JSON + ConfigMap
scripts/logging_render_observability.py # re-render ConfigMap from JSON
"""
from __future__ import annotations
import argparse
import json
import textwrap
from dataclasses import dataclass
from pathlib import Path
# Repository root (this script lives in scripts/, one level below root).
ROOT = Path(__file__).resolve().parents[1]
# Source-of-truth JSON payloads live next to the logging service manifests.
OBS_DIR = ROOT / "services" / "logging" / "observability"
APPS_PATH = OBS_DIR / "applications.json"
QUERIES_PATH = OBS_DIR / "saved_queries.json"
VIS_PATH = OBS_DIR / "saved_visualizations.json"
# Rendered ConfigMap manifest consumed by the logging kustomization.
CONFIG_PATH = ROOT / "services" / "logging" / "opensearch-observability-objects.yaml"
CONFIG_TEMPLATE = textwrap.dedent(
"""# {relative_path}
# Generated by scripts/logging_render_observability.py --build
apiVersion: v1
kind: ConfigMap
metadata:
name: opensearch-observability-objects
namespace: logging
data:
applications.json: |
{applications}
saved_queries.json: |
{queries}
saved_visualizations.json: |
{visualizations}
"""
)
# Default selections attached to every saved query/visualization payload.
DEFAULT_RANGE = {"start": "now-24h", "end": "now", "text": ""}
DEFAULT_TIMESTAMP = {"name": "@timestamp", "type": "timestamp"}
DEFAULT_FIELDS = {"text": "", "tokens": []}
@dataclass(frozen=True)
class AppSpec:
    """One Observability 'application' seed: a name plus its base PPL query."""
    name: str
    base_query: str
    # "kube" or "journald" — selects which error filter is appended in build_objects().
    kind: str = "kube"
    description: str = ""
@dataclass(frozen=True)
class QuerySpec:
    """A named saved-query seed (PPL query string plus metadata)."""
    name: str
    query: str
    description: str = ""
@dataclass(frozen=True)
class VisualizationSpec:
    """A named saved-visualization seed (PPL query, chart type, metadata)."""
    name: str
    query: str
    # Chart type string, e.g. "line" or "bar", as used by build_objects().
    vis_type: str
    description: str = ""
def source_query(index: str, where: str | None = None) -> str:
query = f"source = {index}"
if where:
query += f" | where {where}"
return query
def error_filter(fields: list[str]) -> str:
    """Join per-field PPL ``match(..., 'error|exception|fail')`` predicates with ``or``."""
    return " or ".join(f"match({name}, 'error|exception|fail')" for name in fields)
def saved_query(spec: QuerySpec) -> dict:
    """Render a QuerySpec into a saved-query payload with the default selections."""
    payload = {
        "name": spec.name,
        "description": spec.description,
        "query": spec.query,
    }
    payload["selected_date_range"] = DEFAULT_RANGE
    payload["selected_timestamp"] = DEFAULT_TIMESTAMP
    payload["selected_fields"] = DEFAULT_FIELDS
    return payload
def saved_visualization(spec: VisualizationSpec) -> dict:
    """Render a VisualizationSpec into a saved-visualization payload with defaults."""
    payload = {
        "name": spec.name,
        "description": spec.description,
        "query": spec.query,
        "type": spec.vis_type,
    }
    payload["selected_date_range"] = DEFAULT_RANGE
    payload["selected_timestamp"] = DEFAULT_TIMESTAMP
    payload["selected_fields"] = DEFAULT_FIELDS
    return payload
def build_objects() -> tuple[list[dict], list[dict], list[dict]]:
    """Build the (applications, saved_queries, saved_visualizations) payloads.

    Applications are seeded from a fixed list of AppSpecs; each app also gets
    a "<name> logs" and "<name> errors" saved query. The error clause depends
    on the app kind: journald apps filter on MESSAGE, kube apps on log/message.
    """
    kube_error = error_filter(["log", "message"])
    journald_error = error_filter(["MESSAGE"])
    # One AppSpec per service; kube apps scope by namespace (and sometimes
    # container/label), journald apps scope by host.
    apps = [
        AppSpec("bstein-dev-home", source_query("kube-*", "kubernetes.namespace_name = 'bstein-dev-home'")),
        AppSpec(
            "pegasus",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'jellyfin' and kubernetes.labels.app = 'pegasus'",
            ),
        ),
        AppSpec(
            "jellyfin",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'jellyfin' and kubernetes.labels.app = 'jellyfin'",
            ),
        ),
        AppSpec("vaultwarden", source_query("kube-*", "kubernetes.namespace_name = 'vaultwarden'")),
        AppSpec("mailu", source_query("kube-*", "kubernetes.namespace_name = 'mailu-mailserver'")),
        AppSpec("nextcloud", source_query("kube-*", "kubernetes.namespace_name = 'nextcloud'")),
        AppSpec("gitea", source_query("kube-*", "kubernetes.namespace_name = 'gitea'")),
        AppSpec("jenkins", source_query("kube-*", "kubernetes.namespace_name = 'jenkins'")),
        AppSpec("harbor", source_query("kube-*", "kubernetes.namespace_name = 'harbor'")),
        AppSpec("vault", source_query("kube-*", "kubernetes.namespace_name = 'vault'")),
        AppSpec("keycloak", source_query("kube-*", "kubernetes.namespace_name = 'sso'")),
        AppSpec("flux-system", source_query("kube-*", "kubernetes.namespace_name = 'flux-system'")),
        AppSpec("comms", source_query("kube-*", "kubernetes.namespace_name = 'comms'")),
        AppSpec(
            "element-web",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'comms' and kubernetes.container_name = 'element-web'",
            ),
        ),
        AppSpec(
            "element-call",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'element-call'",
            ),
        ),
        AppSpec(
            "matrix-synapse",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'comms' and kubernetes.container_name = 'synapse'",
            ),
        ),
        AppSpec(
            "livekit",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'livekit'",
            ),
        ),
        AppSpec(
            "coturn",
            source_query(
                "kube-*",
                "kubernetes.namespace_name = 'comms' and kubernetes.labels.app = 'coturn'",
            ),
        ),
        AppSpec(
            "lesavka",
            source_query("journald-*", "_HOSTNAME = 'titan-jh'"),
            kind="journald",
        ),
    ]
    # Application objects; traceGroups reuses the app name.
    applications = [
        {
            "name": app.name,
            "description": app.description,
            "baseQuery": app.base_query,
            "servicesEntities": [],
            "traceGroups": [app.name],
        }
        for app in apps
    ]
    # Index-wide queries first, then per-app logs/errors pairs.
    queries = [
        saved_query(QuerySpec("kube logs", source_query("kube-*"))),
        saved_query(QuerySpec("kube errors", f"{source_query('kube-*')} | where {kube_error}")),
        saved_query(QuerySpec("journald logs", source_query("journald-*"))),
        saved_query(QuerySpec("journald errors", f"{source_query('journald-*')} | where {journald_error}")),
    ]
    for app in apps:
        query_base = app.base_query
        error_clause = journald_error if app.kind == "journald" else kube_error
        queries.append(saved_query(QuerySpec(f"{app.name} logs", query_base)))
        queries.append(saved_query(QuerySpec(f"{app.name} errors", f"{query_base} | where {error_clause}")))
    # Fixed set of overview charts (hourly rates plus top-N breakdowns).
    visualizations = [
        saved_visualization(
            VisualizationSpec(
                "[Kube] Logs per hour",
                "source = kube-* | stats count() as log_count by span(`@timestamp`, 1h)",
                "line",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Kube] Errors per hour",
                f"source = kube-* | where {kube_error} | stats count() as error_count by span(`@timestamp`, 1h)",
                "line",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Kube] Top namespaces",
                "source = kube-* | stats count() as log_count by kubernetes.namespace_name | sort - log_count",
                "bar",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Kube] Top error namespaces",
                f"source = kube-* | where {kube_error} | stats count() as error_count by kubernetes.namespace_name | sort - error_count",
                "bar",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Kube] Top pods",
                "source = kube-* | stats count() as log_count by kubernetes.pod_name | sort - log_count",
                "bar",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Kube] Top error pods",
                f"source = kube-* | where {kube_error} | stats count() as error_count by kubernetes.pod_name | sort - error_count",
                "bar",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Kube] Top nodes",
                "source = kube-* | stats count() as log_count by kubernetes.node_name | sort - log_count",
                "bar",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Journald] Top units",
                "source = journald-* | stats count() as log_count by _SYSTEMD_UNIT | sort - log_count",
                "bar",
            )
        ),
        saved_visualization(
            VisualizationSpec(
                "[Journald] Top error units",
                f"source = journald-* | where {journald_error} | stats count() as error_count by _SYSTEMD_UNIT | sort - error_count",
                "bar",
            )
        ),
    ]
    return applications, queries, visualizations
def write_json(payload: list[dict], path: Path) -> None:
    """Write ``payload`` as pretty-printed JSON (2-space indent, trailing newline).

    Creates parent directories as needed. Encodes explicitly as UTF-8 so the
    output does not depend on the process locale (previously used the default
    locale encoding).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
def render_configmap(apps_path: Path, queries_path: Path, vis_path: Path, output_path: Path) -> None:
    """Render the three JSON payloads into the ConfigMap manifest at ``output_path``.

    Each payload is indented four spaces (via indent_payload) so it nests under
    the ConfigMap ``data`` keys; the template header records the manifest's
    repo-relative path.
    """
    relative_path = output_path.relative_to(ROOT)
    applications = indent_payload(apps_path)
    queries = indent_payload(queries_path)
    visualizations = indent_payload(vis_path)
    # Write explicitly as UTF-8 so output does not depend on the process locale.
    output_path.write_text(
        CONFIG_TEMPLATE.format(
            relative_path=relative_path,
            applications=applications,
            queries=queries,
            visualizations=visualizations,
        ),
        encoding="utf-8",
    )
def indent_payload(path: Path) -> str:
    """Return the file's text with every line indented four spaces.

    Reads explicitly as UTF-8 (previously locale-dependent). The trailing
    newline is dropped by splitlines(); the template supplies it.
    """
    lines = path.read_text(encoding="utf-8").splitlines()
    return "\n".join("    " + line for line in lines)
def main() -> None:
    """CLI entry point: optionally rebuild the JSON payloads, then render the ConfigMap."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--build", action="store_true", help="Regenerate JSON payloads and ConfigMap")
    options = parser.parse_args()
    if options.build:
        apps, queries, vis = build_objects()
        for payload, target in ((apps, APPS_PATH), (queries, QUERIES_PATH), (vis, VIS_PATH)):
            write_json(payload, target)
    # All three payloads must exist before rendering the ConfigMap.
    if not all(p.exists() for p in (APPS_PATH, QUERIES_PATH, VIS_PATH)):
        raise SystemExit("Missing observability JSON payloads. Run with --build first.")
    render_configmap(APPS_PATH, QUERIES_PATH, VIS_PATH, CONFIG_PATH)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,149 @@
#!/usr/bin/env python3
import datetime as dt
import os
import time
from dataclasses import dataclass
import requests
from prometheus_client import Gauge, Info, start_http_server
@dataclass(frozen=True)
class Window:
    """A reporting window: metric label plus how many days back from today it spans."""
    label: str
    days: int
# Rolling windows reported per poll; "today" is a same-day (0 days back) window.
WINDOWS = [
    Window("today", 0),
    Window("1d", 1),
    Window("7d", 7),
    Window("30d", 30),
]
# Configuration via environment variables.
API_BASE = os.environ.get("POSTMARK_API_BASE", "https://api.postmarkapp.com").rstrip("/")
POLL_INTERVAL_SECONDS = int(os.environ.get("POLL_INTERVAL_SECONDS", "60"))
LISTEN_ADDRESS = os.environ.get("LISTEN_ADDRESS", "0.0.0.0")
LISTEN_PORT = int(os.environ.get("LISTEN_PORT", "8000"))
PRIMARY_TOKEN = os.environ.get("POSTMARK_SERVER_TOKEN", "").strip()
FALLBACK_TOKEN = os.environ.get("POSTMARK_SERVER_TOKEN_FALLBACK", "").strip()
# Which window label feeds the sending-limit gauges (must match a Window label).
LIMIT_WINDOW = os.environ.get("POSTMARK_SENDING_LIMIT_WINDOW", "30d").strip()
LIMIT_RAW = os.environ.get("POSTMARK_SENDING_LIMIT", "").strip()
try:
    # Empty or non-numeric limit degrades to 0.0, treated as "no limit configured".
    SENDING_LIMIT = float(LIMIT_RAW) if LIMIT_RAW else 0.0
except ValueError:
    SENDING_LIMIT = 0.0
# Static exporter metadata exposed as an Info metric.
EXPORTER_INFO = Info("postmark_exporter", "Exporter build info")
EXPORTER_INFO.info(
    {
        "api_base": API_BASE,
        "windows": ",".join(window.label for window in WINDOWS),
    }
)
# Health metrics updated by the poll loop in main().
POSTMARK_API_UP = Gauge("postmark_api_up", "Whether Postmark API is reachable (1) or not (0)")
POSTMARK_LAST_SUCCESS = Gauge(
    "postmark_last_success_timestamp_seconds",
    "Unix timestamp of the last successful Postmark stats refresh",
)
# NOTE(review): despite the _total suffix this is a Gauge used as a
# monotonically increasing error count (via .inc()); a Counter would be the
# conventional type — confirm before changing, as the metric name is exposed.
POSTMARK_REQUEST_ERRORS = Gauge(
    "postmark_request_errors_total",
    "Total Postmark stats request errors since exporter start",
)
# Per-window outbound stats, labelled by the Window label ("today", "1d", ...).
POSTMARK_OUTBOUND_SENT = Gauge(
    "postmark_outbound_sent",
    "Outbound emails sent within the selected window",
    labelnames=("window",),
)
POSTMARK_OUTBOUND_BOUNCED = Gauge(
    "postmark_outbound_bounced",
    "Outbound emails bounced within the selected window",
    labelnames=("window",),
)
POSTMARK_OUTBOUND_BOUNCE_RATE = Gauge(
    "postmark_outbound_bounce_rate",
    "Outbound bounce rate percentage within the selected window",
    labelnames=("window",),
)
# Sending-limit gauges fed from the LIMIT_WINDOW window.
POSTMARK_SENDING_LIMIT_GAUGE = Gauge(
    "postmark_sending_limit",
    "Configured Postmark sending limit for the active account",
)
POSTMARK_SENDING_LIMIT_USED = Gauge(
    "postmark_sending_limit_used",
    "Messages sent within the configured send limit window",
)
POSTMARK_SENDING_LIMIT_USED_PERCENT = Gauge(
    "postmark_sending_limit_used_percent",
    "Percent of the configured send limit used within the limit window",
)
def fetch_outbound_stats(token: str, window: Window) -> dict:
    """Fetch Postmark outbound stats for [today - window.days, today].

    Raises requests.HTTPError on non-2xx responses.
    """
    end = dt.date.today()
    start = end - dt.timedelta(days=window.days)
    response = requests.get(
        f"{API_BASE}/stats/outbound",
        headers={
            "Accept": "application/json",
            "X-Postmark-Server-Token": token,
        },
        params={"fromdate": start.isoformat(), "todate": end.isoformat()},
        timeout=15,
    )
    response.raise_for_status()
    return response.json()
def update_metrics(token: str) -> None:
    """Refresh the per-window outbound gauges and the sending-limit gauges."""
    sent_by_window: dict = {}
    for window in WINDOWS:
        stats = fetch_outbound_stats(token, window)
        sent = int(stats.get("Sent", 0) or 0)
        bounced = int(stats.get("Bounced", 0) or 0)
        sent_by_window[window.label] = sent
        POSTMARK_OUTBOUND_SENT.labels(window=window.label).set(sent)
        POSTMARK_OUTBOUND_BOUNCED.labels(window=window.label).set(bounced)
        # Guard against division by zero when nothing was sent.
        bounce_rate = (bounced / sent * 100.0) if sent else 0.0
        POSTMARK_OUTBOUND_BOUNCE_RATE.labels(window=window.label).set(bounce_rate)
    POSTMARK_SENDING_LIMIT_GAUGE.set(SENDING_LIMIT)
    used = sent_by_window.get(LIMIT_WINDOW, 0)
    POSTMARK_SENDING_LIMIT_USED.set(used)
    percent = (used / SENDING_LIMIT * 100.0) if SENDING_LIMIT else 0.0
    POSTMARK_SENDING_LIMIT_USED_PERCENT.set(percent)
def main() -> None:
    """Start the metrics HTTP server and poll Postmark forever.

    Token handling: the primary token is used as long as it works; the
    exporter rotates to the next configured token only after a failed
    refresh. (Previously the index advanced on every poll, so the fallback
    token was exercised on alternate polls even when the primary was healthy.)
    """
    if not PRIMARY_TOKEN and not FALLBACK_TOKEN:
        raise SystemExit("POSTMARK_SERVER_TOKEN or POSTMARK_SERVER_TOKEN_FALLBACK is required")
    start_http_server(LISTEN_PORT, addr=LISTEN_ADDRESS)
    tokens = [token for token in (PRIMARY_TOKEN, FALLBACK_TOKEN) if token]
    token_index = 0
    while True:
        token = tokens[token_index % len(tokens)]
        try:
            update_metrics(token)
            POSTMARK_API_UP.set(1)
            POSTMARK_LAST_SUCCESS.set(time.time())
        except Exception as exc:  # noqa: BLE001
            POSTMARK_API_UP.set(0)
            POSTMARK_REQUEST_ERRORS.inc()
            # Rotate to the next token only after a failure.
            token_index += 1
            print(f"postmark_exporter: refresh failed: {exc}", flush=True)
        time.sleep(POLL_INTERVAL_SECONDS)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,35 @@
#!/usr/bin/env python3
from pathlib import Path
def indent(text: str, spaces: int) -> str:
    """Prefix each non-blank line of ``text`` with ``spaces`` spaces.

    Lines that are empty (newline only) are left untouched so blank lines in
    the output carry no trailing whitespace.
    """
    pad = " " * spaces
    pieces = []
    for line in text.splitlines(keepends=True):
        pieces.append(pad + line if line.strip("\n") else line)
    return "".join(pieces)
def main() -> None:
    """Embed the exporter script into its ConfigMap manifest and write it out."""
    root = Path(__file__).resolve().parents[1]
    source = root / "scripts" / "monitoring_postmark_exporter.py"
    target = root / "services" / "monitoring" / "postmark-exporter-script.yaml"
    payload = source.read_text(encoding="utf-8")
    # Ensure the embedded script ends with a newline inside the block scalar.
    if not payload.endswith("\n"):
        payload += "\n"
    # NOTE(review): no metadata.namespace on this ConfigMap — presumably the
    # namespace is supplied by the kustomization at apply time; confirm.
    header = (
        "# services/monitoring/postmark-exporter-script.yaml\n"
        "apiVersion: v1\n"
        "kind: ConfigMap\n"
        "metadata:\n"
        "  name: postmark-exporter-script\n"
        "data:\n"
        "  monitoring_postmark_exporter.py: |\n"
    )
    target.write_text(header + indent(payload, 4), encoding="utf-8")
if __name__ == "__main__":
main()

View File

@ -1,49 +0,0 @@
#!/bin/bash
set -euo pipefail
KC_BASE="${KC_BASE:?}"
KC_REALM="${KC_REALM:?}"
KC_ADMIN_USER="${KC_ADMIN_USER:?}"
KC_ADMIN_PASS="${KC_ADMIN_PASS:?}"
if ! command -v jq >/dev/null 2>&1; then
apt-get update && apt-get install -y jq curl >/dev/null
fi
account_exists() {
# Skip if the account email is already present in the mail app.
runuser -u www-data -- php occ mail:account:list 2>/dev/null | grep -Fq " ${1}" || \
runuser -u www-data -- php occ mail:account:list 2>/dev/null | grep -Fq "${1} "
}
token=$(
curl -s -d "grant_type=password" \
-d "client_id=admin-cli" \
-d "username=${KC_ADMIN_USER}" \
-d "password=${KC_ADMIN_PASS}" \
"${KC_BASE}/realms/master/protocol/openid-connect/token" | jq -r '.access_token'
)
if [[ -z "${token}" || "${token}" == "null" ]]; then
echo "Failed to obtain admin token"
exit 1
fi
users=$(curl -s -H "Authorization: Bearer ${token}" \
"${KC_BASE}/admin/realms/${KC_REALM}/users?max=2000")
echo "${users}" | jq -c '.[]' | while read -r user; do
username=$(echo "${user}" | jq -r '.username')
email=$(echo "${user}" | jq -r '.email // empty')
app_pw=$(echo "${user}" | jq -r '.attributes.mailu_app_password[0] // empty')
[[ -z "${email}" || -z "${app_pw}" ]] && continue
if account_exists "${email}"; then
echo "Skipping ${email}, already exists"
continue
fi
echo "Syncing ${email}"
runuser -u www-data -- php occ mail:account:create \
"${username}" "${username}" "${email}" \
mail.bstein.dev 993 ssl "${email}" "${app_pw}" \
mail.bstein.dev 587 tls "${email}" "${app_pw}" login || true
done

View File

@ -1,65 +0,0 @@
#!/bin/bash
set -euo pipefail
NC_URL="${NC_URL:-https://cloud.bstein.dev}"
ADMIN_USER="${ADMIN_USER:?}"
ADMIN_PASS="${ADMIN_PASS:?}"
export DEBIAN_FRONTEND=noninteractive
apt-get update -qq
apt-get install -y -qq curl jq >/dev/null
run_occ() {
runuser -u www-data -- php occ "$@"
}
log() { echo "[$(date -Is)] $*"; }
log "Applying Atlas theming"
run_occ theming:config name "Atlas Cloud"
run_occ theming:config slogan "Unified access to Atlas services"
run_occ theming:config url "https://cloud.bstein.dev"
run_occ theming:config color "#0f172a"
run_occ theming:config disable-user-theming yes
log "Setting default quota to 200 GB"
run_occ config:app:set files default_quota --value "200 GB"
API_BASE="${NC_URL}/ocs/v2.php/apps/external/api/v1"
AUTH=(-u "${ADMIN_USER}:${ADMIN_PASS}" -H "OCS-APIRequest: true")
log "Removing existing external links"
existing=$(curl -sf "${AUTH[@]}" "${API_BASE}?format=json" | jq -r '.ocs.data[].id // empty')
for id in ${existing}; do
curl -sf "${AUTH[@]}" -X DELETE "${API_BASE}/sites/${id}?format=json" >/dev/null || true
done
SITES=(
"Vaultwarden|https://vault.bstein.dev"
"Jellyfin|https://stream.bstein.dev"
"Gitea|https://scm.bstein.dev"
"Jenkins|https://ci.bstein.dev"
"Harbor|https://registry.bstein.dev"
"Vault|https://secret.bstein.dev"
"Jitsi|https://meet.bstein.dev"
"Grafana|https://metrics.bstein.dev"
"Chat LLM|https://chat.ai.bstein.dev"
"Vision|https://draw.ai.bstein.dev"
"STT/TTS|https://talk.ai.bstein.dev"
)
log "Seeding external links"
for entry in "${SITES[@]}"; do
IFS="|" read -r name url <<<"${entry}"
curl -sf "${AUTH[@]}" -X POST "${API_BASE}/sites?format=json" \
-d "name=${name}" \
-d "url=${url}" \
-d "lang=" \
-d "type=link" \
-d "device=" \
-d "icon=" \
-d "groups[]=" \
-d "redirect=1" >/dev/null
done
log "Maintenance run completed"

View File

@ -0,0 +1,509 @@
#!/usr/bin/env python3
"""Clean up Atlas test users and portal requests (manual-only).
Default behavior is DRY RUN. This script is intended for operators to clean up
test accounts created via the bstein-dev-home onboarding portal.
Targets (best-effort):
- Keycloak users in realm "atlas"
- Atlas portal Postgres rows (access_requests + dependent tables)
- Vaultwarden users/invites created by the portal
Safety:
- Requires an explicit username prefix (e.g. "test-")
- Dry-run unless --apply is set
- --apply requires an explicit --confirm guard
- Validates prefixes to a conservative charset
"""
from __future__ import annotations

import argparse
import base64
import json
import os
import re
import subprocess
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from dataclasses import dataclass
from typing import Any, Iterable
_SAFE_PREFIX_RE = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9._-]{0,63}$")
@dataclass(frozen=True)
class KeycloakUser:
    """A Keycloak user record (id, username, email) from the admin users API."""
    user_id: str
    username: str
    email: str
@dataclass(frozen=True)
class PortalRequestRow:
    """One row from the portal's access_requests table."""
    request_code: str
    username: str
    status: str
@dataclass(frozen=True)
class VaultwardenUser:
    """A Vaultwarden user from /admin/users; status is the raw _status int (-1 if absent)."""
    user_id: str
    email: str
    status: int
def _run(cmd: list[str], *, input_bytes: bytes | None = None) -> str:
proc = subprocess.run(
cmd,
input=input_bytes,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=False,
)
if proc.returncode != 0:
stderr = proc.stderr.decode("utf-8", errors="replace").strip()
raise RuntimeError(f"command failed ({proc.returncode}): {' '.join(cmd)}\n{stderr}")
return proc.stdout.decode("utf-8", errors="replace")
def _kubectl_get_secret_value(namespace: str, name: str, key: str) -> str:
    """Fetch one key of a Kubernetes Secret via kubectl and base64-decode it."""
    cmd = [
        "kubectl",
        "-n",
        namespace,
        "get",
        "secret",
        name,
        "-o",
        f"jsonpath={{.data.{key}}}",
    ]
    encoded = _run(cmd).strip()
    if not encoded:
        raise RuntimeError(f"secret {namespace}/{name} key {key} is empty")
    return base64.b64decode(encoded).decode("utf-8").strip()
def _kubectl_first_pod(namespace: str) -> str:
    """Return the name of the first pod listed in ``namespace``."""
    listing = json.loads(_run(["kubectl", "-n", namespace, "get", "pods", "-o", "json"]))
    pods = listing.get("items") or []
    if not isinstance(pods, list) or not pods:
        raise RuntimeError(f"no pods found in namespace {namespace}")
    name = pods[0].get("metadata", {}).get("name")
    if not isinstance(name, str) or not name:
        raise RuntimeError(f"unexpected pod list in namespace {namespace}")
    return name
def _validate_prefixes(prefixes: list[str]) -> list[str]:
    """Strip, drop empties, and validate each prefix against _SAFE_PREFIX_RE.

    Exits via SystemExit on any invalid prefix or when none remain.
    """
    kept: list[str] = []
    for raw in prefixes:
        candidate = raw.strip()
        if not candidate:
            continue
        if _SAFE_PREFIX_RE.match(candidate) is None:
            raise SystemExit(
                f"invalid prefix '{candidate}': must match {_SAFE_PREFIX_RE.pattern} (alnum plus ._-)"
            )
        kept.append(candidate)
    if not kept:
        raise SystemExit("at least one --prefix is required")
    return kept
def _starts_with_any(value: str, prefixes: Iterable[str]) -> bool:
return any(value.startswith(p) for p in prefixes)
def _keycloak_token(server: str, realm: str, client_id: str, client_secret: str) -> str:
    """Obtain a client-credentials access token from Keycloak's token endpoint."""
    form = {
        "grant_type": "client_credentials",
        "client_id": client_id,
        "client_secret": client_secret,
    }
    request = urllib.request.Request(
        f"{server}/realms/{realm}/protocol/openid-connect/token",
        data=urllib.parse.urlencode(form).encode("utf-8"),
        method="POST",
    )
    request.add_header("Content-Type", "application/x-www-form-urlencoded")
    with urllib.request.urlopen(request, timeout=15) as resp:
        body = json.loads(resp.read().decode("utf-8"))
    token = body.get("access_token")
    if isinstance(token, str) and token:
        return token
    raise RuntimeError("failed to obtain keycloak access token")
def _keycloak_list_users(server: str, realm: str, token: str, search: str) -> list[KeycloakUser]:
    """List up to 1000 Keycloak users in ``realm`` matching ``search``.

    Entries without a non-empty string id, or with a non-string username,
    are skipped; a missing email becomes "".
    """
    query = urllib.parse.urlencode({"max": "1000", "search": search})
    req = urllib.request.Request(f"{server}/admin/realms/{realm}/users?{query}", method="GET")
    req.add_header("Authorization", f"Bearer {token}")
    with urllib.request.urlopen(req, timeout=30) as resp:
        payload = json.loads(resp.read().decode("utf-8"))
    if not isinstance(payload, list):
        raise RuntimeError("unexpected keycloak users response")
    users: list[KeycloakUser] = []
    for item in payload:
        if not isinstance(item, dict):
            continue
        user_id = item.get("id")
        username = item.get("username") or ""
        email = item.get("email") or ""
        if not isinstance(user_id, str) or not user_id:
            continue
        if not isinstance(username, str):
            continue
        users.append(KeycloakUser(user_id=user_id, username=username, email=str(email)))
    return users
def _keycloak_delete_user(server: str, realm: str, token: str, user_id: str) -> None:
    """Delete a Keycloak user by id; a 404 (already gone) is treated as success.

    NOTE(review): relies on urllib.error being importable as an attribute of
    urllib — works because urllib.request imports it internally, but an
    explicit `import urllib.error` at module level would be safer.
    """
    req = urllib.request.Request(f"{server}/admin/realms/{realm}/users/{user_id}", method="DELETE")
    req.add_header("Authorization", f"Bearer {token}")
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            _ = resp.read()
    except urllib.error.HTTPError as exc:
        if exc.code == 404:
            return
        raise
def _psql_json(portal_db_url: str, sql: str) -> list[dict[str, Any]]:
    """Run ``sql`` through psql (tuples-only, tab-separated) in the postgres pod.

    Returns one {"cols": [str, ...]} dict per output line — despite the name,
    this does not use psql's JSON output; callers index into "cols".
    """
    postgres_pod = _kubectl_first_pod("postgres")
    out = _run(
        [
            "kubectl",
            "-n",
            "postgres",
            "exec",
            "-i",
            postgres_pod,
            "--",
            "psql",
            portal_db_url,
            # -At: unaligned, tuples-only; -F "\t": tab field separator.
            "-At",
            "-F",
            "\t",
            "-c",
            sql,
        ]
    )
    rows: list[dict[str, Any]] = []
    for line in out.splitlines():
        parts = line.split("\t")
        rows.append({"cols": parts})
    return rows
def _portal_list_requests(portal_db_url: str, prefixes: list[str]) -> list[PortalRequestRow]:
    """List access_requests rows whose username starts with any of ``prefixes``.

    ``prefixes`` must already have passed _validate_prefixes (conservative
    charset), which keeps the interpolated LIKE clauses safe. Rows with fewer
    than 3 columns are skipped.
    """
    clauses = " OR ".join([f"username LIKE '{p}%'" for p in prefixes])
    sql = (
        "SELECT request_code, username, status "
        "FROM access_requests "
        f"WHERE {clauses} "
        "ORDER BY created_at DESC;"
    )
    raw_rows = _psql_json(portal_db_url, sql)
    parsed: list[PortalRequestRow] = []
    for row in raw_rows:
        cols = row.get("cols") or []
        if len(cols) < 3:
            continue
        parsed.append(PortalRequestRow(request_code=cols[0], username=cols[1], status=cols[2]))
    return parsed
def _portal_delete_requests(portal_db_url: str, prefixes: list[str]) -> int:
    """Delete access_requests rows whose username starts with any of ``prefixes``.

    ``prefixes`` must already have passed _validate_prefixes (conservative
    charset), which keeps the interpolated LIKE clauses safe.

    Returns the number of rows psql reports deleted (0 if the count cannot
    be parsed from psql's output).
    """
    clauses = " OR ".join([f"username LIKE '{p}%'" for p in prefixes])
    sql = f"DELETE FROM access_requests WHERE {clauses};"
    postgres_pod = _kubectl_first_pod("postgres")
    out = _run(
        [
            "kubectl",
            "-n",
            "postgres",
            "exec",
            "-i",
            postgres_pod,
            "--",
            "psql",
            portal_db_url,
            "-c",
            sql,
        ]
    )
    # psql prints "DELETE <n>". The previous pattern used "\\s"/"\\d" inside a
    # raw string, which matched literal backslashes and therefore never matched,
    # so this function always reported 0 deletions.
    match = re.search(r"DELETE\s+(\d+)", out)
    return int(match.group(1)) if match else 0
def _vaultwarden_admin_cookie(admin_token: str, base_url: str) -> str:
    """Log in to the Vaultwarden admin page and return the session cookie.

    Posts the admin token to /admin and returns the first name=value pair
    from Set-Cookie. HTTP 429 is surfaced as a RuntimeError; other HTTP
    errors propagate.
    """
    data = urllib.parse.urlencode({"token": admin_token}).encode("utf-8")
    req = urllib.request.Request(f"{base_url}/admin", data=data, method="POST")
    req.add_header("Content-Type", "application/x-www-form-urlencoded")
    try:
        with urllib.request.urlopen(req, timeout=10) as resp:
            set_cookie = resp.headers.get("Set-Cookie") or ""
    except urllib.error.HTTPError as exc:
        if exc.code == 429:
            raise RuntimeError("vaultwarden admin rate limited (HTTP 429)") from exc
        raise
    # Keep only "name=value", dropping cookie attributes (Path, HttpOnly, ...).
    cookie = set_cookie.split(";", 1)[0].strip()
    if not cookie:
        raise RuntimeError("vaultwarden admin cookie missing")
    return cookie
def _vaultwarden_list_users(base_url: str, cookie: str) -> list[VaultwardenUser]:
    """List Vaultwarden users via the admin API using a session cookie.

    Entries missing a non-empty string id or email are skipped; a non-int
    "_status" becomes -1. HTTP 429 is surfaced as a RuntimeError.
    """
    req = urllib.request.Request(f"{base_url}/admin/users", method="GET")
    req.add_header("Cookie", cookie)
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            payload = json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        if exc.code == 429:
            raise RuntimeError("vaultwarden admin rate limited (HTTP 429)") from exc
        raise
    if not isinstance(payload, list):
        raise RuntimeError("unexpected vaultwarden /admin/users response")
    users: list[VaultwardenUser] = []
    for item in payload:
        if not isinstance(item, dict):
            continue
        user_id = item.get("id")
        email = item.get("email")
        status = item.get("_status")
        if not isinstance(user_id, str) or not user_id:
            continue
        if not isinstance(email, str) or not email:
            continue
        if not isinstance(status, int):
            status = -1
        users.append(VaultwardenUser(user_id=user_id, email=email, status=status))
    return users
def _vaultwarden_delete_user(base_url: str, cookie: str, user_id: str) -> None:
    """Delete a Vaultwarden user via the admin API; 404 (already gone) is success."""
    request = urllib.request.Request(f"{base_url}/admin/users/{user_id}", method="DELETE")
    request.add_header("Cookie", cookie)
    try:
        with urllib.request.urlopen(request, timeout=30) as resp:
            resp.read()
    except urllib.error.HTTPError as exc:
        if exc.code == 404:
            return
        if exc.code == 429:
            raise RuntimeError("vaultwarden admin rate limited (HTTP 429)") from exc
        raise
def _port_forward(namespace: str, target: str, local_port: int, remote_port: int) -> subprocess.Popen[bytes]:
    """Start a background `kubectl port-forward` bound to 127.0.0.1.

    Returns the Popen handle; the caller is responsible for terminating it.
    """
    # Keep stdout/stderr muted to avoid leaking internal details in output.
    return subprocess.Popen(
        [
            "kubectl",
            "-n",
            namespace,
            "port-forward",
            target,
            f"{local_port}:{remote_port}",
            "--address",
            "127.0.0.1",
        ],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
def main() -> int:
    """CLI entry point: dry-run/apply cleanup across portal DB, Keycloak, Vaultwarden.

    Returns 0 on success, 1 if the Vaultwarden phase fails. Deletions only
    happen with --apply plus an exact --confirm guard.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--prefix",
        action="append",
        default=[],
        help="Username prefix to match (repeatable). Example: --prefix test-",
    )
    parser.add_argument(
        "--apply",
        action="store_true",
        help="Actually delete; otherwise dry-run only.",
    )
    parser.add_argument(
        "--confirm",
        default="",
        help=(
            "Required when using --apply. Must exactly equal the comma-separated "
            "sorted prefix list (e.g. 'atlas-,bob-,e2e-,test-')."
        ),
    )
    parser.add_argument("--skip-keycloak", action="store_true", help="Skip Keycloak user deletion.")
    parser.add_argument("--skip-portal-db", action="store_true", help="Skip portal DB cleanup.")
    parser.add_argument("--skip-vaultwarden", action="store_true", help="Skip Vaultwarden cleanup.")
    parser.add_argument(
        "--protect-keycloak-username",
        action="append",
        default=[],
        help="Keycloak usernames that must never be deleted (repeatable).",
    )
    parser.add_argument(
        "--protect-vaultwarden-email",
        action="append",
        default=[],
        help="Vaultwarden emails that must never be deleted (repeatable).",
    )
    args = parser.parse_args()
    prefixes = sorted(set(_validate_prefixes(args.prefix)))
    apply = bool(args.apply)
    # The confirmation string is the canonical (sorted, comma-joined) prefix list.
    expected_confirm = ",".join(prefixes)
    # Built-in protected accounts are always included, regardless of flags.
    protected_keycloak = {"bstein", "robotuser", *[u.strip() for u in args.protect_keycloak_username if u.strip()]}
    protected_vaultwarden = {e.strip() for e in args.protect_vaultwarden_email if e.strip()}
    if apply and args.confirm != expected_confirm:
        raise SystemExit(
            f"refusing to apply without --confirm '{expected_confirm}' (got '{args.confirm}')"
        )
    print("Atlas test-user cleanup")
    print("prefixes:", expected_confirm)
    print("mode:", "APPLY (destructive)" if apply else "DRY RUN (no changes)")
    if protected_keycloak:
        print("protected keycloak usernames:", ", ".join(sorted(protected_keycloak)))
    if protected_vaultwarden:
        print("protected vaultwarden emails:", ", ".join(sorted(protected_vaultwarden)))
    print()
    # --- Phase 1: portal database (access_requests; FK cascade removes children) ---
    if not args.skip_portal_db:
        portal_db_url = _kubectl_get_secret_value("bstein-dev-home", "atlas-portal-db", "PORTAL_DATABASE_URL")
        requests = _portal_list_requests(portal_db_url, prefixes)
        print(f"Portal DB: {len(requests)} access_requests matched")
        for row in requests[:50]:
            print(f" {row.request_code}\t{row.status}\t{row.username}")
        if len(requests) > 50:
            print(f" ... and {len(requests) - 50} more")
        if apply and requests:
            deleted = _portal_delete_requests(portal_db_url, prefixes)
            print(f"Portal DB: deleted {deleted} access_requests (cascade removes tasks/steps/artifacts).")
    print()
    # --- Phase 2: Keycloak (admin client credentials come from a k8s secret) ---
    if not args.skip_keycloak:
        kc_server = os.getenv("KEYCLOAK_PUBLIC_URL", "https://sso.bstein.dev").rstrip("/")
        kc_realm = os.getenv("KEYCLOAK_REALM", "atlas")
        kc_client_id = os.getenv("KEYCLOAK_ADMIN_CLIENT_ID", "bstein-dev-home-admin")
        kc_client_secret = _kubectl_get_secret_value(
            "bstein-dev-home", "bstein-dev-home-keycloak-admin", "client_secret"
        )
        token = _keycloak_token(kc_server, kc_realm, kc_client_id, kc_client_secret)
        # Keycloak search can return fuzzy matches; re-check the prefix strictly
        # and de-duplicate across prefixes by user id.
        found: dict[str, KeycloakUser] = {}
        for prefix in prefixes:
            for user in _keycloak_list_users(kc_server, kc_realm, token, prefix):
                if not _starts_with_any(user.username, prefixes):
                    continue
                if user.username in protected_keycloak:
                    continue
                found[user.user_id] = user
        users = list(found.values())
        users.sort(key=lambda u: u.username)
        print(f"Keycloak: {len(users)} users matched")
        for user in users[:50]:
            email = user.email or "-"
            print(f" {user.username}\t{email}\t{user.user_id}")
        if len(users) > 50:
            print(f" ... and {len(users) - 50} more")
        if apply and users:
            for user in users:
                _keycloak_delete_user(kc_server, kc_realm, token, user.user_id)
            print(f"Keycloak: deleted {len(users)} users.")
    print()
    # --- Phase 3: Vaultwarden (admin API reached through a local port-forward) ---
    if not args.skip_vaultwarden:
        pf = _port_forward("vaultwarden", "svc/vaultwarden-service", 18081, 80)
        try:
            # wait briefly for the port-forward to come up
            # NOTE(review): an HTTPError from '/' (e.g. 404) also lands in this
            # except and keeps retrying even though the tunnel is up — confirm
            # the service answers 2xx on '/' or treat HTTPError as "ready".
            for _ in range(30):
                try:
                    urllib.request.urlopen("http://127.0.0.1:18081/", timeout=1).read(1)
                    break
                except Exception:
                    time.sleep(0.2)
            admin_token = _kubectl_get_secret_value("vaultwarden", "vaultwarden-admin", "ADMIN_TOKEN")
            base_url = "http://127.0.0.1:18081"
            try:
                # Both login and listing are retried with exponential backoff
                # (capped at 60s) because the admin endpoint rate limits.
                cookie = ""
                for attempt in range(7):
                    try:
                        cookie = _vaultwarden_admin_cookie(admin_token, base_url)
                        break
                    except RuntimeError as exc:
                        if "rate limited" in str(exc).lower():
                            time.sleep(min(60.0, 2.0**attempt))
                            continue
                        raise
                if not cookie:
                    raise RuntimeError("vaultwarden admin login repeatedly rate limited")
                users: list[VaultwardenUser] = []
                for attempt in range(7):
                    try:
                        users = _vaultwarden_list_users(base_url, cookie)
                        break
                    except RuntimeError as exc:
                        if "rate limited" in str(exc).lower():
                            time.sleep(min(60.0, 2.0**attempt))
                            continue
                        raise
                if not users:
                    raise RuntimeError("vaultwarden user list unavailable (possibly rate limited)")
            except RuntimeError as exc:
                print(f"Vaultwarden: ERROR: {exc}")
                print()
                return 1
            # Prefix matching is done on the email local-part (before '@').
            matched: list[VaultwardenUser] = []
            for user in users:
                local = user.email.split("@", 1)[0]
                if _starts_with_any(local, prefixes):
                    if user.email in protected_vaultwarden:
                        continue
                    matched.append(user)
            matched.sort(key=lambda u: u.email)
            print(f"Vaultwarden: {len(matched)} users matched")
            for user in matched[:50]:
                print(f" {user.email}\tstatus={user.status}\t{user.user_id}")
            if len(matched) > 50:
                print(f" ... and {len(matched) - 50} more")
            if apply and matched:
                for user in matched:
                    _vaultwarden_delete_user(base_url, cookie, user.user_id)
                print(f"Vaultwarden: deleted {len(matched)} users.")
            print()
        finally:
            # Always tear down the port-forward, even on error paths.
            pf.terminate()
            try:
                pf.wait(timeout=3)
            except Exception:
                pf.kill()
    return 0
if __name__ == "__main__":
    raise SystemExit(main())

276
scripts/test_user_cleanup.py Executable file
View File

@ -0,0 +1,276 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import sys
from collections import defaultdict
from dataclasses import dataclass
from typing import Any, Iterable
from urllib.parse import quote
import httpx
from atlas_portal import db, settings
from atlas_portal.keycloak import admin_client
@dataclass(frozen=True)
class KeycloakUser:
    """Minimal view of a Keycloak user as returned by the admin REST API."""

    id: str  # Keycloak user UUID
    username: str  # login name; prefix matching happens on this field
@dataclass(frozen=True)
class PortalRequest:
    """One row from the portal's access_requests table."""

    request_code: str  # public identifier of the request
    username: str  # requesting username; prefix matching happens here
    status: str  # workflow status string as stored in the DB
def _dedupe_by_id(users: Iterable[KeycloakUser]) -> list[KeycloakUser]:
seen: set[str] = set()
out: list[KeycloakUser] = []
for user in users:
if user.id in seen:
continue
seen.add(user.id)
out.append(user)
return out
def _iter_keycloak_users_for_prefix(prefix: str, max_results: int) -> list[KeycloakUser]:
    """List Keycloak users whose username starts with `prefix`.

    Uses the admin REST search endpoint (capped at `max_results`) and then
    re-filters strictly client-side. Service accounts are always excluded.
    Raises RuntimeError when the admin client is not configured.
    """
    client = admin_client()
    if not client.ready():
        raise RuntimeError("keycloak admin client not configured in this environment")
    url = f"{settings.KEYCLOAK_ADMIN_URL}/admin/realms/{settings.KEYCLOAK_REALM}/users"
    # Keycloak can return false positives for search; we do a strict prefix match client-side.
    params = {"search": prefix, "max": str(max_results), "briefRepresentation": "true"}
    with httpx.Client(timeout=settings.HTTP_CHECK_TIMEOUT_SEC) as http:
        resp = http.get(url, params=params, headers=client.headers())
        resp.raise_for_status()
        payload = resp.json()
    if not isinstance(payload, list):
        return []
    found: list[KeycloakUser] = []
    for item in payload:
        # Skip malformed entries instead of failing the whole listing.
        if not isinstance(item, dict):
            continue
        username = item.get("username")
        user_id = item.get("id")
        if not isinstance(username, str) or not isinstance(user_id, str):
            continue
        if not username.startswith(prefix):
            continue
        if username.startswith("service-account-"):
            continue
        found.append(KeycloakUser(id=user_id, username=username))
    return found
def _find_keycloak_users(prefixes: list[str], max_results: int, protected: set[str]) -> list[KeycloakUser]:
    """Collect users for every prefix, de-duplicate by id, drop protected names."""
    collected: list[KeycloakUser] = []
    for prefix in prefixes:
        collected += _iter_keycloak_users_for_prefix(prefix, max_results=max_results)
    return [
        user
        for user in _dedupe_by_id(collected)
        if user.username not in protected
    ]
def _delete_keycloak_users(users: list[KeycloakUser]) -> None:
    """Delete the given Keycloak users via the admin REST API.

    No-op on an empty list. A 404 per user is treated as success; any other
    HTTP error aborts via raise_for_status. Raises RuntimeError when the
    admin client is not configured.
    """
    if not users:
        return
    client = admin_client()
    if not client.ready():
        raise RuntimeError("keycloak admin client not configured in this environment")
    base = f"{settings.KEYCLOAK_ADMIN_URL}/admin/realms/{settings.KEYCLOAK_REALM}/users"
    with httpx.Client(timeout=settings.HTTP_CHECK_TIMEOUT_SEC) as http:
        for user in users:
            # quote() guards against ids containing URL-special characters.
            url = f"{base}/{quote(user.id, safe='')}"
            resp = http.delete(url, headers=client.headers())
            # Deleting a non-existent user is treated as success for idempotency.
            if resp.status_code == 404:
                continue
            resp.raise_for_status()
def _find_portal_requests(prefixes: list[str], max_results: int) -> list[PortalRequest]:
    """Query access_requests rows whose username matches any prefix.

    Returns an empty list when the DB is not configured. Results are capped
    at `max_results` per prefix, newest first.
    """
    if not db.configured():
        return []
    # Prefixes become LIKE patterns; values are passed as bind parameters.
    like_prefixes = [f"{prefix}%" for prefix in prefixes]
    rows: list[dict[str, Any]] = []
    with db.connect() as conn:
        for like in like_prefixes:
            cursor = conn.execute(
                """
                SELECT request_code, username, status
                FROM access_requests
                WHERE username LIKE %s
                ORDER BY created_at DESC
                LIMIT %s
                """,
                (like, max_results),
            )
            batch = cursor.fetchall()
            # assumes a dict-row cursor factory; tuple rows are skipped — TODO confirm
            if isinstance(batch, list):
                rows.extend([r for r in batch if isinstance(r, dict)])
    out: list[PortalRequest] = []
    for row in rows:
        request_code = row.get("request_code")
        username = row.get("username")
        status = row.get("status")
        if not isinstance(request_code, str) or not isinstance(username, str) or not isinstance(status, str):
            continue
        out.append(PortalRequest(request_code=request_code, username=username, status=status))
    return out
def _delete_portal_requests(prefixes: list[str]) -> int:
    """Delete access_requests rows matching any prefix; return rows deleted.

    Returns 0 when the DB is not configured. LIKE values are bound
    parameters, so user input never reaches the SQL text.
    """
    if not db.configured():
        return 0
    like_prefixes = [f"{prefix}%" for prefix in prefixes]
    deleted = 0
    with db.connect() as conn:
        for like in like_prefixes:
            cursor = conn.execute("DELETE FROM access_requests WHERE username LIKE %s", (like,))
            # rowcount may be None on some drivers; count that as zero.
            deleted += cursor.rowcount or 0
    return deleted
def _summarize_portal_requests(rows: list[PortalRequest]) -> dict[str, int]:
counts: dict[str, int] = defaultdict(int)
for row in rows:
counts[row.status] += 1
return dict(counts)
def _parse_args(argv: list[str]) -> argparse.Namespace:
parser = argparse.ArgumentParser(
prog="test_user_cleanup",
description=(
"Manual-only cleanup for test users/requests. "
"This script is intended to be run inside the bstein-dev-home backend container."
),
)
parser.add_argument(
"--prefix",
action="append",
required=True,
help="Username prefix to target (repeatable). Example: --prefix test-",
)
parser.add_argument(
"--max",
type=int,
default=500,
help="Maximum users/requests to enumerate per prefix (default: 500).",
)
parser.add_argument(
"--apply",
action="store_true",
help="Apply deletions (default is dry-run). Requires --confirm.",
)
parser.add_argument(
"--confirm",
default="",
help="Required when using --apply. Must exactly equal the comma-separated prefix list.",
)
parser.add_argument(
"--skip-keycloak",
action="store_true",
help="Skip deleting Keycloak users.",
)
parser.add_argument(
"--skip-portal",
action="store_true",
help="Skip deleting portal (DB) access requests.",
)
parser.add_argument(
"--protect",
action="append",
default=[],
help="Extra usernames to never delete (repeatable).",
)
parser.add_argument(
"--verbose",
action="store_true",
help="List matched usernames/request codes.",
)
return parser.parse_args(argv)
def main(argv: list[str]) -> int:
    """CLI entry point: dry-run/apply cleanup of Keycloak users and portal requests.

    Returns 0 on success, 2 on invalid arguments or a failed --confirm guard.
    """
    args = _parse_args(argv)
    prefixes = sorted({p.strip() for p in args.prefix if p.strip()})
    if not prefixes:
        print("error: no valid --prefix values provided", file=sys.stderr)
        return 2
    # The confirmation string is the canonical (sorted, comma-joined) prefix list.
    expected_confirm = ",".join(prefixes)
    # Built-in protected accounts are always included, regardless of flags.
    protected = {"bstein", "robotuser", *[p.strip() for p in args.protect if p.strip()]}
    if args.apply and args.confirm != expected_confirm:
        print(
            f"error: refusing to apply without --confirm '{expected_confirm}' (got '{args.confirm}')",
            file=sys.stderr,
        )
        return 2
    # Enumerate first so a dry run and an apply print the same report.
    keycloak_users: list[KeycloakUser] = []
    portal_requests: list[PortalRequest] = []
    if not args.skip_keycloak:
        keycloak_users = _find_keycloak_users(prefixes, max_results=args.max, protected=protected)
    if not args.skip_portal:
        portal_requests = _find_portal_requests(prefixes, max_results=args.max)
    print(f"prefixes: {expected_confirm}")
    print(f"mode: {'APPLY' if args.apply else 'DRY-RUN'}")
    if protected:
        print(f"protected usernames: {', '.join(sorted(protected))}")
    if not args.skip_keycloak:
        print(f"keycloak users matched: {len(keycloak_users)}")
        if args.verbose and keycloak_users:
            for user in sorted(keycloak_users, key=lambda u: u.username):
                print(f" - {user.username}")
    if not args.skip_portal:
        print(f"portal requests matched: {len(portal_requests)}")
        if portal_requests:
            summary = _summarize_portal_requests(portal_requests)
            summary_str = ", ".join(f"{k}={v}" for k, v in sorted(summary.items()))
            print(f" statuses: {summary_str}")
        if args.verbose and portal_requests:
            # Verbose listing is capped at 50 rows to keep output readable.
            for req in portal_requests[: min(50, len(portal_requests))]:
                print(f" - {req.request_code} ({req.status})")
            if len(portal_requests) > 50:
                print(f" ... and {len(portal_requests) - 50} more")
    if not args.apply:
        print("dry-run complete (no changes made)")
        return 0
    # Destructive phase: portal rows first, then Keycloak accounts.
    if not args.skip_portal:
        deleted = _delete_portal_requests(prefixes)
        print(f"deleted portal requests: {deleted}")
    if not args.skip_keycloak:
        _delete_keycloak_users(keycloak_users)
        print(f"deleted keycloak users: {len(keycloak_users)}")
    print("done")
    return 0
if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))

18
scripts/test_user_cleanup.sh Executable file
View File

@ -0,0 +1,18 @@
#!/usr/bin/env bash
set -euo pipefail
# Manual-only helper to run `scripts/test_user_cleanup.py` inside the portal backend container.
#
# Usage (dry-run):
# scripts/test_user_cleanup.sh --prefix test-
#
# Usage (apply):
# scripts/test_user_cleanup.sh --prefix test- --apply --confirm test-
NS="${PORTAL_NAMESPACE:-bstein-dev-home}"
TARGET="${PORTAL_BACKEND_EXEC_TARGET:-deploy/bstein-dev-home-backend}"
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
# Feed the script to the in-container interpreter on stdin; flags pass through.
kubectl -n "${NS}" exec -i "${TARGET}" -- python - "$@" < "${SCRIPT_DIR}/test_user_cleanup.py"

View File

@ -0,0 +1,318 @@
#!/usr/bin/env python3
"""Clean up Vaultwarden test users and invites (manual-only).
This script deletes Vaultwarden rows directly from the Postgres database. It is
intended only for removing test fallout (e.g. e2e-*, test-*) and is deliberately
conservative:
- Requires one or more explicit email prefixes (repeatable).
- Dry-run by default; --apply requires an exact --confirm guard.
- Refuses to delete any user with dependent data in Vaultwarden tables.
- Supports a protected email allowlist to prevent catastrophic mistakes.
Example (dry-run):
scripts/test_vaultwarden_user_cleanup.py --prefix e2e-
Example (apply):
scripts/test_vaultwarden_user_cleanup.py --prefix e2e- --apply --confirm e2e-
"""
from __future__ import annotations
import argparse
import json
import re
import subprocess
import sys
from dataclasses import dataclass
from typing import Iterable, Sequence
# Prefixes must be short alphanumeric-ish tokens; this keeps them safe to
# interpolate into SQL (see _sql_or_email_prefixes).
_SAFE_PREFIX_RE = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9._-]{0,63}$")
# Loose UUID shape check for ids coming back from psql output.
_UUID_RE = re.compile(r"^[0-9a-fA-F-]{32,36}$")
@dataclass(frozen=True)
class VaultwardenUser:
    """One candidate row from the Vaultwarden users table."""

    uuid: str  # users.uuid primary key
    email: str  # account email; prefix matching happens on this field
    dependent_rows: int  # count of rows referencing this user in other tables
def _run(cmd: Sequence[str], *, input_bytes: bytes | None = None) -> str:
proc = subprocess.run(
list(cmd),
input=input_bytes,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=False,
)
if proc.returncode != 0:
stderr = proc.stderr.decode("utf-8", errors="replace").strip()
raise RuntimeError(f"command failed ({proc.returncode}): {' '.join(cmd)}\n{stderr}")
return proc.stdout.decode("utf-8", errors="replace")
def _kubectl_first_pod(namespace: str) -> str:
    """Return the name of the first pod listed in the given namespace.

    Raises RuntimeError when the namespace has no pods or the listing is
    malformed.
    """
    listing = json.loads(_run(["kubectl", "-n", namespace, "get", "pods", "-o", "json"]))
    pods = listing.get("items") or []
    if not isinstance(pods, list) or not pods:
        raise RuntimeError(f"no pods found in namespace {namespace}")
    pod_name = pods[0].get("metadata", {}).get("name")
    if isinstance(pod_name, str) and pod_name:
        return pod_name
    raise RuntimeError(f"unexpected pod list in namespace {namespace}")
def _psql(sql: str) -> str:
    """Run `sql` against the vaultwarden DB via psql in the first postgres pod.

    Output is unaligned tuples-only (-At) with tab field separators, which
    _parse_rows understands. Returns raw psql stdout.
    """
    pod = _kubectl_first_pod("postgres")
    return _run(
        [
            "kubectl",
            "-n",
            "postgres",
            "exec",
            "-i",
            pod,
            "--",
            "psql",
            "-U",
            "postgres",
            "-d",
            "vaultwarden",
            "-At",
            "-F",
            "\t",
            "-c",
            sql,
        ]
    )
def _validate_prefixes(prefixes: Iterable[str]) -> list[str]:
    """Normalise, validate, and sort the user-supplied prefixes.

    Blank entries are dropped; any remaining prefix must match the safe
    charset and end with '-'. Raises SystemExit on violations or when no
    prefix survives. Returns a sorted, de-duplicated list.
    """
    accepted: set[str] = set()
    for raw in prefixes:
        candidate = raw.strip()
        if not candidate:
            continue
        if not _SAFE_PREFIX_RE.match(candidate):
            raise SystemExit(
                f"invalid prefix '{candidate}': must match {_SAFE_PREFIX_RE.pattern} (alnum plus ._-)"
            )
        if not candidate.endswith("-"):
            raise SystemExit(f"refusing prefix '{candidate}': must end with '-' for safety")
        accepted.add(candidate)
    if not accepted:
        raise SystemExit("at least one --prefix is required")
    return sorted(accepted)
def _parse_rows(tsv: str) -> list[list[str]]:
rows: list[list[str]] = []
for line in tsv.splitlines():
line = line.strip()
if not line:
continue
rows.append(line.split("\t"))
return rows
def _sql_or_email_prefixes(prefixes: list[str]) -> str:
# prefixes validated to safe charset; safe to interpolate.
clauses = [f"email LIKE '{p}%'" for p in prefixes]
return " OR ".join(clauses) if clauses else "FALSE"
def _sql_quote(value: str) -> str:
return "'" + value.replace("'", "''") + "'"
def _sql_text_array(values: Iterable[str]) -> str:
    """Render values as a Postgres text[] literal (each element SQL-quoted)."""
    joined = ",".join(_sql_quote(value) for value in values)
    return f"ARRAY[{joined}]::text[]"
def _list_users(prefixes: list[str], protected: set[str]) -> list[VaultwardenUser]:
    """List enabled Vaultwarden users matching any email prefix.

    Each candidate is annotated with a count of dependent rows across the
    related tables so callers can refuse to delete accounts with data.
    Protected emails are excluded in SQL. Prefixes are interpolated directly
    but have been restricted to a SQL-safe charset by _validate_prefixes.
    """
    clause = _sql_or_email_prefixes(prefixes)
    sql = f"""
    WITH candidates AS (
      SELECT uuid, email
      FROM users
      WHERE enabled
        AND ({clause})
        AND email <> ALL({_sql_text_array(sorted(protected))})
    )
    SELECT
      candidates.uuid,
      candidates.email,
      (
        (SELECT COUNT(*) FROM auth_requests WHERE user_uuid = candidates.uuid) +
        (SELECT COUNT(*) FROM ciphers WHERE user_uuid = candidates.uuid) +
        (SELECT COUNT(*) FROM devices WHERE user_uuid = candidates.uuid) +
        (SELECT COUNT(*) FROM emergency_access WHERE grantor_uuid = candidates.uuid OR grantee_uuid = candidates.uuid) +
        (SELECT COUNT(*) FROM favorites WHERE user_uuid = candidates.uuid) +
        (SELECT COUNT(*) FROM folders WHERE user_uuid = candidates.uuid) +
        (SELECT COUNT(*) FROM sends WHERE user_uuid = candidates.uuid) +
        (SELECT COUNT(*) FROM twofactor WHERE user_uuid = candidates.uuid) +
        (SELECT COUNT(*) FROM twofactor_incomplete WHERE user_uuid = candidates.uuid) +
        (SELECT COUNT(*) FROM users_collections WHERE user_uuid = candidates.uuid) +
        (SELECT COUNT(*) FROM users_organizations WHERE user_uuid = candidates.uuid)
      ) AS dependent_rows
    FROM candidates
    ORDER BY candidates.email;
    """
    out = _psql(sql)
    users: list[VaultwardenUser] = []
    for row in _parse_rows(out):
        # Defensive parsing of psql TSV output: drop malformed rows.
        if len(row) < 3:
            continue
        uuid, email, dep_raw = row[0].strip(), row[1].strip(), row[2].strip()
        if not uuid or not email:
            continue
        if not _UUID_RE.match(uuid):
            continue
        try:
            dep = int(dep_raw)
        except ValueError:
            dep = 0
        users.append(VaultwardenUser(uuid=uuid, email=email, dependent_rows=dep))
    return users
def _list_invitations(prefixes: list[str], protected: set[str]) -> list[str]:
    """List invitation emails matching any prefix, excluding protected ones.

    Prefixes are interpolated directly but restricted to a SQL-safe charset
    by _validate_prefixes; protected emails are quoted via _sql_text_array.
    """
    clause = _sql_or_email_prefixes(prefixes)
    protected_clause = ""
    if protected:
        protected_clause = f"AND email <> ALL({_sql_text_array(sorted(protected))})"
    sql = f"SELECT email FROM invitations WHERE ({clause}) {protected_clause} ORDER BY email;"
    out = _psql(sql)
    invites: list[str] = []
    for row in _parse_rows(out):
        if not row:
            continue
        email = row[0].strip()
        if email:
            invites.append(email)
    return invites
def _delete_invitations(emails: list[str]) -> int:
    """Delete the given invitation rows; return the number psql reports deleted.

    Emails are SQL-quoted via _sql_quote. The count is parsed from psql's
    "DELETE n" command tag; 0 is returned if the tag is absent.
    """
    if not emails:
        return 0
    email_list = ",".join(_sql_quote(e) for e in emails)
    sql = f"DELETE FROM invitations WHERE email IN ({email_list});"
    out = _psql(sql)
    match = re.search(r"DELETE\s+(\d+)", out)
    return int(match.group(1)) if match else 0
def _delete_users(uuids: list[str]) -> int:
    """Delete users rows by uuid; return the number psql reports deleted.

    UUIDs are SQL-quoted via _sql_quote (and shape-checked upstream by
    _UUID_RE). The count comes from psql's "DELETE n" command tag.
    """
    if not uuids:
        return 0
    uuid_list = ",".join(_sql_quote(u) for u in uuids)
    sql = f"DELETE FROM users WHERE uuid IN ({uuid_list});"
    out = _psql(sql)
    match = re.search(r"DELETE\s+(\d+)", out)
    return int(match.group(1)) if match else 0
def _parse_args(argv: list[str]) -> argparse.Namespace:
parser = argparse.ArgumentParser(
prog="test_vaultwarden_user_cleanup",
description="Manual-only cleanup for Vaultwarden test users/invites (DB-level).",
)
parser.add_argument(
"--prefix",
action="append",
required=True,
help="Email prefix to target (repeatable). Example: --prefix e2e-",
)
parser.add_argument(
"--apply",
action="store_true",
help="Apply deletions (default is dry-run). Requires --confirm.",
)
parser.add_argument(
"--confirm",
default="",
help="Required when using --apply. Must exactly equal the comma-separated prefix list.",
)
parser.add_argument(
"--protect-email",
action="append",
default=[],
help="Vaultwarden emails that must never be deleted (repeatable).",
)
parser.add_argument(
"--verbose",
action="store_true",
help="List matched emails (and invitation emails).",
)
return parser.parse_args(argv)
def main(argv: list[str]) -> int:
    """CLI entry point: dry-run/apply Vaultwarden DB cleanup of users and invites.

    Returns 0 on success, 2 on a failed --confirm guard or when any matched
    user still has dependent rows (the script refuses to delete those).
    """
    args = _parse_args(argv)
    prefixes = _validate_prefixes(args.prefix)
    # The confirmation string is the canonical (sorted, comma-joined) prefix list.
    expected_confirm = ",".join(prefixes)
    protected = {e.strip() for e in args.protect_email if e.strip()}
    # Hard-coded allowlist of real accounts that must never be touched.
    protected |= {
        "brad@bstein.dev",
        "edstein87@outlook.com",
        "indifox8@gmail.com",
        "mgs.stein@gmail.com",
        "patriot87@gmail.com",
    }
    if args.apply and args.confirm != expected_confirm:
        print(
            f"error: refusing to apply without --confirm '{expected_confirm}' (got '{args.confirm}')",
            file=sys.stderr,
        )
        return 2
    # Enumerate first so a dry run and an apply print the same report.
    users = _list_users(prefixes, protected=protected)
    invites = _list_invitations(prefixes, protected=protected)
    print(f"prefixes: {expected_confirm}")
    print(f"mode: {'APPLY' if args.apply else 'DRY-RUN'}")
    if protected:
        print(f"protected emails: {', '.join(sorted(protected))}")
    print(f"vaultwarden users matched: {len(users)}")
    print(f"vaultwarden invitations matched: {len(invites)}")
    if args.verbose:
        # Verbose listings are capped at 100 entries each.
        for user in users[: min(100, len(users))]:
            print(f" user: {user.email} (deps={user.dependent_rows})")
        if len(users) > 100:
            print(f" ... and {len(users) - 100} more users")
        for email in invites[: min(100, len(invites))]:
            print(f" invite: {email}")
        if len(invites) > 100:
            print(f" ... and {len(invites) - 100} more invitations")
    # Safety valve: never delete an account that still owns data.
    unsafe = [u for u in users if u.dependent_rows > 0]
    if unsafe:
        print("refusing to delete users with dependent data:", file=sys.stderr)
        for user in unsafe[: min(50, len(unsafe))]:
            print(f" - {user.email} deps={user.dependent_rows}", file=sys.stderr)
        if len(unsafe) > 50:
            print(f" ... and {len(unsafe) - 50} more", file=sys.stderr)
        return 2
    if not args.apply:
        print("dry-run complete (no changes made)")
        return 0
    deleted_invites = _delete_invitations(invites)
    deleted_users = _delete_users([u.uuid for u in users])
    print(f"deleted vaultwarden invitations: {deleted_invites}")
    print(f"deleted vaultwarden users: {deleted_users}")
    print("done")
    return 0
if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))

View File

@ -0,0 +1,15 @@
#!/usr/bin/env bash
set -euo pipefail
# Manual-only helper to clean Vaultwarden test users and invites from Postgres.
#
# Usage (dry-run):
# scripts/test_vaultwarden_user_cleanup.sh --prefix e2e-
#
# Usage (apply):
# scripts/test_vaultwarden_user_cleanup.sh --prefix e2e- --apply --confirm e2e-
here="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
# All flags are forwarded verbatim to the Python implementation.
python3 "${here}/test_vaultwarden_user_cleanup.py" "$@"

View File

@ -20,7 +20,13 @@ def load_sync_module(monkeypatch):
} }
for k, v in env.items(): for k, v in env.items():
monkeypatch.setenv(k, v) monkeypatch.setenv(k, v)
module_path = pathlib.Path(__file__).resolve().parents[1] / "mailu_sync.py" module_path = (
pathlib.Path(__file__).resolve().parents[2]
/ "services"
/ "mailu"
/ "scripts"
/ "mailu_sync.py"
)
spec = importlib.util.spec_from_file_location("mailu_sync_testmod", module_path) spec = importlib.util.spec_from_file_location("mailu_sync_testmod", module_path)
module = importlib.util.module_from_spec(spec) module = importlib.util.module_from_spec(spec)
assert spec.loader is not None assert spec.loader is not None
@ -102,7 +108,8 @@ def test_kc_get_users_paginates(monkeypatch):
sync.SESSION = _PagedSession() sync.SESSION = _PagedSession()
users = sync.kc_get_users("tok") users = sync.kc_get_users("tok")
assert [u["id"] for u in users] == ["u1", "u2"] assert [u["id"] for u in users] == ["u1", "u2"]
assert sync.SESSION.calls == 2 # Pagination stops when results < page size.
assert sync.SESSION.calls == 1
def test_ensure_mailu_user_skips_foreign_domain(monkeypatch): def test_ensure_mailu_user_skips_foreign_domain(monkeypatch):
@ -119,6 +126,7 @@ def test_ensure_mailu_user_skips_foreign_domain(monkeypatch):
def test_ensure_mailu_user_upserts(monkeypatch): def test_ensure_mailu_user_upserts(monkeypatch):
sync = load_sync_module(monkeypatch) sync = load_sync_module(monkeypatch)
monkeypatch.setattr(sync.bcrypt_sha256, "hash", lambda password: f"hash:{password}")
captured = {} captured = {}
class _Cursor: class _Cursor:
@ -134,6 +142,7 @@ def test_ensure_mailu_user_upserts(monkeypatch):
def test_main_generates_password_and_upserts(monkeypatch): def test_main_generates_password_and_upserts(monkeypatch):
sync = load_sync_module(monkeypatch) sync = load_sync_module(monkeypatch)
monkeypatch.setattr(sync.bcrypt_sha256, "hash", lambda password: f"hash:{password}")
users = [ users = [
{"id": "u1", "username": "user1", "email": "user1@example.com", "attributes": {}}, {"id": "u1", "username": "user1", "email": "user1@example.com", "attributes": {}},
{"id": "u2", "username": "user2", "email": "user2@example.com", "attributes": {"mailu_app_password": ["keepme"]}}, {"id": "u2", "username": "user2", "email": "user2@example.com", "attributes": {"mailu_app_password": ["keepme"]}},
@ -176,6 +185,6 @@ def test_main_generates_password_and_upserts(monkeypatch):
sync.main() sync.main()
# Should attempt two inserts (third user skipped due to domain mismatch) # Always backfill mailu_email, even if Keycloak recovery email is external.
assert len(updated) == 1 # only one missing attr was backfilled assert len(updated) == 3
assert conns and len(conns[0]._cursor.executions) == 2 assert conns and len(conns[0]._cursor.executions) == 3

View File

@ -0,0 +1,105 @@
# services/ai-llm/deployment.yaml
# Single-replica Ollama server pinned to the GPU node pool; the model is
# pre-pulled into the shared PVC by an init container so the main container
# starts warm.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ollama
  namespace: ai
spec:
  replicas: 1
  revisionHistoryLimit: 2
  strategy:
    type: RollingUpdate
    rollingUpdate:
      # RWO model volume: the old pod must release the PVC before the new one starts.
      maxSurge: 0
      maxUnavailable: 1
  selector:
    matchLabels:
      app: ollama
  template:
    metadata:
      labels:
        app: ollama
      annotations:
        # Quoted to keep annotation values unambiguous strings for YAML tooling.
        ai.bstein.dev/model: "qwen2.5-coder:7b-instruct-q4_0"
        ai.bstein.dev/gpu: "GPU pool (titan-20/21/22/24)"
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/hostname
                    operator: In
                    values:
                      - titan-20
                      - titan-21
                      - titan-22
                      - titan-24
      runtimeClassName: nvidia
      volumes:
        - name: models
          persistentVolumeClaim:
            claimName: ollama-models
      initContainers:
        - name: warm-model
          image: ollama/ollama:latest
          env:
            - name: OLLAMA_HOST
              value: "0.0.0.0"
            - name: NVIDIA_VISIBLE_DEVICES
              value: "all"
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: "compute,utility"
            - name: OLLAMA_MODELS
              value: "/root/.ollama"
            - name: OLLAMA_MODEL
              value: "qwen2.5-coder:7b-instruct-q4_0"
          command:
            - /bin/sh
            - -c
            - |
              set -e
              ollama serve >/tmp/ollama.log 2>&1 &
              # Poll the API instead of a fixed sleep: slow nodes may need longer
              # than 6s, and a premature pull would fail the init container.
              i=0
              until ollama list >/dev/null 2>&1; do
                i=$((i + 1))
                if [ "$i" -ge 60 ]; then
                  echo "ollama serve did not become ready" >&2
                  exit 1
                fi
                sleep 1
              done
              ollama pull "${OLLAMA_MODEL}"
              pkill ollama || true
          volumeMounts:
            - name: models
              mountPath: /root/.ollama
          resources:
            requests:
              cpu: 250m
              memory: 1Gi
              nvidia.com/gpu.shared: 1
            limits:
              nvidia.com/gpu.shared: 1
      containers:
        - name: ollama
          image: ollama/ollama:latest
          imagePullPolicy: IfNotPresent
          ports:
            - name: http
              containerPort: 11434
          env:
            - name: OLLAMA_HOST
              value: "0.0.0.0"
            - name: OLLAMA_KEEP_ALIVE
              value: "6h"
            - name: OLLAMA_MODELS
              value: "/root/.ollama"
            - name: NVIDIA_VISIBLE_DEVICES
              value: "all"
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: "compute,utility"
          volumeMounts:
            - name: models
              mountPath: /root/.ollama
          resources:
            requests:
              cpu: "2"
              memory: 8Gi
              nvidia.com/gpu.shared: 1
            limits:
              cpu: "4"
              memory: 12Gi
              nvidia.com/gpu.shared: 1

View File

@ -0,0 +1,9 @@
# services/ai-llm/kustomization.yaml
# Assembles the Ollama stack: namespace first, then storage, workload, service.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: ai
resources:
  - namespace.yaml
  - pvc.yaml
  - deployment.yaml
  - service.yaml

View File

@ -0,0 +1,5 @@
# services/ai-llm/namespace.yaml
# Namespace for the AI/LLM workloads (Ollama et al.).
apiVersion: v1
kind: Namespace
metadata:
  name: ai

13
services/ai-llm/pvc.yaml Normal file
View File

@ -0,0 +1,13 @@
# services/ai-llm/pvc.yaml
# Model cache shared by the warm-model init container and the server container.
# ReadWriteOnce: only a single ollama pod can mount it at a time (see the
# deployment's maxSurge: 0 rollout strategy).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: ollama-models
  namespace: ai
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 30Gi
  storageClassName: astreae

View File

@ -0,0 +1,14 @@
# services/ai-llm/service.yaml
# Cluster-internal endpoint for the Ollama API (default port 11434).
apiVersion: v1
kind: Service
metadata:
  name: ollama
  namespace: ai
spec:
  type: ClusterIP
  selector:
    app: ollama
  ports:
    - name: http
      port: 11434
      targetPort: 11434

View File

@ -5,7 +5,7 @@ metadata:
name: bstein-dev-home-backend name: bstein-dev-home-backend
namespace: bstein-dev-home namespace: bstein-dev-home
spec: spec:
replicas: 2 replicas: 1
revisionHistoryLimit: 3 revisionHistoryLimit: 3
selector: selector:
matchLabels: matchLabels:
@ -15,6 +15,8 @@ spec:
labels: labels:
app: bstein-dev-home-backend app: bstein-dev-home-backend
spec: spec:
automountServiceAccountToken: true
serviceAccountName: bstein-dev-home
nodeSelector: nodeSelector:
kubernetes.io/arch: arm64 kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true" node-role.kubernetes.io/worker: "true"
@ -22,8 +24,73 @@ spec:
- name: harbor-bstein-robot - name: harbor-bstein-robot
containers: containers:
- name: backend - name: backend
image: registry.bstein.dev/bstein/bstein-dev-home-backend:latest image: registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"}
imagePullPolicy: Always imagePullPolicy: Always
command: ["gunicorn"]
args:
- -b
- 0.0.0.0:8080
- --workers
- "2"
- --timeout
- "180"
- app:app
env:
- name: AI_CHAT_API
value: http://ollama.ai.svc.cluster.local:11434
- name: AI_CHAT_MODEL
value: qwen2.5-coder:7b-instruct-q4_0
- name: AI_CHAT_TIMEOUT_SEC
value: "60"
- name: AI_NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: AI_NODE_GPU_MAP
value: |
{"titan-20": "Jetson Xavier (edge GPU)", "titan-21": "Jetson Xavier (edge GPU)", "titan-22": "RTX 3050 8GB (local GPU)", "titan-24": "RTX 3080 8GB (local GPU)"}
- name: KEYCLOAK_ENABLED
value: "true"
- name: KEYCLOAK_URL
value: https://sso.bstein.dev
- name: KEYCLOAK_REALM
value: atlas
- name: KEYCLOAK_CLIENT_ID
value: bstein-dev-home
- name: KEYCLOAK_ISSUER
value: https://sso.bstein.dev/realms/atlas
- name: KEYCLOAK_JWKS_URL
value: http://keycloak.sso.svc.cluster.local/realms/atlas/protocol/openid-connect/certs
- name: KEYCLOAK_ADMIN_URL
value: http://keycloak.sso.svc.cluster.local
- name: KEYCLOAK_ADMIN_REALM
value: atlas
- name: KEYCLOAK_ADMIN_CLIENT_ID
value: bstein-dev-home-admin
- name: KEYCLOAK_ADMIN_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: bstein-dev-home-keycloak-admin
key: client_secret
- name: ACCOUNT_ALLOWED_GROUPS
value: ""
- name: PORTAL_DATABASE_URL
valueFrom:
secretKeyRef:
name: atlas-portal-db
key: PORTAL_DATABASE_URL
- name: HTTP_CHECK_TIMEOUT_SEC
value: "2"
- name: ACCESS_REQUEST_SUBMIT_RATE_LIMIT
value: "30"
- name: ACCESS_REQUEST_SUBMIT_RATE_WINDOW_SEC
value: "3600"
- name: ACCESS_REQUEST_STATUS_RATE_LIMIT
value: "120"
- name: ACCESS_REQUEST_STATUS_RATE_WINDOW_SEC
value: "60"
- name: ACCESS_REQUEST_INTERNAL_EMAIL_ALLOWLIST
value: robotuser@bstein.dev
ports: ports:
- name: http - name: http
containerPort: 8080 containerPort: 8080
@ -33,16 +100,18 @@ spec:
port: http port: http
initialDelaySeconds: 2 initialDelaySeconds: 2
periodSeconds: 5 periodSeconds: 5
timeoutSeconds: 3
livenessProbe: livenessProbe:
httpGet: httpGet:
path: /api/healthz path: /api/healthz
port: http port: http
initialDelaySeconds: 10 initialDelaySeconds: 10
periodSeconds: 10 periodSeconds: 10
timeoutSeconds: 3
resources: resources:
requests: requests:
cpu: 50m cpu: 100m
memory: 64Mi memory: 128Mi
limits: limits:
cpu: 300m cpu: 500m
memory: 256Mi memory: 512Mi

View File

@ -0,0 +1,69 @@
# services/bstein-dev-home/chat-ai-gateway-deployment.yaml
# Thin auth gateway in front of the portal backend's /api/chat endpoint.
# The gateway code is mounted from the chat-ai-gateway ConfigMap and run
# directly with the stock python:3.11-slim image (no custom build).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: chat-ai-gateway
  namespace: bstein-dev-home
spec:
  replicas: 1
  revisionHistoryLimit: 2
  selector:
    matchLabels:
      app: chat-ai-gateway
  template:
    metadata:
      labels:
        app: chat-ai-gateway
    spec:
      nodeSelector:
        kubernetes.io/arch: arm64
        node-role.kubernetes.io/worker: "true"
      containers:
        - name: gateway
          image: python:3.11-slim
          command: ["/bin/sh","-c"]
          args:
            - python /app/gateway.py
          env:
            - name: UPSTREAM_URL
              value: http://bstein-dev-home-backend/api/chat
            # Per-client API keys come from the runtime secret.
            - name: CHAT_KEY_MATRIX
              valueFrom:
                secretKeyRef:
                  name: chat-ai-keys-runtime
                  key: matrix
            - name: CHAT_KEY_HOMEPAGE
              valueFrom:
                secretKeyRef:
                  name: chat-ai-keys-runtime
                  key: homepage
          ports:
            - name: http
              containerPort: 8080
          readinessProbe:
            httpGet:
              path: /healthz
              port: http
            initialDelaySeconds: 2
            periodSeconds: 5
          livenessProbe:
            httpGet:
              path: /healthz
              port: http
            initialDelaySeconds: 10
            periodSeconds: 10
          resources:
            requests:
              cpu: 20m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 256Mi
          volumeMounts:
            # Mount only gateway.py from the ConfigMap, not the whole directory.
            - name: code
              mountPath: /app/gateway.py
              subPath: gateway.py
      volumes:
        - name: code
          configMap:
            name: chat-ai-gateway

View File

@ -0,0 +1,13 @@
# services/bstein-dev-home/chat-ai-gateway-service.yaml
# ClusterIP service in front of the chat-ai-gateway pods; the
# chat.ai.bstein.dev ingress rule targets this service on port 80.
apiVersion: v1
kind: Service
metadata:
  name: chat-ai-gateway
  namespace: bstein-dev-home
spec:
  selector:
    app: chat-ai-gateway
  ports:
    - name: http
      port: 80
      targetPort: 8080

View File

@ -5,7 +5,7 @@ metadata:
name: bstein-dev-home-frontend name: bstein-dev-home-frontend
namespace: bstein-dev-home namespace: bstein-dev-home
spec: spec:
replicas: 2 replicas: 1
revisionHistoryLimit: 3 revisionHistoryLimit: 3
selector: selector:
matchLabels: matchLabels:
@ -22,7 +22,7 @@ spec:
- name: harbor-bstein-robot - name: harbor-bstein-robot
containers: containers:
- name: frontend - name: frontend
image: registry.bstein.dev/bstein/bstein-dev-home-frontend:latest image: registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"}
imagePullPolicy: Always imagePullPolicy: Always
ports: ports:
- name: http - name: http

View File

@ -11,7 +11,7 @@ metadata:
cert-manager.io/cluster-issuer: letsencrypt cert-manager.io/cluster-issuer: letsencrypt
spec: spec:
tls: tls:
- hosts: [ "bstein.dev" ] - hosts: [ "bstein.dev", "chat.ai.bstein.dev" ]
secretName: bstein-dev-home-tls secretName: bstein-dev-home-tls
rules: rules:
- host: bstein.dev - host: bstein.dev
@ -29,3 +29,12 @@ spec:
service: service:
name: bstein-dev-home-frontend name: bstein-dev-home-frontend
port: { number: 80 } port: { number: 80 }
- host: chat.ai.bstein.dev
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: chat-ai-gateway
port: { number: 80 }

View File

@ -5,13 +5,38 @@ namespace: bstein-dev-home
resources: resources:
- namespace.yaml - namespace.yaml
- image.yaml - image.yaml
- rbac.yaml
- portal-e2e-client-secret-sync-rbac.yaml
- chat-ai-gateway-deployment.yaml
- chat-ai-gateway-service.yaml
- frontend-deployment.yaml - frontend-deployment.yaml
- frontend-service.yaml - frontend-service.yaml
- backend-deployment.yaml - backend-deployment.yaml
- backend-service.yaml - backend-service.yaml
- vaultwarden-cred-sync-cronjob.yaml
- portal-onboarding-e2e-test-job.yaml
- ingress.yaml - ingress.yaml
images: images:
- name: registry.bstein.dev/bstein/bstein-dev-home-frontend - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
newTag: 0.1.1-0 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"} newTag: registry.bstein.dev/bstein/bstein-dev-home-frontend:0.1.1-92 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend"}
- name: registry.bstein.dev/bstein/bstein-dev-home-backend - name: registry.bstein.dev/bstein/bstein-dev-home-backend
newTag: 0.1.1-0 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"} newTag: "0.1.1-92" # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
configMapGenerator:
- name: chat-ai-gateway
namespace: bstein-dev-home
files:
- gateway.py=scripts/gateway.py
options:
disableNameSuffixHash: true
- name: vaultwarden-cred-sync-script
namespace: bstein-dev-home
files:
- vaultwarden_cred_sync.py=scripts/vaultwarden_cred_sync.py
options:
disableNameSuffixHash: true
- name: portal-onboarding-e2e-tests
namespace: bstein-dev-home
files:
- test_portal_onboarding_flow.py=scripts/test_portal_onboarding_flow.py
options:
disableNameSuffixHash: true

View File

@ -0,0 +1,24 @@
# services/bstein-dev-home/portal-e2e-client-secret-sync-rbac.yaml
# Allows the portal-e2e-client-secret-sync ServiceAccount (which lives in the
# sso namespace) to create/update secrets in this namespace — presumably to
# materialize the portal-e2e-client secret consumed by the e2e test Job
# (TODO confirm against the sso-side sync workload).
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: portal-e2e-client-secret-sync-target
  namespace: bstein-dev-home
rules:
  - apiGroups: [""]
    resources: ["secrets"]
    verbs: ["get", "create", "patch", "update"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: portal-e2e-client-secret-sync-target
  namespace: bstein-dev-home
subjects:
  # Cross-namespace subject: the syncing ServiceAccount lives in "sso".
  - kind: ServiceAccount
    name: portal-e2e-client-secret-sync
    namespace: sso
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: portal-e2e-client-secret-sync-target

View File

@ -0,0 +1,66 @@
# services/bstein-dev-home/portal-onboarding-e2e-test-job.yaml
# One-shot end-to-end test of portal onboarding: submit an access request,
# pick up the verification email over IMAP, approve as admin via Keycloak
# token exchange, then poll until provisioning completes.
apiVersion: batch/v1
kind: Job
metadata:
  # Jobs are immutable once created; bump the numeric suffix to re-run.
  name: portal-onboarding-e2e-test-11
  namespace: bstein-dev-home
spec:
  backoffLimit: 0
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: test
          image: python:3.11-slim
          env:
            - name: PORTAL_BASE_URL
              value: http://bstein-dev-home-backend.bstein-dev-home.svc.cluster.local
            - name: KEYCLOAK_ADMIN_URL
              value: https://sso.bstein.dev
            - name: KEYCLOAK_REALM
              value: atlas
            - name: KEYCLOAK_ADMIN_CLIENT_ID
              value: bstein-dev-home-admin
            - name: KEYCLOAK_ADMIN_CLIENT_SECRET
              valueFrom:
                secretKeyRef:
                  name: bstein-dev-home-keycloak-admin
                  key: client_secret
            - name: PORTAL_E2E_CLIENT_ID
              valueFrom:
                secretKeyRef:
                  name: portal-e2e-client
                  key: client_id
            - name: PORTAL_E2E_CLIENT_SECRET
              valueFrom:
                secretKeyRef:
                  name: portal-e2e-client
                  key: client_secret
            - name: PORTAL_TARGET_CLIENT_ID
              value: bstein-dev-home
            - name: E2E_PORTAL_ADMIN_USERNAME
              value: bstein
            - name: E2E_USERNAME_PREFIX
              value: e2e-portal
            - name: E2E_CONTACT_EMAIL
              value: robotuser@bstein.dev
            - name: E2E_IMAP_KEYCLOAK_USERNAME
              value: robotuser
            - name: E2E_DEADLINE_SECONDS
              value: "600"
            - name: E2E_POLL_SECONDS
              value: "10"
          command: ["/bin/sh", "-c"]
          args:
            - |
              # BUGFIX: /bin/sh in python:*-slim is dash, which rejects
              # "set -o pipefail" ("Illegal option -o") and would fail the Job
              # before the test even starts. There is no pipeline in this
              # script, so plain -eu provides the same safety.
              set -eu
              python /scripts/test_portal_onboarding_flow.py
          volumeMounts:
            - name: tests
              mountPath: /scripts
              readOnly: true
      volumes:
        - name: tests
          configMap:
            name: portal-onboarding-e2e-tests
            defaultMode: 0555

View File

@ -0,0 +1,108 @@
# services/bstein-dev-home/rbac.yaml
# Identity for the portal backend plus the minimal cross-namespace grants it
# needs: read pods cluster-wide, read the vaultwarden-admin secret, and
# trigger the nextcloud-mail-sync CronJob.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: bstein-dev-home
  namespace: bstein-dev-home
---
# Cluster-wide read-only access to pods ("ai-reader" — name suggests it backs
# the portal's AI/cluster views; confirm against the backend code).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: bstein-dev-home-ai-reader
rules:
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "list", "watch"]
    # NOTE(review): an empty resourceNames list is equivalent to omitting the
    # field entirely (no per-name restriction).
    resourceNames: []
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: bstein-dev-home-ai-reader
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: bstein-dev-home-ai-reader
subjects:
  - kind: ServiceAccount
    name: bstein-dev-home
    namespace: bstein-dev-home
---
# Read the "vaultwarden-admin" secret in any namespace (ClusterRole scope).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: bstein-dev-home-vaultwarden-admin-secret-reader
rules:
  - apiGroups: [""]
    resources: ["secrets"]
    verbs: ["get"]
    resourceNames: ["vaultwarden-admin"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: bstein-dev-home-vaultwarden-admin-secret-reader
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: bstein-dev-home-vaultwarden-admin-secret-reader
subjects:
  - kind: ServiceAccount
    name: bstein-dev-home
    namespace: bstein-dev-home
---
# Namespace-scoped variant: read the same secret inside "vaultwarden".
# NOTE(review): overlaps with the ClusterRole grant above.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: bstein-dev-home-vaultwarden-admin-token-reader
  namespace: vaultwarden
rules:
  - apiGroups: [""]
    resources: ["secrets"]
    verbs: ["get"]
    resourceNames: ["vaultwarden-admin"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: bstein-dev-home-vaultwarden-admin-token-reader
  namespace: vaultwarden
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: bstein-dev-home-vaultwarden-admin-token-reader
subjects:
  - kind: ServiceAccount
    name: bstein-dev-home
    namespace: bstein-dev-home
---
# Let the portal trigger mail sync: read the CronJob, spawn Jobs from it,
# and observe the resulting pods in the nextcloud namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: bstein-dev-home-nextcloud-mail-sync
  namespace: nextcloud
rules:
  - apiGroups: ["batch"]
    resources: ["cronjobs"]
    verbs: ["get"]
    resourceNames: ["nextcloud-mail-sync"]
  - apiGroups: ["batch"]
    resources: ["jobs"]
    verbs: ["create", "get", "list", "watch"]
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: bstein-dev-home-nextcloud-mail-sync
  namespace: nextcloud
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: bstein-dev-home-nextcloud-mail-sync
subjects:
  - kind: ServiceAccount
    name: bstein-dev-home
    namespace: bstein-dev-home

View File

@ -0,0 +1,70 @@
import json
import os
from http.server import BaseHTTPRequestHandler, HTTPServer
from urllib import request, error
UPSTREAM = os.environ.get("UPSTREAM_URL", "http://bstein-dev-home-backend/api/chat")
KEY_MATRIX = os.environ.get("CHAT_KEY_MATRIX", "")
KEY_HOMEPAGE = os.environ.get("CHAT_KEY_HOMEPAGE", "")
ALLOWED = {k for k in (KEY_MATRIX, KEY_HOMEPAGE) if k}
class Handler(BaseHTTPRequestHandler):
    """Authenticating reverse proxy for the chat backend.

    GET /healthz (or /) answers probes with ``{"ok": true}``; POST / requires
    an ``x-api-key`` header matching one of the configured keys and forwards
    the request body to UPSTREAM unchanged, relaying the upstream response.
    """

    def _send_json(self, code: int, payload: dict):
        # Serialize once so Content-Length matches the exact bytes written.
        body = json.dumps(payload).encode()
        self.send_response(code)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def do_GET(self):  # noqa: N802
        if self.path in ("/healthz", "/"):
            return self._send_json(200, {"ok": True})
        return self._send_json(404, {"error": "not_found"})

    def do_POST(self):  # noqa: N802
        if self.path != "/":
            return self._send_json(404, {"error": "not_found"})
        key = self.headers.get("x-api-key", "")
        if not key or key not in ALLOWED:
            return self._send_json(401, {"error": "unauthorized"})
        length = int(self.headers.get("content-length", "0") or "0")
        raw = self.rfile.read(length) if length else b"{}"
        try:
            upstream_req = request.Request(
                UPSTREAM,
                data=raw,
                headers={"Content-Type": "application/json"},
                method="POST",
            )
            with request.urlopen(upstream_req, timeout=90) as resp:
                data = resp.read()
                self.send_response(resp.status)
                for k, v in resp.headers.items():
                    # urlopen has already buffered (and de-chunked) the body, so
                    # drop framing headers: Content-Length is recomputed below,
                    # and BUGFIX: a stale "Transfer-Encoding: chunked" from the
                    # upstream must not be forwarded alongside a plain body.
                    # Connection/Server/Date are hop-by-hop or regenerated.
                    if k.lower() in ("content-length", "transfer-encoding", "connection", "server", "date"):
                        continue
                    self.send_header(k, v)
                self.send_header("Content-Length", str(len(data)))
                self.end_headers()
                self.wfile.write(data)
        except error.HTTPError as e:
            # Upstream returned 4xx/5xx: relay status and body (assumed JSON).
            data = e.read() if hasattr(e, "read") else b""
            self.send_response(e.code)
            self.send_header("Content-Type", "application/json")
            self.send_header("Content-Length", str(len(data)))
            self.end_headers()
            self.wfile.write(data)
        except Exception:
            # Network failure / timeout reaching the upstream.
            return self._send_json(502, {"error": "bad_gateway"})
def main():
    """Serve the proxy on PORT (default 8080) until the process is killed."""
    listen_port = int(os.environ.get("PORT", "8080"))
    server = HTTPServer(("0.0.0.0", listen_port), Handler)
    server.serve_forever()


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,428 @@
#!/usr/bin/env python3
import email
import http.client
import imaplib
import json
import os
import re
import ssl
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
def _env(name: str, default: str | None = None) -> str:
value = os.environ.get(name, default)
if value is None or value == "":
raise SystemExit(f"missing required env var: {name}")
return value
def _post_json(url: str, payload: dict, timeout_s: int = 30) -> dict:
    """POST a JSON payload; return the decoded JSON response ({} if empty).

    Aborts with SystemExit on any HTTP error status, including the body text.
    """
    req = urllib.request.Request(
        url,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout_s) as resp:
            text = resp.read().decode()
    except urllib.error.HTTPError as exc:
        detail = exc.read().decode(errors="replace")
        raise SystemExit(f"HTTP {exc.code} from {url}: {detail}")
    return json.loads(text) if text else {}
def _post_form(url: str, data: dict[str, str], timeout_s: int = 30) -> dict:
    """POST an urlencoded form; return the decoded JSON response ({} if empty)."""
    req = urllib.request.Request(
        url,
        data=urllib.parse.urlencode(data).encode(),
        headers={"Content-Type": "application/x-www-form-urlencoded"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout_s) as resp:
            text = resp.read().decode()
    except urllib.error.HTTPError as exc:
        detail = exc.read().decode(errors="replace")
        raise SystemExit(f"HTTP {exc.code} from {url}: {detail}")
    return json.loads(text) if text else {}
def _get_json(url: str, headers: dict[str, str] | None = None, timeout_s: int = 30) -> object:
    """GET a URL; return the decoded JSON body, or None when the body is empty."""
    req = urllib.request.Request(url, headers=headers or {}, method="GET")
    try:
        with urllib.request.urlopen(req, timeout=timeout_s) as resp:
            text = resp.read().decode()
    except urllib.error.HTTPError as exc:
        detail = exc.read().decode(errors="replace")
        raise SystemExit(f"HTTP {exc.code} from {url}: {detail}")
    return json.loads(text) if text else None
def _request_json(
    method: str,
    url: str,
    token: str,
    payload: dict | None = None,
    timeout_s: int = 30,
) -> dict:
    """Send a bearer-authenticated request; return decoded JSON ({} if empty).

    When ``payload`` is given it is sent as a JSON body; otherwise no body.
    """
    headers = {"Authorization": f"Bearer {token}"}
    body = None
    if payload is not None:
        body = json.dumps(payload).encode()
        headers["Content-Type"] = "application/json"
    req = urllib.request.Request(url, data=body, headers=headers, method=method)
    try:
        with urllib.request.urlopen(req, timeout=timeout_s) as resp:
            text = resp.read().decode()
    except urllib.error.HTTPError as exc:
        detail = exc.read().decode(errors="replace")
        raise SystemExit(f"HTTP {exc.code} from {url}: {detail}")
    return json.loads(text) if text else {}
def _keycloak_client_token(keycloak_base: str, realm: str, client_id: str, client_secret: str) -> str:
    """Fetch a client-credentials access token from the Keycloak realm."""
    url = f"{keycloak_base.rstrip('/')}/realms/{realm}/protocol/openid-connect/token"
    form = {
        "grant_type": "client_credentials",
        "client_id": client_id,
        "client_secret": client_secret,
    }
    response = _post_form(url, form, timeout_s=20)
    access_token = response.get("access_token")
    if not isinstance(access_token, str) or not access_token:
        raise SystemExit("keycloak token response missing access_token")
    return access_token
def _keycloak_token_exchange(
    *,
    keycloak_base: str,
    realm: str,
    client_id: str,
    client_secret: str,
    subject_token: str,
    requested_subject: str,
    audience: str,
) -> str:
    """Exchange a token to impersonate ``requested_subject`` (RFC 8693 grant)."""
    url = f"{keycloak_base.rstrip('/')}/realms/{realm}/protocol/openid-connect/token"
    form = {
        "grant_type": "urn:ietf:params:oauth:grant-type:token-exchange",
        "client_id": client_id,
        "client_secret": client_secret,
        "subject_token": subject_token,
        "requested_subject": requested_subject,
        "audience": audience,
    }
    response = _post_form(url, form, timeout_s=20)
    exchanged = response.get("access_token")
    if not isinstance(exchanged, str) or not exchanged:
        raise SystemExit("keycloak token exchange response missing access_token")
    return exchanged
def _keycloak_find_user(keycloak_base: str, realm: str, token: str, username: str) -> dict | None:
    """Look up a Keycloak user by exact username; None when not found."""
    query = urllib.parse.urlencode({"username": username, "exact": "true", "max": "1"})
    url = f"{keycloak_base.rstrip('/')}/admin/realms/{realm}/users?{query}"
    users = _get_json(url, headers={"Authorization": f"Bearer {token}"}, timeout_s=20)
    if isinstance(users, list) and users and isinstance(users[0], dict):
        return users[0]
    return None
def _keycloak_get_user(keycloak_base: str, realm: str, token: str, user_id: str) -> dict:
    """Fetch one user's full admin representation (includes attributes)."""
    encoded = urllib.parse.quote(user_id, safe="")
    url = f"{keycloak_base.rstrip('/')}/admin/realms/{realm}/users/{encoded}"
    payload = _get_json(url, headers={"Authorization": f"Bearer {token}"}, timeout_s=20)
    if not isinstance(payload, dict):
        raise SystemExit("unexpected keycloak user payload")
    return payload
def _extract_attr(attributes: object, key: str) -> str:
if not isinstance(attributes, dict):
return ""
value = attributes.get(key)
if isinstance(value, list) and value and isinstance(value[0], str):
return value[0]
if isinstance(value, str):
return value
return ""
def _imap_wait_for_verify_token(
    *,
    host: str,
    port: int,
    username: str,
    password: str,
    request_code: str,
    deadline_sec: int,
) -> str:
    """Poll an IMAP inbox for the verification email and return its token.

    Searches INBOX for messages containing ``request_code``, extracts the
    first text/plain body, locates the verification URL and returns its
    ``verify`` query parameter. Raises SystemExit when the deadline passes.
    """
    # The in-cluster IMAP endpoint presents a certificate we cannot validate
    # here, so verification is deliberately disabled. This is the public-API
    # equivalent of the private ssl._create_unverified_context() used before.
    ssl_context = ssl.create_default_context()
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE
    deadline_at = time.monotonic() + deadline_sec
    with imaplib.IMAP4_SSL(host, port, ssl_context=ssl_context) as client:
        client.login(username, password)
        client.select("INBOX")
        while time.monotonic() < deadline_at:
            status, data = client.search(None, "TEXT", request_code)
            if status == "OK" and data and data[0]:
                ids = data[0].split()
                # Newest matching message last.
                msg_id = ids[-1]
                fetch_status, msg_data = client.fetch(msg_id, "(RFC822)")
                if fetch_status != "OK" or not msg_data:
                    time.sleep(2)
                    continue
                raw = msg_data[0][1] if isinstance(msg_data[0], tuple) and len(msg_data[0]) > 1 else None
                if not isinstance(raw, (bytes, bytearray)):
                    time.sleep(2)
                    continue
                message = email.message_from_bytes(raw)
                # Prefer the first text/plain part; fall back to a flat body.
                body = None
                if message.is_multipart():
                    for part in message.walk():
                        if part.get_content_type() == "text/plain":
                            payload = part.get_payload(decode=True)
                            if isinstance(payload, (bytes, bytearray)):
                                body = payload.decode(errors="replace")
                                break
                else:
                    payload = message.get_payload(decode=True)
                    if isinstance(payload, (bytes, bytearray)):
                        body = payload.decode(errors="replace")
                if not body:
                    time.sleep(2)
                    continue
                # First try a line that is exactly the verification URL.
                url = None
                for line in body.splitlines():
                    candidate = line.strip()
                    if "verify=" in candidate and candidate.startswith("http"):
                        url = candidate
                        break
                if not url:
                    # BUGFIX: the pattern previously read r"https?://\\S+verify=\\S+";
                    # inside a raw string "\\S" is an escaped backslash followed by
                    # "S" (matches a literal backslash), so the fallback never
                    # matched real URLs. "\S" (any non-space) is intended.
                    match = re.search(r"https?://\S+verify=\S+", body)
                    url = match.group(0) if match else None
                if not url:
                    time.sleep(2)
                    continue
                parsed = urllib.parse.urlparse(url)
                query = urllib.parse.parse_qs(parsed.query)
                token = query.get("verify", [""])[0]
                if isinstance(token, str) and token:
                    return token
            time.sleep(2)
    raise SystemExit("verification email not found before deadline")
def main() -> int:
    """Run the portal onboarding flow end-to-end and verify the result.

    Stages: read config from env -> resolve the robot mailbox credentials from
    Keycloak attributes -> submit an access request -> confirm it via the
    emailed verify token -> approve it as the portal admin (token exchange) ->
    poll until provisioning finishes -> assert the created Keycloak user's
    state. Returns 0 on success; aborts with SystemExit on any failure.
    """
    # --- configuration (required vars abort when missing) ---
    portal_base = _env("PORTAL_BASE_URL").rstrip("/")
    keycloak_base = _env("KEYCLOAK_ADMIN_URL").rstrip("/")
    realm = _env("KEYCLOAK_REALM", "atlas")
    kc_admin_client_id = _env("KEYCLOAK_ADMIN_CLIENT_ID")
    kc_admin_client_secret = _env("KEYCLOAK_ADMIN_CLIENT_SECRET")
    portal_e2e_client_id = _env("PORTAL_E2E_CLIENT_ID")
    portal_e2e_client_secret = _env("PORTAL_E2E_CLIENT_SECRET")
    portal_target_client_id = os.environ.get("PORTAL_TARGET_CLIENT_ID", "bstein-dev-home").strip() or "bstein-dev-home"
    portal_admin_username = os.environ.get("E2E_PORTAL_ADMIN_USERNAME", "bstein").strip() or "bstein"
    contact_email = os.environ.get("E2E_CONTACT_EMAIL", "robotuser@bstein.dev").strip()
    if not contact_email:
        raise SystemExit("E2E_CONTACT_EMAIL must not be empty")
    imap_host = os.environ.get("E2E_IMAP_HOST", "mailu-front.mailu-mailserver.svc.cluster.local").strip()
    imap_port = int(os.environ.get("E2E_IMAP_PORT", "993"))
    imap_keycloak_username = os.environ.get("E2E_IMAP_KEYCLOAK_USERNAME", "robotuser").strip()
    imap_wait_sec = int(os.environ.get("E2E_IMAP_WAIT_SECONDS", "90"))
    # --- admin token + robot mailbox credentials (stored as KC attributes) ---
    try:
        token = _keycloak_client_token(keycloak_base, realm, kc_admin_client_id, kc_admin_client_secret)
    except SystemExit as exc:
        raise SystemExit(f"failed to fetch keycloak token for admin client {kc_admin_client_id!r}: {exc}")
    mailbox_user = _keycloak_find_user(keycloak_base, realm, token, imap_keycloak_username)
    if not mailbox_user:
        raise SystemExit(f"unable to locate Keycloak mailbox user {imap_keycloak_username!r}")
    mailbox_user_id = mailbox_user.get("id")
    if not isinstance(mailbox_user_id, str) or not mailbox_user_id:
        raise SystemExit("mailbox user missing id")
    mailbox_full = _keycloak_get_user(keycloak_base, realm, token, mailbox_user_id)
    mailbox_attrs = mailbox_full.get("attributes")
    mailu_email = _extract_attr(mailbox_attrs, "mailu_email")
    if not mailu_email:
        mailu_email = contact_email
    mailu_password = _extract_attr(mailbox_attrs, "mailu_app_password")
    if not mailu_password:
        raise SystemExit(f"Keycloak user {imap_keycloak_username!r} missing mailu_app_password attribute")
    # --- submit the access request (retried; transient resets tolerated) ---
    username_prefix = os.environ.get("E2E_USERNAME_PREFIX", "e2e-user")
    now = int(time.time())
    # Timestamp suffix keeps each run's username unique.
    username = f"{username_prefix}-{now}"
    submit_url = f"{portal_base}/api/access/request"
    submit_payload = {"username": username, "email": contact_email, "note": "portal onboarding e2e"}
    submit = None
    for attempt in range(1, 6):
        try:
            submit = _post_json(submit_url, submit_payload, timeout_s=20)
            break
        except (http.client.RemoteDisconnected, TimeoutError, urllib.error.URLError) as exc:
            if attempt == 5:
                raise SystemExit(f"portal submit failed after {attempt} attempts: {exc}")
            time.sleep(2)
    if not isinstance(submit, dict):
        raise SystemExit("portal submit did not return json")
    request_code = submit.get("request_code")
    if not isinstance(request_code, str) or not request_code:
        raise SystemExit(f"request submit did not return request_code: {submit}")
    # --- email verification: pull the token from the robot inbox ---
    verify_token = _imap_wait_for_verify_token(
        host=imap_host,
        port=imap_port,
        username=mailu_email,
        password=mailu_password,
        request_code=request_code,
        deadline_sec=imap_wait_sec,
    )
    verify_resp = _post_json(
        f"{portal_base}/api/access/request/verify",
        {"request_code": request_code, "token": verify_token},
        timeout_s=30,
    )
    if not isinstance(verify_resp, dict) or verify_resp.get("ok") is not True:
        raise SystemExit(f"unexpected verify response: {verify_resp}")
    # --- approve as the portal admin via Keycloak token exchange ---
    portal_admin = _keycloak_find_user(keycloak_base, realm, token, portal_admin_username)
    if not portal_admin:
        raise SystemExit(f"unable to locate portal admin user {portal_admin_username!r} via Keycloak admin API")
    portal_admin_user_id = portal_admin.get("id")
    if not isinstance(portal_admin_user_id, str) or not portal_admin_user_id:
        raise SystemExit("portal admin user missing id")
    try:
        e2e_subject_token = _keycloak_client_token(keycloak_base, realm, portal_e2e_client_id, portal_e2e_client_secret)
    except SystemExit as exc:
        raise SystemExit(f"failed to fetch keycloak token for E2E client {portal_e2e_client_id!r}: {exc}")
    try:
        portal_bearer = _keycloak_token_exchange(
            keycloak_base=keycloak_base,
            realm=realm,
            client_id=portal_e2e_client_id,
            client_secret=portal_e2e_client_secret,
            subject_token=e2e_subject_token,
            requested_subject=portal_admin_user_id,
            audience=portal_target_client_id,
        )
    except SystemExit as exc:
        raise SystemExit(f"failed to exchange token for portal approval as {portal_admin_username!r}: {exc}")
    approve_url = f"{portal_base}/api/admin/access/requests/{urllib.parse.quote(username, safe='')}/approve"
    approve_timeout_s = int(os.environ.get("E2E_APPROVE_TIMEOUT_SECONDS", "180"))
    approve_attempts = int(os.environ.get("E2E_APPROVE_ATTEMPTS", "3"))
    approve_resp = None
    approve_error = None
    for attempt in range(1, approve_attempts + 1):
        try:
            approve_resp = _request_json("POST", approve_url, portal_bearer, payload=None, timeout_s=approve_timeout_s)
            approve_error = None
            break
        except (http.client.RemoteDisconnected, TimeoutError, urllib.error.URLError) as exc:
            approve_error = str(exc)
            if attempt == approve_attempts:
                break
            time.sleep(3)
    # Approval may time out even though the server accepted it; fall through
    # to status polling rather than failing hard in that case.
    if approve_resp is None:
        print(
            "WARNING: portal approval request did not return a response; "
            f"continuing to poll status (last_error={approve_error})"
        )
    elif not isinstance(approve_resp, dict) or approve_resp.get("ok") is not True:
        raise SystemExit(f"unexpected approval response: {approve_resp}")
    # --- poll the public status endpoint until provisioning finishes ---
    status_url = f"{portal_base}/api/access/request/status"
    deadline_s = int(os.environ.get("E2E_DEADLINE_SECONDS", "600"))
    interval_s = int(os.environ.get("E2E_POLL_SECONDS", "10"))
    deadline_at = time.monotonic() + deadline_s
    last_status = None
    last_error = None
    while True:
        try:
            status_payload = _post_json(status_url, {"request_code": request_code}, timeout_s=60)
            last_error = None
        except (http.client.RemoteDisconnected, TimeoutError, urllib.error.URLError) as exc:
            last_error = str(exc)
            if time.monotonic() >= deadline_at:
                raise SystemExit(f"timed out waiting for provisioning to finish (last error={last_error})")
            time.sleep(interval_s)
            continue
        status = status_payload.get("status")
        if isinstance(status, str):
            last_status = status
            if status in ("awaiting_onboarding", "ready"):
                break
            if status in ("denied", "unknown"):
                raise SystemExit(f"request transitioned to unexpected terminal status: {status_payload}")
        if time.monotonic() >= deadline_at:
            suffix = f" (last error={last_error})" if last_error else ""
            raise SystemExit(f"timed out waiting for provisioning to finish (last status={last_status}){suffix}")
        time.sleep(interval_s)
    # Refresh admin token (it may expire during the provisioning wait).
    token = _keycloak_client_token(keycloak_base, realm, kc_admin_client_id, kc_admin_client_secret)
    # --- assert the provisioned Keycloak user's final state ---
    user = _keycloak_find_user(keycloak_base, realm, token, username)
    if not user:
        raise SystemExit("expected Keycloak user was not created")
    user_id = user.get("id")
    if not isinstance(user_id, str) or not user_id:
        raise SystemExit("created user missing id")
    full = _keycloak_get_user(keycloak_base, realm, token, user_id)
    required_actions = full.get("requiredActions") or []
    required: set[str] = set()
    if isinstance(required_actions, list):
        required = {a for a in required_actions if isinstance(a, str)}
    unexpected = sorted(required.intersection({"UPDATE_PASSWORD", "VERIFY_EMAIL", "CONFIGURE_TOTP"}))
    if unexpected:
        raise SystemExit(
            "Keycloak user should not require actions at first login "
            f"(Vaultwarden-first onboarding): unexpected requiredActions={unexpected} full={sorted(required)}"
        )
    email_verified = full.get("emailVerified")
    if email_verified is not True:
        raise SystemExit(f"Keycloak user should have emailVerified=true: emailVerified={email_verified!r}")
    kc_email = full.get("email")
    if isinstance(kc_email, str) and contact_email and kc_email != contact_email:
        raise SystemExit(f"Keycloak user email mismatch: expected {contact_email!r} got {kc_email!r}")
    print(f"PASS: onboarding provisioning completed for {request_code} ({username})")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View File

@ -0,0 +1,193 @@
#!/usr/bin/env python3
from __future__ import annotations
import sys
import time
from typing import Any, Iterable
import httpx
from atlas_portal import settings
from atlas_portal.keycloak import admin_client
from atlas_portal.vaultwarden import invite_user
VAULTWARDEN_EMAIL_ATTR = "vaultwarden_email"
VAULTWARDEN_STATUS_ATTR = "vaultwarden_status"
VAULTWARDEN_SYNCED_AT_ATTR = "vaultwarden_synced_at"
def _iter_keycloak_users(page_size: int = 200) -> Iterable[dict[str, Any]]:
    """Stream every realm user from the Keycloak admin API, page by page.

    Yields raw user representations (dicts). Raises RuntimeError when the
    shared admin client is not configured. Iteration stops on the first
    empty or short page.
    """
    client = admin_client()
    if not client.ready():
        raise RuntimeError("keycloak admin client not configured")
    url = f"{settings.KEYCLOAK_ADMIN_URL}/admin/realms/{settings.KEYCLOAK_REALM}/users"
    first = 0
    while True:
        # headers() is re-read each page (presumably token refresh — confirm
        # against the admin_client implementation).
        headers = client.headers()
        # We need attributes for idempotency (vaultwarden_status/vaultwarden_email). Keycloak defaults to a
        # brief representation which may omit these.
        params = {"first": str(first), "max": str(page_size), "briefRepresentation": "false"}
        with httpx.Client(timeout=settings.HTTP_CHECK_TIMEOUT_SEC) as http:
            resp = http.get(url, params=params, headers=headers)
            resp.raise_for_status()
            payload = resp.json()
        if not isinstance(payload, list) or not payload:
            return
        for item in payload:
            if isinstance(item, dict):
                yield item
        # A short page means we've reached the end.
        if len(payload) < page_size:
            return
        first += page_size
def _extract_attr(attrs: Any, key: str) -> str:
if not isinstance(attrs, dict):
return ""
raw = attrs.get(key)
if isinstance(raw, list):
for item in raw:
if isinstance(item, str) and item.strip():
return item.strip()
return ""
if isinstance(raw, str) and raw.strip():
return raw.strip()
return ""
def _vaultwarden_email_for_user(user: dict[str, Any]) -> str:
    """Choose the email address to invite into Vaultwarden, or "" when undecidable.

    Preference order: the explicit vaultwarden_email attribute, then the
    mailu_email attribute, then the Keycloak email when it is already on the
    Mailu domain.
    """
    raw_username = user.get("username")
    username = raw_username.strip() if isinstance(raw_username, str) else ""
    if not username:
        return ""
    attrs = user.get("attributes")
    for attr_key in (VAULTWARDEN_EMAIL_ATTR, "mailu_email"):
        candidate = _extract_attr(attrs, attr_key)
        if candidate:
            return candidate
    raw_email = user.get("email")
    kc_email = raw_email.strip() if isinstance(raw_email, str) else ""
    if kc_email and kc_email.lower().endswith(f"@{settings.MAILU_DOMAIN.lower()}"):
        return kc_email
    # Don't guess an internal mailbox address until Mailu sync has run and stored mailu_email.
    # This avoids spamming Vaultwarden invites that can never be delivered (unknown recipient).
    return ""
def _set_user_attribute_if_missing(username: str, user: dict[str, Any], key: str, value: str) -> None:
    """Write the attribute only when non-blank and not already set in Keycloak."""
    cleaned = (value or "").strip()
    if not cleaned:
        return
    if _extract_attr(user.get("attributes"), key):
        return
    admin_client().set_user_attribute(username, key, cleaned)
def _set_user_attribute(username: str, key: str, value: str) -> None:
    """Unconditionally write a non-blank attribute value (blank values are ignored)."""
    cleaned = (value or "").strip()
    if cleaned:
        admin_client().set_user_attribute(username, key, cleaned)
def main() -> int:
    """Sync Keycloak users into Vaultwarden by inviting each resolvable email.

    Skips disabled users, service accounts, users without a resolvable email,
    and users already marked invited/present via Keycloak attributes (keeps
    the cron run idempotent). Returns 0 on success, 2 when any invite failed.
    """
    processed = 0
    created = 0
    skipped = 0
    failures = 0
    for user in _iter_keycloak_users():
        username = (user.get("username") if isinstance(user.get("username"), str) else "") or ""
        username = username.strip()
        if not username:
            skipped += 1
            continue
        enabled = user.get("enabled")
        if enabled is False:
            skipped += 1
            continue
        # Service accounts never get Vaultwarden mailboxes.
        if user.get("serviceAccountClientId") or username.startswith("service-account-"):
            skipped += 1
            continue
        # Fetch the full user payload so we can reliably read attributes (and skip re-invites).
        user_id = (user.get("id") if isinstance(user.get("id"), str) else "") or ""
        user_id = user_id.strip()
        full_user = user
        if user_id:
            try:
                full_user = admin_client().get_user(user_id)
            except Exception:
                # Best-effort: fall back to the (possibly brief) listing payload.
                full_user = user
        current_status = _extract_attr(full_user.get("attributes"), VAULTWARDEN_STATUS_ATTR)
        current_synced_at = _extract_attr(full_user.get("attributes"), VAULTWARDEN_SYNCED_AT_ATTR)
        email = _vaultwarden_email_for_user(full_user)
        if not email:
            print(f"skip {username}: missing email", file=sys.stderr)
            skipped += 1
            continue
        # Record the resolved email for future runs; non-fatal if it fails.
        try:
            _set_user_attribute_if_missing(username, full_user, VAULTWARDEN_EMAIL_ATTR, email)
        except Exception:
            pass
        # If we've already successfully invited or confirmed presence, do not re-invite on every cron run.
        # Vaultwarden returns 409 for "already exists", which is idempotent but noisy and can trigger rate limits.
        if current_status in {"invited", "already_present"}:
            if not current_synced_at:
                try:
                    _set_user_attribute(
                        username,
                        VAULTWARDEN_SYNCED_AT_ATTR,
                        time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
                    )
                except Exception:
                    pass
            skipped += 1
            continue
        processed += 1
        result = invite_user(email)
        if result.ok:
            created += 1
            print(f"ok {username}: {result.status}")
            # Persist the outcome; attribute writes are best-effort.
            try:
                _set_user_attribute(username, VAULTWARDEN_STATUS_ATTR, result.status)
                _set_user_attribute(username, VAULTWARDEN_SYNCED_AT_ATTR, time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()))
            except Exception:
                pass
        else:
            failures += 1
            print(f"err {username}: {result.status} {result.detail}", file=sys.stderr)
            try:
                _set_user_attribute(username, VAULTWARDEN_STATUS_ATTR, result.status)
                _set_user_attribute(username, VAULTWARDEN_SYNCED_AT_ATTR, time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()))
            except Exception:
                pass
    print(
        f"done processed={processed} created_or_present={created} skipped={skipped} failures={failures}",
        file=sys.stderr,
    )
    return 0 if failures == 0 else 2


if __name__ == "__main__":
    raise SystemExit(main())

View File

@ -0,0 +1,59 @@
# services/bstein-dev-home/vaultwarden-cred-sync-cronjob.yaml
# Every 15 minutes, run vaultwarden_cred_sync.py (mounted from a ConfigMap)
# on the backend image; PYTHONPATH=/app makes the backend's atlas_portal
# package importable by the script.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: vaultwarden-cred-sync
  namespace: bstein-dev-home
spec:
  schedule: "*/15 * * * *"
  # Never overlap runs; keep little success history, more failures for debugging.
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 1
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      backoffLimit: 0
      template:
        spec:
          serviceAccountName: bstein-dev-home
          restartPolicy: Never
          nodeSelector:
            kubernetes.io/arch: arm64
            node-role.kubernetes.io/worker: "true"
          imagePullSecrets:
            - name: harbor-bstein-robot
          containers:
            - name: sync
              # Tag kept by Flux image automation (marker without ":tag"
              # rewrites the full image reference on this line).
              image: registry.bstein.dev/bstein/bstein-dev-home-backend:0.1.1-92 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend"}
              imagePullPolicy: Always
              command:
                - python
                - /scripts/vaultwarden_cred_sync.py
              env:
                - name: PYTHONPATH
                  value: /app
                - name: KEYCLOAK_ENABLED
                  value: "true"
                - name: KEYCLOAK_REALM
                  value: atlas
                - name: KEYCLOAK_ADMIN_URL
                  value: http://keycloak.sso.svc.cluster.local
                - name: KEYCLOAK_ADMIN_REALM
                  value: atlas
                - name: KEYCLOAK_ADMIN_CLIENT_ID
                  value: bstein-dev-home-admin
                - name: KEYCLOAK_ADMIN_CLIENT_SECRET
                  valueFrom:
                    secretKeyRef:
                      name: bstein-dev-home-keycloak-admin
                      key: client_secret
                - name: HTTP_CHECK_TIMEOUT_SEC
                  value: "20"
              volumeMounts:
                - name: vaultwarden-cred-sync-script
                  mountPath: /scripts
                  readOnly: true
          volumes:
            - name: vaultwarden-cred-sync-script
              configMap:
                name: vaultwarden-cred-sync-script
                defaultMode: 0555

View File

@ -1,31 +0,0 @@
# services/ci-demo/deployment.yaml
# Minimal demo workload used to exercise the CI/GitOps image-update pipeline.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ci-demo
  namespace: ci-demo
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: ci-demo
  template:
    metadata:
      labels:
        app.kubernetes.io/name: ci-demo
    spec:
      # Pin to Raspberry Pi 4 nodes.
      nodeSelector:
        hardware: rpi4
      containers:
        - name: ci-demo
          # NOTE(review): ":latest" defeats image-pinning; the sibling
          # kustomization.yaml overrides the tag via its images transformer.
          image: registry.bstein.dev/infra/ci-demo:latest
          ports:
            - name: http
              containerPort: 8080
          readinessProbe:
            httpGet:
              path: /
              port: http
            initialDelaySeconds: 2
            periodSeconds: 5

View File

@ -1,24 +0,0 @@
# services/ci-demo/image.yaml
# Flux image automation objects for the ci-demo workload: scan the registry
# every minute and select the newest tag matching v0.0.0-<build>.
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageRepository
metadata:
  name: ci-demo
  namespace: flux-system
spec:
  image: registry.bstein.dev/infra/ci-demo
  interval: 1m0s
---
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImagePolicy
metadata:
  name: ci-demo
  namespace: flux-system
spec:
  imageRepositoryRef:
    name: ci-demo
  filterTags:
    # Tags look like v0.0.0-42; capture the semver part (without the "v").
    pattern: '^v(?P<version>0\.0\.0-\d+)$'
    extract: '$version'
  policy:
    semver:
      # Include prerelease-style builds (0.0.0-N) in the ordering.
      range: ">=0.0.0-0"

View File

@ -1,11 +0,0 @@
# services/ci-demo/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - image.yaml
  - deployment.yaml
  - service.yaml
images:
  - name: registry.bstein.dev/infra/ci-demo
    # Fix: newTag must be a bare tag, not a full image reference. The setter
    # marker needs the ":tag" suffix so Flux writes only the tag here;
    # without it Flux writes "<name>:<tag>", producing an invalid transform.
    newTag: v0.0.0-3 # {"$imagepolicy": "flux-system:ci-demo:tag"}

View File

@ -1,6 +0,0 @@
# services/ci-demo/namespace.yaml
# Dedicated namespace for the ci-demo workload.
apiVersion: v1
kind: Namespace
metadata:
  name: ci-demo

31
services/comms/NOTES.md Normal file
View File

@ -0,0 +1,31 @@
# services/comms/NOTES.md
Purpose: Matrix + Element + LiveKit stack for Othrys (live.bstein.dev).
## Core flow
- Matrix Authentication Service (MAS) handles login/SSO and issues Matrix access tokens.
- Synapse is the homeserver; MAS fronts login, Synapse serves client/server APIs.
- Element Web provides the main UI; Element Call embeds LiveKit for group video.
- LiveKit handles SFU media; Coturn provides TURN for NAT traversal.
- matrix-guest-register provisions MAS guest accounts and performs MAS password login to mint device-bound guest tokens (no Keycloak).
## Operational jobs
- mas-db-ensure-job: ensures MAS database role/database + secret in comms.
- comms-secrets-ensure-job: creates runtime secrets (TURN, LiveKit, Synapse, atlasbot).
- synapse-signingkey-ensure-job: ensures Synapse signing key secret.
- synapse-seeder-admin-ensure-job: ensures Synapse admin user exists.
- synapse-user-seed-job: seeds atlasbot + othrys-seeder users/passwords.
- mas-local-users-ensure-job: ensures MAS local users exist (seeder/bot).
- seed-othrys-room: (suspended) creates Othrys + joins locals.
- reset-othrys-room: suspended CronJob for a manual room reset + pin invite.
- pin-othrys-invite: (suspended) pin invite message if missing.
- guest-name-randomizer: renames numeric/guest users to adj-noun names.
- bstein-force-leave: one-off cleanup job that forces a room leave.
## Manual re-runs
- Unsuspend a CronJob only when needed; re-suspend after completion.
## Ports
- Traefik (HTTPS) via LB on 192.168.22.9.
- Coturn LB on 192.168.22.5 (3478/5349 + UDP range).
- LiveKit LB on 192.168.22.6 (7880/7881/7882/7883).

Some files were not shown because too many files have changed in this diff Show More