Compare commits

..

No commits in common. "feature/atlasbot" and "main" have entirely different histories.

76 changed files with 199 additions and 6990 deletions

1
.gitignore vendored
View File

@ -2,7 +2,6 @@
!README.md
!knowledge/**/*.md
!services/comms/knowledge/**/*.md
!services/atlasbot/knowledge/**/*.md
__pycache__/
*.py[cod]
.pytest_cache

View File

@ -1,26 +0,0 @@
# clusters/atlas/flux-system/applications/atlasbot/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: atlasbot
namespace: ai
spec:
interval: 1m0s
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
git:
checkout:
ref:
branch: feature/atlasbot
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(atlasbot): automated image update"
push:
branch: feature/atlasbot
update:
strategy: Setters
path: services/atlasbot

View File

@ -1,17 +0,0 @@
# clusters/atlas/flux-system/applications/atlasbot/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: atlasbot
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: flux-system
path: ./services/atlasbot
targetNamespace: ai
timeout: 2m
dependsOn:
- name: ai-llm

View File

@ -13,14 +13,14 @@ spec:
git:
checkout:
ref:
branch: feature/atlasbot
branch: feature/ariadne
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(bstein-dev-home): automated image update"
push:
branch: feature/atlasbot
branch: feature/ariadne
update:
strategy: Setters
path: services/bstein-dev-home

View File

@ -1,26 +0,0 @@
# clusters/atlas/flux-system/applications/comms/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: comms
namespace: comms
spec:
interval: 1m0s
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
git:
checkout:
ref:
branch: feature/atlasbot
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(comms): automated image update"
push:
branch: feature/atlasbot
update:
strategy: Setters
path: services/comms

View File

@ -6,9 +6,6 @@ resources:
- vault/kustomization.yaml
- vaultwarden/kustomization.yaml
- comms/kustomization.yaml
- comms/image-automation.yaml
- atlasbot/kustomization.yaml
- atlasbot/image-automation.yaml
- crypto/kustomization.yaml
- monerod/kustomization.yaml
- pegasus/kustomization.yaml

View File

@ -9,7 +9,7 @@ metadata:
spec:
interval: 1m0s
ref:
branch: feature/atlasbot
branch: feature/ariadne
secretRef:
name: flux-system-gitea
url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git

View File

@ -16,6 +16,5 @@ resources:
- longhorn/kustomization.yaml
- longhorn-ui/kustomization.yaml
- postgres/kustomization.yaml
- nats/kustomization.yaml
- ../platform/vault-csi/kustomization.yaml
- ../platform/vault-injector/kustomization.yaml

View File

@ -13,14 +13,14 @@ spec:
git:
checkout:
ref:
branch: feature/atlasbot
branch: feature/ariadne
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(maintenance): automated image update"
push:
branch: feature/atlasbot
branch: feature/ariadne
update:
strategy: Setters
path: services/maintenance

View File

@ -1,21 +0,0 @@
# clusters/atlas/flux-system/platform/nats/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: nats
namespace: flux-system
spec:
interval: 10m
path: ./infrastructure/nats
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: nats
healthChecks:
- apiVersion: apps/v1
kind: StatefulSet
name: nats
namespace: nats
wait: true

View File

@ -1,3 +0,0 @@
FROM python:3.11-slim
RUN pip install --no-cache-dir psycopg2-binary bcrypt

View File

@ -6,7 +6,6 @@ resources:
- ../modules/profiles/atlas-ha
- coredns-custom.yaml
- coredns-deployment.yaml
- longhorn-node-taints.yaml
- ntp-sync-daemonset.yaml
- ../sources/cert-manager/letsencrypt.yaml
- ../sources/cert-manager/letsencrypt-prod.yaml

View File

@ -1,40 +0,0 @@
# infrastructure/core/longhorn-node-taints.yaml
apiVersion: v1
kind: Node
metadata:
name: titan-13
spec:
taints:
- key: longhorn
value: "true"
effect: PreferNoSchedule
---
apiVersion: v1
kind: Node
metadata:
name: titan-15
spec:
taints:
- key: longhorn
value: "true"
effect: PreferNoSchedule
---
apiVersion: v1
kind: Node
metadata:
name: titan-17
spec:
taints:
- key: longhorn
value: "true"
effect: PreferNoSchedule
---
apiVersion: v1
kind: Node
metadata:
name: titan-19
spec:
taints:
- key: longhorn
value: "true"
effect: PreferNoSchedule

View File

@ -1,10 +0,0 @@
# infrastructure/longhorn/core/backup-target.yaml
apiVersion: longhorn.io/v1beta2
kind: BackupTarget
metadata:
name: default
namespace: longhorn-system
spec:
backupTargetURL: "s3://atlas-soteria@us-west-004/"
credentialSecret: longhorn-backup-b2
pollInterval: 5m0s

View File

@ -6,7 +6,6 @@ resources:
- vault-serviceaccount.yaml
- secretproviderclass.yaml
- vault-sync-deployment.yaml
- backup-target.yaml
- helmrelease.yaml
- longhorn-settings-ensure-job.yaml

View File

@ -13,27 +13,9 @@ spec:
- objectName: "harbor-pull__dockerconfigjson"
secretPath: "kv/data/atlas/shared/harbor-pull"
secretKey: "dockerconfigjson"
- objectName: "longhorn_backup__AWS_ACCESS_KEY_ID"
secretPath: "kv/data/atlas/longhorn/backup-b2"
secretKey: "AWS_ACCESS_KEY_ID"
- objectName: "longhorn_backup__AWS_SECRET_ACCESS_KEY"
secretPath: "kv/data/atlas/longhorn/backup-b2"
secretKey: "AWS_SECRET_ACCESS_KEY"
- objectName: "longhorn_backup__AWS_ENDPOINTS"
secretPath: "kv/data/atlas/longhorn/backup-b2"
secretKey: "AWS_ENDPOINTS"
secretObjects:
- secretName: longhorn-registry
type: kubernetes.io/dockerconfigjson
data:
- objectName: harbor-pull__dockerconfigjson
key: .dockerconfigjson
- secretName: longhorn-backup-b2
type: Opaque
data:
- objectName: longhorn_backup__AWS_ACCESS_KEY_ID
key: AWS_ACCESS_KEY_ID
- objectName: longhorn_backup__AWS_SECRET_ACCESS_KEY
key: AWS_SECRET_ACCESS_KEY
- objectName: longhorn_backup__AWS_ENDPOINTS
key: AWS_ENDPOINTS

View File

@ -1,17 +0,0 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: nats-config
namespace: nats
labels:
app: nats
component: config
annotations:
description: "NATS JetStream configuration"
data:
nats.conf: |
jetstream {
store_dir: /data
max_mem_store: 128MB
max_file_store: 1GB
}

View File

@ -1,7 +0,0 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- configmap.yaml
- service.yaml
- statefulset.yaml

View File

@ -1,4 +0,0 @@
apiVersion: v1
kind: Namespace
metadata:
name: nats

View File

@ -1,17 +0,0 @@
apiVersion: v1
kind: Service
metadata:
name: nats
namespace: nats
labels:
app: nats
spec:
selector:
app: nats
ports:
- name: client
port: 4222
targetPort: 4222
- name: monitoring
port: 8222
targetPort: 8222

View File

@ -1,54 +0,0 @@
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: nats
namespace: nats
labels:
app: nats
spec:
serviceName: nats
replicas: 1
selector:
matchLabels:
app: nats
template:
metadata:
labels:
app: nats
spec:
containers:
- name: nats
image: nats:2.10.18
args:
- "-c"
- "/etc/nats/nats.conf"
ports:
- name: client
containerPort: 4222
- name: monitoring
containerPort: 8222
volumeMounts:
- name: config
mountPath: /etc/nats
- name: data
mountPath: /data
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
volumes:
- name: config
configMap:
name: nats-config
volumeClaimTemplates:
- metadata:
name: data
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 2Gi

View File

@ -47,7 +47,6 @@ PERCENT_THRESHOLDS = {
}
NAMESPACE_CPU_WINDOW = "1m"
GPU_RESOURCE_REGEX = r"nvidia[.]com/gpu.*|nvidia_com_gpu.*"
# ---------------------------------------------------------------------------
# Cluster metadata
@ -236,16 +235,13 @@ def gpu_util_by_hostname():
def gpu_node_labels():
return (
f'(max by (node) (kube_node_status_allocatable{{resource=~"{GPU_RESOURCE_REGEX}"}} > bool 0))'
' or kube_node_labels{label_jetson="true"}'
)
return 'kube_node_labels{label_accelerator=~".+"} or kube_node_labels{label_jetson="true"}'
def gpu_requests_by_namespace_node(scope_var):
return (
"sum by (namespace,node) ("
f'kube_pod_container_resource_requests{{resource=~"{GPU_RESOURCE_REGEX}",{scope_var}}} '
f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} '
"* on(namespace,pod) group_left(node) kube_pod_info "
f"* on(node) group_left() ({gpu_node_labels()})"
")"
@ -257,7 +253,7 @@ def gpu_usage_by_namespace(scope_var):
total_by_node = f"sum by (node) ({requests_by_ns})"
return (
"sum by (namespace) ("
f"({requests_by_ns}) / on(node) group_left() clamp_min({total_by_node}, 1) "
f"({requests_by_ns}) / clamp_min({total_by_node}, 1) "
f"* on(node) group_left() ({gpu_util_by_node()})"
")"
)

View File

@ -539,9 +539,9 @@ def main() -> int:
help="Write generated files (otherwise just print a summary).",
)
ap.add_argument(
"--sync-atlasbot",
"--sync-comms",
action="store_true",
help="Mirror rendered knowledge into services/atlasbot/knowledge for atlasbot.",
help="Mirror rendered knowledge into services/comms/knowledge for atlasbot.",
)
args = ap.parse_args()
@ -632,10 +632,10 @@ def main() -> int:
print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}")
print(f"Wrote {metrics_json_path.relative_to(REPO_ROOT)}")
if args.sync_atlasbot:
atlasbot_dir = REPO_ROOT / "services" / "atlasbot" / "knowledge"
_sync_tree(out_dir, atlasbot_dir)
print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {atlasbot_dir.relative_to(REPO_ROOT)}")
if args.sync_comms:
comms_dir = REPO_ROOT / "services" / "comms" / "knowledge"
_sync_tree(out_dir, comms_dir)
print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {comms_dir.relative_to(REPO_ROOT)}")
return 0

View File

@ -1,26 +0,0 @@
# services/atlasbot/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: atlasbot
namespace: ai
spec:
interval: 1m0s
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
git:
checkout:
ref:
branch: feature/atlasbot
commit:
author:
name: flux-bot
email: ops@bstein.dev
messageTemplate: "chore(atlasbot): automated image update"
push:
branch: feature/atlasbot
update:
path: services/atlasbot
strategy: Setters

View File

@ -1,23 +0,0 @@
# services/comms/image.yaml
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: atlasbot
namespace: ai
spec:
image: registry.bstein.dev/bstein/atlasbot
interval: 1m0s
secretRef:
name: harbor-regcred
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: atlasbot
namespace: ai
spec:
imageRepositoryRef:
name: atlasbot
policy:
semver:
range: ">=0.1.0-0"

View File

@ -1,22 +0,0 @@
Atlas Knowledge Base (KB)
This folder is the source-of-truth “memory” for Atlas/Titan assistants (and for humans). It is designed to be:
- Accurate (grounded in GitOps + read-only cluster tools)
- Maintainable (small docs + deterministic generators)
- Safe (no secrets; refer to Secret/Vault paths by name only)
Layout
- `knowledge/runbooks/`: human-written docs (short, chunkable Markdown).
- `knowledge/catalog/`: generated machine-readable facts (YAML/JSON).
- `knowledge/diagrams/`: generated Mermaid diagrams (`.mmd`) derived from the catalog.
Regeneration
- Update manifests/docs, then regenerate generated artifacts:
- `python scripts/knowledge_render_atlas.py --write`
Authoring rules
- Never include secret values. Prefer `secretRef` names or Vault paths like `kv/atlas/...`.
- Prefer stable identifiers: Kubernetes `namespace/name`, DNS hostnames, Flux kustomization paths.
- Keep each runbook small; one topic per file; use headings.
- When in doubt, link to the exact file path in this repo that configures the behavior.

View File

@ -1,8 +0,0 @@
{
"counts": {
"helmrelease_host_hints": 19,
"http_endpoints": 45,
"services": 47,
"workloads": 74
}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -1,234 +0,0 @@
flowchart LR
host_auth_bstein_dev["auth.bstein.dev"]
svc_sso_oauth2_proxy["sso/oauth2-proxy (Service)"]
host_auth_bstein_dev --> svc_sso_oauth2_proxy
wl_sso_oauth2_proxy["sso/oauth2-proxy (Deployment)"]
svc_sso_oauth2_proxy --> wl_sso_oauth2_proxy
host_bstein_dev["bstein.dev"]
svc_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Service)"]
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_frontend
wl_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Deployment)"]
svc_bstein_dev_home_bstein_dev_home_frontend --> wl_bstein_dev_home_bstein_dev_home_frontend
svc_comms_matrix_wellknown["comms/matrix-wellknown (Service)"]
host_bstein_dev --> svc_comms_matrix_wellknown
wl_comms_matrix_wellknown["comms/matrix-wellknown (Deployment)"]
svc_comms_matrix_wellknown --> wl_comms_matrix_wellknown
svc_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Service)"]
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend
wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"]
svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend
host_budget_bstein_dev["budget.bstein.dev"]
svc_finance_actual_budget["finance/actual-budget (Service)"]
host_budget_bstein_dev --> svc_finance_actual_budget
wl_finance_actual_budget["finance/actual-budget (Deployment)"]
svc_finance_actual_budget --> wl_finance_actual_budget
host_call_live_bstein_dev["call.live.bstein.dev"]
svc_comms_element_call["comms/element-call (Service)"]
host_call_live_bstein_dev --> svc_comms_element_call
wl_comms_element_call["comms/element-call (Deployment)"]
svc_comms_element_call --> wl_comms_element_call
host_chat_ai_bstein_dev["chat.ai.bstein.dev"]
svc_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Service)"]
host_chat_ai_bstein_dev --> svc_bstein_dev_home_chat_ai_gateway
wl_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Deployment)"]
svc_bstein_dev_home_chat_ai_gateway --> wl_bstein_dev_home_chat_ai_gateway
host_ci_bstein_dev["ci.bstein.dev"]
svc_jenkins_jenkins["jenkins/jenkins (Service)"]
host_ci_bstein_dev --> svc_jenkins_jenkins
wl_jenkins_jenkins["jenkins/jenkins (Deployment)"]
svc_jenkins_jenkins --> wl_jenkins_jenkins
host_cloud_bstein_dev["cloud.bstein.dev"]
svc_nextcloud_nextcloud["nextcloud/nextcloud (Service)"]
host_cloud_bstein_dev --> svc_nextcloud_nextcloud
wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"]
svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud
host_health_bstein_dev["health.bstein.dev"]
svc_health_wger["health/wger (Service)"]
host_health_bstein_dev --> svc_health_wger
wl_health_wger["health/wger (Deployment)"]
svc_health_wger --> wl_health_wger
host_kit_live_bstein_dev["kit.live.bstein.dev"]
svc_comms_livekit_token_service["comms/livekit-token-service (Service)"]
host_kit_live_bstein_dev --> svc_comms_livekit_token_service
wl_comms_livekit_token_service["comms/livekit-token-service (Deployment)"]
svc_comms_livekit_token_service --> wl_comms_livekit_token_service
svc_comms_livekit["comms/livekit (Service)"]
host_kit_live_bstein_dev --> svc_comms_livekit
wl_comms_livekit["comms/livekit (Deployment)"]
svc_comms_livekit --> wl_comms_livekit
host_live_bstein_dev["live.bstein.dev"]
host_live_bstein_dev --> svc_comms_matrix_wellknown
svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"]
host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
host_live_bstein_dev --> svc_comms_matrix_guest_register
wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
host_live_bstein_dev --> svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
host_logs_bstein_dev["logs.bstein.dev"]
svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"]
host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs
wl_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Deployment)"]
svc_logging_oauth2_proxy_logs --> wl_logging_oauth2_proxy_logs
host_longhorn_bstein_dev["longhorn.bstein.dev"]
svc_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Service)"]
host_longhorn_bstein_dev --> svc_longhorn_system_oauth2_proxy_longhorn
wl_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Deployment)"]
svc_longhorn_system_oauth2_proxy_longhorn --> wl_longhorn_system_oauth2_proxy_longhorn
host_mail_bstein_dev["mail.bstein.dev"]
svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"]
host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front
host_matrix_live_bstein_dev["matrix.live.bstein.dev"]
host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service
host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown
host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register
host_monero_bstein_dev["monero.bstein.dev"]
svc_crypto_monerod["crypto/monerod (Service)"]
host_monero_bstein_dev --> svc_crypto_monerod
wl_crypto_monerod["crypto/monerod (Deployment)"]
svc_crypto_monerod --> wl_crypto_monerod
host_money_bstein_dev["money.bstein.dev"]
svc_finance_firefly["finance/firefly (Service)"]
host_money_bstein_dev --> svc_finance_firefly
wl_finance_firefly["finance/firefly (Deployment)"]
svc_finance_firefly --> wl_finance_firefly
host_notes_bstein_dev["notes.bstein.dev"]
svc_outline_outline["outline/outline (Service)"]
host_notes_bstein_dev --> svc_outline_outline
wl_outline_outline["outline/outline (Deployment)"]
svc_outline_outline --> wl_outline_outline
host_office_bstein_dev["office.bstein.dev"]
svc_nextcloud_collabora["nextcloud/collabora (Service)"]
host_office_bstein_dev --> svc_nextcloud_collabora
wl_nextcloud_collabora["nextcloud/collabora (Deployment)"]
svc_nextcloud_collabora --> wl_nextcloud_collabora
host_pegasus_bstein_dev["pegasus.bstein.dev"]
svc_jellyfin_pegasus["jellyfin/pegasus (Service)"]
host_pegasus_bstein_dev --> svc_jellyfin_pegasus
wl_jellyfin_pegasus["jellyfin/pegasus (Deployment)"]
svc_jellyfin_pegasus --> wl_jellyfin_pegasus
host_scm_bstein_dev["scm.bstein.dev"]
svc_gitea_gitea["gitea/gitea (Service)"]
host_scm_bstein_dev --> svc_gitea_gitea
wl_gitea_gitea["gitea/gitea (Deployment)"]
svc_gitea_gitea --> wl_gitea_gitea
host_secret_bstein_dev["secret.bstein.dev"]
svc_vault_vault["vault/vault (Service)"]
host_secret_bstein_dev --> svc_vault_vault
wl_vault_vault["vault/vault (StatefulSet)"]
svc_vault_vault --> wl_vault_vault
host_sso_bstein_dev["sso.bstein.dev"]
svc_sso_keycloak["sso/keycloak (Service)"]
host_sso_bstein_dev --> svc_sso_keycloak
wl_sso_keycloak["sso/keycloak (Deployment)"]
svc_sso_keycloak --> wl_sso_keycloak
host_stream_bstein_dev["stream.bstein.dev"]
svc_jellyfin_jellyfin["jellyfin/jellyfin (Service)"]
host_stream_bstein_dev --> svc_jellyfin_jellyfin
wl_jellyfin_jellyfin["jellyfin/jellyfin (Deployment)"]
svc_jellyfin_jellyfin --> wl_jellyfin_jellyfin
host_tasks_bstein_dev["tasks.bstein.dev"]
svc_planka_planka["planka/planka (Service)"]
host_tasks_bstein_dev --> svc_planka_planka
wl_planka_planka["planka/planka (Deployment)"]
svc_planka_planka --> wl_planka_planka
host_vault_bstein_dev["vault.bstein.dev"]
svc_vaultwarden_vaultwarden_service["vaultwarden/vaultwarden-service (Service)"]
host_vault_bstein_dev --> svc_vaultwarden_vaultwarden_service
wl_vaultwarden_vaultwarden["vaultwarden/vaultwarden (Deployment)"]
svc_vaultwarden_vaultwarden_service --> wl_vaultwarden_vaultwarden
subgraph bstein_dev_home[bstein-dev-home]
svc_bstein_dev_home_bstein_dev_home_frontend
wl_bstein_dev_home_bstein_dev_home_frontend
svc_bstein_dev_home_bstein_dev_home_backend
wl_bstein_dev_home_bstein_dev_home_backend
svc_bstein_dev_home_chat_ai_gateway
wl_bstein_dev_home_chat_ai_gateway
end
subgraph comms[comms]
svc_comms_matrix_wellknown
wl_comms_matrix_wellknown
svc_comms_element_call
wl_comms_element_call
svc_comms_livekit_token_service
wl_comms_livekit_token_service
svc_comms_livekit
wl_comms_livekit
svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register
wl_comms_matrix_guest_register
svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service
end
subgraph crypto[crypto]
svc_crypto_monerod
wl_crypto_monerod
end
subgraph finance[finance]
svc_finance_actual_budget
wl_finance_actual_budget
svc_finance_firefly
wl_finance_firefly
end
subgraph gitea[gitea]
svc_gitea_gitea
wl_gitea_gitea
end
subgraph health[health]
svc_health_wger
wl_health_wger
end
subgraph jellyfin[jellyfin]
svc_jellyfin_pegasus
wl_jellyfin_pegasus
svc_jellyfin_jellyfin
wl_jellyfin_jellyfin
end
subgraph jenkins[jenkins]
svc_jenkins_jenkins
wl_jenkins_jenkins
end
subgraph logging[logging]
svc_logging_oauth2_proxy_logs
wl_logging_oauth2_proxy_logs
end
subgraph longhorn_system[longhorn-system]
svc_longhorn_system_oauth2_proxy_longhorn
wl_longhorn_system_oauth2_proxy_longhorn
end
subgraph mailu_mailserver[mailu-mailserver]
svc_mailu_mailserver_mailu_front
end
subgraph nextcloud[nextcloud]
svc_nextcloud_nextcloud
wl_nextcloud_nextcloud
svc_nextcloud_collabora
wl_nextcloud_collabora
end
subgraph outline[outline]
svc_outline_outline
wl_outline_outline
end
subgraph planka[planka]
svc_planka_planka
wl_planka_planka
end
subgraph sso[sso]
svc_sso_oauth2_proxy
wl_sso_oauth2_proxy
svc_sso_keycloak
wl_sso_keycloak
end
subgraph vault[vault]
svc_vault_vault
wl_vault_vault
end
subgraph vaultwarden[vaultwarden]
svc_vaultwarden_vaultwarden_service
wl_vaultwarden_vaultwarden
end

View File

@ -1,29 +0,0 @@
# services/atlasbot/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: ai
resources:
- atlasbot-deployment.yaml
- atlasbot-service.yaml
- atlasbot-rbac.yaml
- secretproviderclass.yaml
- vault-sync-deployment.yaml
- image.yaml
- image-automation.yaml
images:
- name: registry.bstein.dev/bstein/atlasbot
newTag: 0.1.2-97 # {"$imagepolicy": "ai:atlasbot:tag"}
configMapGenerator:
- name: atlasbot-vault-env
files:
- atlasbot_vault_env.sh=scripts/atlasbot_vault_env.sh
options:
disableNameSuffixHash: true
- name: atlas-kb
files:
- INDEX.md=knowledge/INDEX.md
- atlas.json=knowledge/catalog/atlas.json
- atlas-summary.json=knowledge/catalog/atlas-summary.json
- metrics.json=knowledge/catalog/metrics.json
- runbooks.json=knowledge/catalog/runbooks.json
- atlas-http.mmd=knowledge/diagrams/atlas-http.mmd

View File

@ -1,44 +0,0 @@
#!/usr/bin/env sh
set -eu
vault_dir="/vault/secrets"
read_secret() {
tr -d '\r\n' < "${vault_dir}/$1"
}
read_optional() {
if [ -f "${vault_dir}/$1" ]; then
tr -d '\r\n' < "${vault_dir}/$1"
else
printf ''
fi
}
export TURN_STATIC_AUTH_SECRET="$(read_secret turn-secret)"
export TURN_PASSWORD="${TURN_STATIC_AUTH_SECRET}"
export LIVEKIT_API_SECRET="$(read_secret livekit-primary)"
export LIVEKIT_SECRET="${LIVEKIT_API_SECRET}"
export BOT_PASS="$(read_secret bot-pass)"
export BOT_PASS_QUICK="$(read_optional bot-quick-pass)"
export BOT_PASS_SMART="$(read_optional bot-smart-pass)"
export BOT_PASS_GENIUS="$(read_optional bot-genius-pass)"
if [ -z "${BOT_PASS_SMART}" ]; then
export BOT_PASS_SMART="${BOT_PASS}"
fi
if [ -z "${BOT_PASS_GENIUS}" ]; then
export BOT_PASS_GENIUS="${BOT_PASS_SMART}"
fi
export SEEDER_PASS="$(read_secret seeder-pass)"
export CHAT_API_KEY="$(read_secret chat-matrix)"
export CHAT_API_HOMEPAGE="$(read_secret chat-homepage)"
export MAS_ADMIN_CLIENT_SECRET_FILE="${vault_dir}/mas-admin-secret"
export PGPASSWORD="$(read_secret synapse-db-pass)"
export MAS_DB_PASSWORD="$(read_secret mas-db-pass)"
export MATRIX_SHARED_SECRET="$(read_secret mas-matrix-shared)"
export KEYCLOAK_CLIENT_SECRET="$(read_secret mas-kc-secret)"

View File

@ -1,21 +0,0 @@
# services/atlasbot/secretproviderclass.yaml
apiVersion: secrets-store.csi.x-k8s.io/v1
kind: SecretProviderClass
metadata:
name: atlasbot-vault
namespace: ai
spec:
provider: vault
parameters:
vaultAddress: "http://vault.vault.svc.cluster.local:8200"
roleName: "ai"
objects: |
- objectName: "harbor-pull__dockerconfigjson"
secretPath: "kv/data/atlas/shared/harbor-pull"
secretKey: "dockerconfigjson"
secretObjects:
- secretName: harbor-regcred
type: kubernetes.io/dockerconfigjson
data:
- objectName: harbor-pull__dockerconfigjson
key: .dockerconfigjson

View File

@ -1,34 +0,0 @@
# services/atlasbot/vault-sync-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: atlasbot-vault-sync
namespace: ai
spec:
replicas: 1
selector:
matchLabels:
app: atlasbot-vault-sync
template:
metadata:
labels:
app: atlasbot-vault-sync
spec:
serviceAccountName: atlasbot
containers:
- name: sync
image: alpine:3.20
command: ["/bin/sh", "-c"]
args:
- "sleep infinity"
volumeMounts:
- name: vault-secrets
mountPath: /vault/secrets
readOnly: true
volumes:
- name: vault-secrets
csi:
driver: secrets-store.csi.k8s.io
readOnly: true
volumeAttributes:
secretProviderClass: atlasbot-vault

View File

@ -68,11 +68,7 @@ spec:
- name: AI_CHAT_TIMEOUT_SEC
value: "480"
- name: AI_ATLASBOT_ENDPOINT
value: http://atlasbot.ai.svc.cluster.local:8090/v1/answer
- name: AI_ATLASBOT_MODEL_FAST
value: qwen2.5:14b-instruct-q4_0
- name: AI_ATLASBOT_MODEL_SMART
value: qwen2.5:14b-instruct
value: http://atlasbot.comms.svc.cluster.local:8090/v1/answer
- name: AI_ATLASBOT_TIMEOUT_SEC
value: "30"
- name: AI_NODE_NAME

View File

@ -20,9 +20,9 @@ resources:
- ingress.yaml
images:
- name: registry.bstein.dev/bstein/bstein-dev-home-frontend
newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
- name: registry.bstein.dev/bstein/bstein-dev-home-backend
newTag: 0.1.1-119 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
configMapGenerator:
- name: chat-ai-gateway
namespace: bstein-dev-home

View File

@ -3,7 +3,7 @@ apiVersion: apps/v1
kind: Deployment
metadata:
name: atlasbot
namespace: ai
namespace: comms
labels:
app: atlasbot
spec:
@ -18,7 +18,7 @@ spec:
annotations:
checksum/atlasbot-configmap: manual-atlasbot-101
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "ai"
vault.hashicorp.com/role: "comms"
vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
vault.hashicorp.com/agent-inject-template-turn-secret: |
{{- with secret "kv/data/atlas/comms/turn-shared-secret" -}}{{ .Data.data.TURN_STATIC_AUTH_SECRET }}{{- end -}}
@ -28,15 +28,6 @@ spec:
vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-bot-quick-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-quick-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-quick-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-bot-smart-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-smart-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-smart-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-bot-genius-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-genius-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-genius-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-seeder-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-seeder-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "seeder-password" }}{{- end -}}
@ -67,17 +58,17 @@ spec:
hardware: rpi5
containers:
- name: atlasbot
image: registry.bstein.dev/bstein/atlasbot:0.1.0-55
image: python:3.11-slim
command: ["/bin/sh","-c"]
args:
- |
. /vault/scripts/atlasbot_vault_env.sh
exec python -m atlasbot.main
. /vault/scripts/comms_vault_env.sh
exec python /app/bot.py
env:
- name: MATRIX_BASE
value: http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008
value: http://othrys-synapse-matrix-synapse:8008
- name: AUTH_BASE
value: http://matrix-authentication-service.comms.svc.cluster.local:8080
value: http://matrix-authentication-service:8080
- name: KB_DIR
value: /kb
- name: VM_URL
@ -85,61 +76,27 @@ spec:
- name: ARIADNE_STATE_URL
value: http://ariadne.maintenance.svc.cluster.local/api/internal/cluster/state
- name: BOT_USER
value: atlas-smart
- name: BOT_USER_QUICK
value: atlas-quick
- name: BOT_USER_SMART
value: atlas-smart
- name: BOT_USER_GENIUS
value: atlas-genius
value: atlasbot
- name: BOT_MENTIONS
value: atlas-quick,atlas-smart,atlas-genius
value: atlasbot,aatlasbot,atlas_quick,atlas_smart
- name: OLLAMA_URL
value: http://ollama.ai.svc.cluster.local:11434
- name: OLLAMA_MODEL
value: qwen2.5:14b-instruct-q4_0
value: qwen2.5:14b-instruct
- name: ATLASBOT_MODEL_FAST
value: qwen2.5:14b-instruct-q4_0
- name: ATLASBOT_MODEL_SMART
value: qwen2.5:14b-instruct-q4_0
- name: ATLASBOT_MODEL_GENIUS
value: qwen2.5:14b-instruct-q4_0
- name: ATLASBOT_MODEL_DEEP
value: qwen2.5:14b-instruct
- name: OLLAMA_FALLBACK_MODEL
value: qwen2.5:14b-instruct-q4_0
- name: OLLAMA_TIMEOUT_SEC
value: "600"
- name: ATLASBOT_THINKING_INTERVAL_SEC
value: "30"
value: "120"
- name: ATLASBOT_SNAPSHOT_TTL_SEC
value: "30"
- name: ATLASBOT_HTTP_PORT
value: "8090"
- name: ATLASBOT_STATE_DB
value: /data/atlasbot_state.db
- name: ATLASBOT_QUEUE_ENABLED
value: "false"
- name: ATLASBOT_DEBUG_PIPELINE
value: "true"
- name: ATLASBOT_NATS_URL
value: nats://nats.nats.svc.cluster.local:4222
- name: ATLASBOT_NATS_STREAM
value: atlasbot
- name: ATLASBOT_NATS_SUBJECT
value: atlasbot.requests
- name: ATLASBOT_FAST_MAX_ANGLES
value: "2"
- name: ATLASBOT_SMART_MAX_ANGLES
value: "5"
- name: ATLASBOT_FAST_MAX_CANDIDATES
value: "2"
- name: ATLASBOT_SMART_MAX_CANDIDATES
value: "6"
- name: ATLASBOT_FAST_LLM_CALLS_MAX
value: "24"
- name: ATLASBOT_SMART_LLM_CALLS_MAX
value: "48"
- name: ATLASBOT_GENIUS_LLM_CALLS_MAX
value: "96"
ports:
- name: http
containerPort: 8090
@ -151,15 +108,19 @@ spec:
cpu: 500m
memory: 512Mi
volumeMounts:
- name: code
mountPath: /app/bot.py
subPath: bot.py
- name: kb
mountPath: /kb
readOnly: true
- name: vault-scripts
mountPath: /vault/scripts
readOnly: true
- name: atlasbot-state
mountPath: /data
volumes:
- name: code
configMap:
name: atlasbot
- name: kb
configMap:
name: atlas-kb
@ -178,7 +139,5 @@ spec:
path: diagrams/atlas-http.mmd
- name: vault-scripts
configMap:
name: atlasbot-vault-env
name: comms-vault-env
defaultMode: 0555
- name: atlasbot-state
emptyDir: {}

View File

@ -3,9 +3,7 @@ apiVersion: v1
kind: ServiceAccount
metadata:
name: atlasbot
namespace: ai
imagePullSecrets:
- name: harbor-regcred
namespace: comms
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
@ -45,4 +43,5 @@ roleRef:
subjects:
- kind: ServiceAccount
name: atlasbot
namespace: ai
namespace: comms

View File

@ -2,7 +2,7 @@ apiVersion: v1
kind: Service
metadata:
name: atlasbot
namespace: ai
namespace: comms
labels:
app: atlasbot
spec:

View File

@ -13,7 +13,10 @@ resources:
- element-call-deployment.yaml
- guest-register-deployment.yaml
- guest-register-service.yaml
- atlasbot-deployment.yaml
- atlasbot-service.yaml
- wellknown.yaml
- atlasbot-rbac.yaml
- mas-secrets-ensure-rbac.yaml
- comms-secrets-ensure-rbac.yaml
- mas-db-ensure-rbac.yaml
@ -40,6 +43,7 @@ resources:
- livekit-ingress.yaml
- livekit-middlewares.yaml
- matrix-ingress.yaml
configMapGenerator:
- name: comms-vault-env
files:
@ -56,8 +60,21 @@ configMapGenerator:
- server.py=scripts/guest-register/server.py
options:
disableNameSuffixHash: true
- name: atlasbot
files:
- bot.py=scripts/atlasbot/bot.py
options:
disableNameSuffixHash: true
- name: othrys-element-host-config
files:
- 20-host-config.sh=scripts/element-host-config.sh
options:
disableNameSuffixHash: true
- name: atlas-kb
files:
- INDEX.md=knowledge/INDEX.md
- atlas.json=knowledge/catalog/atlas.json
- atlas-summary.json=knowledge/catalog/atlas-summary.json
- metrics.json=knowledge/catalog/metrics.json
- runbooks.json=knowledge/catalog/runbooks.json
- atlas-http.mmd=knowledge/diagrams/atlas-http.mmd

View File

@ -1,12 +1,12 @@
# services/comms/oneoffs/comms-secrets-ensure-job.yaml
# One-off job for comms/comms-secrets-ensure-8.
# Purpose: comms secrets ensure 8 (see container args/env in this file).
# One-off job for comms/comms-secrets-ensure-7.
# Purpose: comms secrets ensure 7 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: comms-secrets-ensure-8
name: comms-secrets-ensure-7
namespace: comms
spec:
suspend: true
@ -87,9 +87,6 @@ spec:
ensure_key "comms/synapse-redis" "redis-password" >/dev/null
ensure_key "comms/synapse-macaroon" "macaroon_secret_key" >/dev/null
ensure_key "comms/atlasbot-credentials-runtime" "bot-password" >/dev/null
ensure_key "comms/atlasbot-credentials-runtime" "bot-quick-password" >/dev/null
ensure_key "comms/atlasbot-credentials-runtime" "bot-smart-password" >/dev/null
ensure_key "comms/atlasbot-credentials-runtime" "bot-genius-password" >/dev/null
ensure_key "comms/atlasbot-credentials-runtime" "seeder-password" >/dev/null
SYN_PASS="$(ensure_key "comms/synapse-db" "POSTGRES_PASSWORD")"

View File

@ -1,12 +1,12 @@
# services/comms/oneoffs/mas-local-users-ensure-job.yaml
# One-off job for comms/mas-local-users-ensure-19.
# One-off job for comms/mas-local-users-ensure-18.
# Purpose: mas local users ensure 18 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: mas-local-users-ensure-19
name: mas-local-users-ensure-18
namespace: comms
spec:
suspend: true
@ -27,12 +27,6 @@ spec:
vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-bot-quick-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-quick-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-quick-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-bot-smart-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-smart-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-smart-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-seeder-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-seeder-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "seeder-password" }}{{- end -}}
@ -98,13 +92,7 @@ spec:
- name: SEEDER_USER
value: othrys-seeder
- name: BOT_USER
value: atlas-smart
- name: BOT_USER_QUICK
value: atlas-quick
- name: BOT_USER_SMART
value: atlas-smart
- name: BOT_USER_GENIUS
value: atlas-genius
value: atlasbot
command:
- /bin/sh
- -c
@ -237,27 +225,11 @@ spec:
},
timeout=30,
)
if r.status_code == 429:
return False
if r.status_code != 200:
raise RuntimeError(f"login failed for {username}: {r.status_code} {r.text}")
return True
wait_for_service(MAS_ADMIN_API_BASE)
token = admin_token()
bot_quick = os.environ.get("BOT_USER_QUICK", "")
bot_smart = os.environ.get("BOT_USER_SMART", "")
bot_genius = os.environ.get("BOT_USER_GENIUS", "")
bot_quick_pass = os.environ.get("BOT_PASS_QUICK", "")
bot_smart_pass = os.environ.get("BOT_PASS_SMART", "")
bot_genius_pass = os.environ.get("BOT_PASS_GENIUS", "") or bot_smart_pass
ensure_user(token, os.environ["SEEDER_USER"], os.environ["SEEDER_PASS"])
ensure_user(token, os.environ["BOT_USER"], os.environ["BOT_PASS"])
if bot_quick and bot_quick_pass:
ensure_user(token, bot_quick, bot_quick_pass)
if bot_smart and bot_smart_pass:
ensure_user(token, bot_smart, bot_smart_pass)
if bot_genius and bot_genius_pass:
ensure_user(token, bot_genius, bot_genius_pass)
PY

View File

@ -1,15 +1,15 @@
# services/comms/oneoffs/synapse-admin-ensure-job.yaml
# One-off job for comms/synapse-admin-ensure-15.
# Purpose: synapse admin ensure 15 (see container args/env in this file).
# One-off job for comms/synapse-admin-ensure-3.
# Purpose: synapse admin ensure 3 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: synapse-admin-ensure-15
name: synapse-admin-ensure-3
namespace: comms
spec:
suspend: false
suspend: true
backoffLimit: 0
ttlSecondsAfterFinished: 3600
template:
@ -32,8 +32,7 @@ spec:
values: ["arm64"]
containers:
- name: ensure
image: python:3.12-slim
imagePullPolicy: Always
image: python:3.11-slim
env:
- name: VAULT_ADDR
value: http://vault.vault.svc.cluster.local:8200
@ -46,20 +45,22 @@ spec:
- -c
- |
set -euo pipefail
python -m pip install --no-cache-dir psycopg2-binary
pip install --no-cache-dir psycopg2-binary bcrypt
python - <<'PY'
import json
import os
import secrets
import string
import time
import urllib.error
import urllib.parse
import urllib.request
import bcrypt
import psycopg2
VAULT_ADDR = os.environ.get("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200").rstrip("/")
VAULT_ROLE = os.environ.get("VAULT_ROLE", "comms-secrets")
SA_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token"
SYNAPSE_ADMIN_URL = os.environ.get("SYNAPSE_ADMIN_URL", "").rstrip("/")
PGHOST = "postgres-service.postgres.svc.cluster.local"
PGPORT = 5432
PGDATABASE = "synapse"
@ -112,15 +113,48 @@ spec:
with urllib.request.urlopen(req, timeout=30) as resp:
resp.read()
def random_password(length: int = 32) -> str:
alphabet = string.ascii_letters + string.digits
return "".join(secrets.choice(alphabet) for _ in range(length))
def ensure_admin_creds(token: str) -> dict:
data = vault_get(token, "comms/synapse-admin")
username = "othrys-seeder"
if data.get("username") != username:
data["username"] = username
data.pop("access_token", None)
username = (data.get("username") or "").strip() or "synapse-admin"
password = (data.get("password") or "").strip()
if not password:
password = random_password()
data["username"] = username
data["password"] = password
vault_put(token, "comms/synapse-admin", data)
return data
def ensure_user(cur, cols, user_id, password, admin):
now_ms = int(time.time() * 1000)
values = {
"name": user_id,
"password_hash": bcrypt.hashpw(password.encode(), bcrypt.gensalt()).decode(),
"creation_ts": now_ms,
}
def add_flag(name, flag):
if name not in cols:
return
if cols[name]["type"] in ("smallint", "integer"):
values[name] = int(flag)
else:
values[name] = bool(flag)
add_flag("admin", admin)
add_flag("deactivated", False)
add_flag("shadow_banned", False)
add_flag("is_guest", False)
columns = list(values.keys())
placeholders = ", ".join(["%s"] * len(columns))
updates = ", ".join([f"{col}=EXCLUDED.{col}" for col in columns if col != "name"])
query = f"INSERT INTO users ({', '.join(columns)}) VALUES ({placeholders}) ON CONFLICT (name) DO UPDATE SET {updates};"
cur.execute(query, [values[c] for c in columns])
def get_cols(cur):
cur.execute(
"""
@ -138,40 +172,30 @@ spec:
}
return cols
def admin_token_valid(token: str, user_id: str) -> bool:
if not token or not SYNAPSE_ADMIN_URL:
return False
encoded = urllib.parse.quote(user_id, safe="")
url = f"{SYNAPSE_ADMIN_URL}/_synapse/admin/v2/users/{encoded}"
req = urllib.request.Request(url, headers={"Authorization": f"Bearer {token}"})
try:
with urllib.request.urlopen(req, timeout=30) as resp:
resp.read()
return True
except urllib.error.HTTPError as exc:
if exc.code == 404:
return True
if exc.code in (401, 403):
return False
raise
def ensure_access_token(cur, user_id, token_value):
cur.execute("SELECT COALESCE(MAX(id), 0) + 1 FROM access_tokens")
token_id = cur.fetchone()[0]
cur.execute(
"""
INSERT INTO access_tokens (id, user_id, token, device_id, valid_until_ms)
VALUES (%s, %s, %s, %s, NULL)
ON CONFLICT (token) DO NOTHING
""",
(token_id, user_id, token_value, "ariadne-admin"),
)
vault_token = vault_login()
admin_data = ensure_admin_creds(vault_token)
user_id = f"@{admin_data['username']}:live.bstein.dev"
existing_token = admin_data.get("access_token")
if existing_token and admin_token_valid(existing_token, user_id):
log("synapse admin token already present and valid")
if admin_data.get("access_token"):
log("synapse admin token already present")
raise SystemExit(0)
if existing_token:
log("synapse admin token invalid; rotating")
admin_data.pop("access_token", None)
vault_put(vault_token, "comms/synapse-admin", admin_data)
synapse_db = vault_get(vault_token, "comms/synapse-db")
pg_password = synapse_db.get("POSTGRES_PASSWORD")
if not pg_password:
raise RuntimeError("synapse db password missing")
user_id = f"@{admin_data['username']}:live.bstein.dev"
conn = psycopg2.connect(
host=PGHOST,
port=PGPORT,
@ -179,34 +203,17 @@ spec:
user=PGUSER,
password=pg_password,
)
token_value = secrets.token_urlsafe(32)
try:
with conn:
with conn.cursor() as cur:
cols = get_cols(cur)
if "admin" not in cols:
raise RuntimeError("users.admin column missing")
cur.execute(
"UPDATE users SET admin = TRUE WHERE name = %s",
(user_id,),
)
cur.execute(
"""
SELECT token FROM access_tokens
WHERE user_id = %s AND valid_until_ms IS NULL
ORDER BY id DESC LIMIT 1
""",
(user_id,),
)
row = cur.fetchone()
if not row:
raise RuntimeError(f"no access token found for {user_id}")
token_value = row[0]
ensure_user(cur, cols, user_id, admin_data["password"], True)
ensure_access_token(cur, user_id, token_value)
finally:
conn.close()
admin_data["access_token"] = token_value
vault_put(vault_token, "comms/synapse-admin", admin_data)
if not admin_token_valid(token_value, user_id):
raise RuntimeError("synapse admin token validation failed")
log("synapse admin token stored")
PY

View File

@ -82,6 +82,8 @@ spec:
value: synapse
- name: SEEDER_USER
value: othrys-seeder
- name: BOT_USER
value: atlasbot
command:
- /bin/sh
- -c
@ -139,8 +141,10 @@ spec:
cur.execute(query, [values[c] for c in columns])
seeder_user = os.environ["SEEDER_USER"]
bot_user = os.environ["BOT_USER"]
server = "live.bstein.dev"
seeder_id = f"@{seeder_user}:{server}"
bot_id = f"@{bot_user}:{server}"
conn = psycopg2.connect(
host=os.environ["PGHOST"],
@ -154,6 +158,7 @@ spec:
with conn.cursor() as cur:
cols = get_cols(cur)
upsert_user(cur, cols, seeder_id, os.environ["SEEDER_PASS"], True)
upsert_user(cur, cols, bot_id, os.environ["BOT_PASS"], False)
finally:
conn.close()
PY

View File

@ -76,7 +76,7 @@ spec:
- name: SEEDER_USER
value: othrys-seeder
- name: BOT_USER
value: atlas-smart
value: atlasbot
command:
- /bin/sh
- -c

View File

@ -11,12 +11,8 @@ from urllib import error, parse, request
BASE = os.environ.get("MATRIX_BASE", "http://othrys-synapse-matrix-synapse:8008")
AUTH_BASE = os.environ.get("AUTH_BASE", "http://matrix-authentication-service:8080")
BOT_USER = os.environ["BOT_USER"]
BOT_PASS = os.environ["BOT_PASS"]
BOT_USER_QUICK = os.environ.get("BOT_USER_QUICK", "").strip()
BOT_PASS_QUICK = os.environ.get("BOT_PASS_QUICK", "").strip()
BOT_USER_SMART = os.environ.get("BOT_USER_SMART", "").strip()
BOT_PASS_SMART = os.environ.get("BOT_PASS_SMART", "").strip()
USER = os.environ["BOT_USER"]
PASSWORD = os.environ["BOT_PASS"]
ROOM_ALIAS = "#othrys:live.bstein.dev"
OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/")
@ -35,7 +31,7 @@ VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitor
ARIADNE_STATE_URL = os.environ.get("ARIADNE_STATE_URL", "")
ARIADNE_STATE_TOKEN = os.environ.get("ARIADNE_STATE_TOKEN", "")
BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{BOT_USER},atlas")
BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{USER},atlas")
SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev")
MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500"))
@ -397,31 +393,6 @@ def _detect_mode_from_body(body: str, *, default: str = "deep") -> str:
return default
def _detect_mode(
content: dict[str, Any],
body: str,
*,
default: str = "deep",
account_user: str = "",
) -> str:
mode = _detect_mode_from_body(body, default=default)
mentions = content.get("m.mentions", {})
user_ids = mentions.get("user_ids", [])
if isinstance(user_ids, list):
normalized = {normalize_user_id(uid).lower() for uid in user_ids if isinstance(uid, str)}
if BOT_USER_QUICK and normalize_user_id(BOT_USER_QUICK).lower() in normalized:
return "fast"
if BOT_USER_SMART and normalize_user_id(BOT_USER_SMART).lower() in normalized:
return "deep"
if BOT_USER and normalize_user_id(BOT_USER).lower() in normalized:
return "deep"
if account_user and BOT_USER_QUICK and normalize_user_id(account_user) == normalize_user_id(BOT_USER_QUICK):
return "fast"
if account_user and BOT_USER_SMART and normalize_user_id(account_user) == normalize_user_id(BOT_USER_SMART):
return "deep"
return mode
def _model_for_mode(mode: str) -> str:
if mode == "fast" and MODEL_FAST:
return MODEL_FAST
@ -445,12 +416,12 @@ def req(method: str, path: str, token: str | None = None, body=None, timeout=60,
raw = resp.read()
return json.loads(raw.decode()) if raw else {}
def login(user: str, password: str) -> str:
login_user = normalize_user_id(user)
def login() -> str:
login_user = normalize_user_id(USER)
payload = {
"type": "m.login.password",
"identifier": {"type": "m.id.user", "user": login_user},
"password": password,
"password": PASSWORD,
}
res = req("POST", "/_matrix/client/v3/login", body=payload, base=AUTH_BASE)
return res["access_token"]
@ -4849,7 +4820,7 @@ def open_ended_with_thinking(
thread.join(timeout=1)
return result["reply"] or "Model backend is busy. Try again in a moment."
def sync_loop(token: str, room_id: str, *, account_user: str, default_mode: str):
def sync_loop(token: str, room_id: str):
since = None
try:
res = req("GET", "/_matrix/client/v3/sync?timeout=0", token, timeout=10)
@ -4890,7 +4861,7 @@ def sync_loop(token: str, room_id: str, *, account_user: str, default_mode: str)
if not body:
continue
sender = ev.get("sender", "")
if account_user and sender == normalize_user_id(account_user):
if sender == f"@{USER}:live.bstein.dev":
continue
mentioned = is_mentioned(content, body)
@ -4903,12 +4874,7 @@ def sync_loop(token: str, room_id: str, *, account_user: str, default_mode: str)
cleaned_body = _strip_bot_mention(body)
lower_body = cleaned_body.lower()
mode = _detect_mode(
content,
body,
default=default_mode if default_mode in ("fast", "deep") else "deep",
account_user=account_user,
)
mode = _detect_mode_from_body(body, default="deep" if is_dm else "deep")
# Only do live cluster introspection in DMs.
allow_tools = is_dm
@ -4985,65 +4951,26 @@ def sync_loop(token: str, room_id: str, *, account_user: str, default_mode: str)
history[hist_key].append(f"Atlas: {reply}")
history[hist_key] = history[hist_key][-80:]
def login_with_retry(user: str, password: str):
def login_with_retry():
last_err = None
for attempt in range(10):
try:
return login(user, password)
return login()
except Exception as exc: # noqa: BLE001
last_err = exc
time.sleep(min(30, 2 ** attempt))
raise last_err
def _bot_accounts() -> list[dict[str, str]]:
accounts: list[dict[str, str]] = []
def add(user: str, password: str, mode: str):
if not user or not password:
return
accounts.append({"user": user, "password": password, "mode": mode})
add(BOT_USER_SMART or BOT_USER, BOT_PASS_SMART or BOT_PASS, "deep")
if BOT_USER_QUICK and BOT_PASS_QUICK:
add(BOT_USER_QUICK, BOT_PASS_QUICK, "fast")
if BOT_USER and BOT_PASS and all(acc["user"] != BOT_USER for acc in accounts):
add(BOT_USER, BOT_PASS, "deep")
seen: set[str] = set()
unique: list[dict[str, str]] = []
for acc in accounts:
uid = normalize_user_id(acc["user"]).lower()
if uid in seen:
continue
seen.add(uid)
unique.append(acc)
return unique
def main():
load_kb()
_start_http_server()
accounts = _bot_accounts()
threads: list[threading.Thread] = []
for acc in accounts:
token = login_with_retry(acc["user"], acc["password"])
try:
room_id = resolve_alias(token, ROOM_ALIAS)
join_room(token, room_id)
except Exception:
room_id = None
thread = threading.Thread(
target=sync_loop,
args=(token, room_id),
kwargs={
"account_user": acc["user"],
"default_mode": acc["mode"],
},
daemon=True,
)
thread.start()
threads.append(thread)
for thread in threads:
thread.join()
token = login_with_retry()
try:
room_id = resolve_alias(token, ROOM_ALIAS)
join_room(token, room_id)
except Exception:
room_id = None
sync_loop(token, room_id)
if __name__ == "__main__":
main()

View File

@ -7,14 +7,6 @@ read_secret() {
tr -d '\r\n' < "${vault_dir}/$1"
}
read_optional() {
if [ -f "${vault_dir}/$1" ]; then
tr -d '\r\n' < "${vault_dir}/$1"
else
printf ''
fi
}
export TURN_STATIC_AUTH_SECRET="$(read_secret turn-secret)"
export TURN_PASSWORD="${TURN_STATIC_AUTH_SECRET}"
@ -22,15 +14,6 @@ export LIVEKIT_API_SECRET="$(read_secret livekit-primary)"
export LIVEKIT_SECRET="${LIVEKIT_API_SECRET}"
export BOT_PASS="$(read_secret bot-pass)"
export BOT_PASS_QUICK="$(read_optional bot-quick-pass)"
export BOT_PASS_SMART="$(read_optional bot-smart-pass)"
export BOT_PASS_GENIUS="$(read_optional bot-genius-pass)"
if [ -z "${BOT_PASS_SMART}" ]; then
export BOT_PASS_SMART="${BOT_PASS}"
fi
if [ -z "${BOT_PASS_GENIUS}" ]; then
export BOT_PASS_GENIUS="${BOT_PASS_SMART}"
fi
export SEEDER_PASS="$(read_secret seeder-pass)"
export CHAT_API_KEY="$(read_secret chat-matrix)"

View File

@ -66,7 +66,7 @@ spec:
- name: SEEDER_USER
value: othrys-seeder
- name: BOT_USER
value: atlas-smart
value: atlasbot
command:
- /bin/sh
- -c

View File

@ -29,18 +29,12 @@ spec:
operator: In
values: ["rpi4","rpi5"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 80
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 60
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values: ["titan-12","titan-13","titan-15","titan-17","titan-19"]
values: ["rpi4"]
containers:
- name: monerod
image: registry.bstein.dev/crypto/monerod:0.18.4.1

View File

@ -23,7 +23,7 @@ spec:
- matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
values: ["rpi4","rpi5"]
containers:
- name: xmrig
image: ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9

View File

@ -123,22 +123,13 @@ spec:
- key: hardware
operator: In
values: ["rpi4","rpi5"]
- key: longhorn
operator: NotIn
values: ["true"]
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values: ["titan-13","titan-15","titan-17","titan-19"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
values: ["rpi4"]
containers:
- name: gitea
image: gitea/gitea:1.23

View File

@ -245,17 +245,6 @@ spec:
image:
repository: registry.bstein.dev/infra/harbor-registry
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-registry:tag"}
extraEnvVars:
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_NAME
value: harbor-core
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_URL
value: http://harbor-registry:8080/service/notifications
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_TIMEOUT
value: 5s
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_THRESHOLD
value: "5"
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_BACKOFF
value: 1s
controller:
image:
repository: registry.bstein.dev/infra/harbor-registryctl
@ -274,10 +263,6 @@ spec:
export REGISTRY_HTTP_SECRET="{{ .Data.data.REGISTRY_HTTP_SECRET }}"
export REGISTRY_REDIS_PASSWORD="{{ .Data.data.REGISTRY_REDIS_PASSWORD }}"
{{ end }}
{{ with secret "kv/data/atlas/harbor/harbor-jobservice" }}
export JOBSERVICE_SECRET="{{ .Data.data.JOBSERVICE_SECRET }}"
export REGISTRY_NOTIFICATIONS_ENDPOINTS_0_HEADERS_Authorization="Harbor-Secret ${JOBSERVICE_SECRET}"
{{ end }}
vault.hashicorp.com/agent-inject-secret-harbor-registryctl-env.sh: "kv/data/atlas/harbor/harbor-registry"
vault.hashicorp.com/agent-inject-template-harbor-registryctl-env.sh: |
{{ with secret "kv/data/atlas/harbor/harbor-core" }}
@ -412,10 +397,10 @@ spec:
patch: |-
- op: replace
path: /spec/rules/0/http/paths/2/backend/service/name
value: harbor-core
value: harbor-registry
- op: replace
path: /spec/rules/0/http/paths/2/backend/service/port/number
value: 80
value: 5000
- target:
kind: Deployment
name: harbor-jobservice
@ -479,16 +464,6 @@ spec:
value: /vault/secrets/harbor-registry-env.sh
- name: VAULT_COPY_FILES
value: /vault/secrets/harbor-registry-htpasswd:/etc/registry/passwd
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_NAME
value: harbor-core
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_URL
value: http://harbor-registry:8080/service/notifications
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_TIMEOUT
value: 5s
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_THRESHOLD
value: "5"
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_BACKOFF
value: 1s
envFrom:
- $patch: replace
volumeMounts:

View File

@ -67,7 +67,7 @@ data:
url('https://scm.bstein.dev/bstein/harbor-arm-build.git')
credentials('gitea-pat')
}
branches('*/main')
branches('*/master')
}
}
}
@ -108,7 +108,7 @@ data:
url('https://scm.bstein.dev/bstein/ci-demo.git')
credentials('gitea-pat')
}
branches('*/main')
branches('*/master')
}
}
scriptPath('Jenkinsfile')
@ -167,58 +167,6 @@ data:
}
}
}
pipelineJob('atlasbot') {
properties {
pipelineTriggers {
triggers {
scmTrigger {
scmpoll_spec('H/2 * * * *')
ignorePostCommitHooks(false)
}
}
}
}
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/atlasbot.git')
credentials('gitea-pat')
}
branches('*/main')
}
}
scriptPath('Jenkinsfile')
}
}
}
pipelineJob('Soteria') {
properties {
pipelineTriggers {
triggers {
scmTrigger {
scmpoll_spec('H/5 * * * *')
ignorePostCommitHooks(false)
}
}
}
}
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/soteria.git')
credentials('gitea-pat')
}
branches('*/main')
}
}
scriptPath('Jenkinsfile')
}
}
}
pipelineJob('data-prepper') {
properties {
pipelineTriggers {

View File

@ -48,7 +48,7 @@ spec:
TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }}
GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME={{ .Data.data.git_notify_bstein_dev_home }}
{{ end }}
bstein.dev/restarted-at: "2026-02-02T15:10:33Z"
bstein.dev/restarted-at: "2026-01-20T14:52:41Z"
spec:
serviceAccountName: jenkins
nodeSelector:

View File

@ -1,13 +0,0 @@
# services/jenkins/dind-pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: jenkins-dind-cache
namespace: jenkins
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 30Gi
storageClassName: astreae

View File

@ -8,7 +8,6 @@ resources:
- vault-serviceaccount.yaml
- pvc.yaml
- cache-pvc.yaml
- dind-pvc.yaml
- plugins-pvc.yaml
- configmap-jcasc.yaml
- configmap-plugins.yaml

View File

@ -1,12 +1,12 @@
# services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml
# One-off job for sso/keycloak-portal-e2e-execute-actions-email-18.
# Purpose: keycloak portal e2e execute actions email 18 (see container args/env in this file).
# One-off job for sso/keycloak-portal-e2e-execute-actions-email-14.
# Purpose: keycloak portal e2e execute actions email 14 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: keycloak-portal-e2e-execute-actions-email-18
name: keycloak-portal-e2e-execute-actions-email-14
namespace: sso
spec:
suspend: true
@ -70,7 +70,7 @@ spec:
- name: E2E_PROBE_USERNAME
value: robotuser
- name: E2E_PROBE_EMAIL
value: brad.stein+robot@gmail.com
value: robotuser@bstein.dev
- name: EXECUTE_ACTIONS_CLIENT_ID
value: bstein-dev-home
- name: EXECUTE_ACTIONS_REDIRECT_URI

View File

@ -1,12 +1,12 @@
# services/keycloak/oneoffs/realm-settings-job.yaml
# One-off job for sso/keycloak-realm-settings-38.
# Purpose: keycloak realm settings 38 (see container args/env in this file).
# One-off job for sso/keycloak-realm-settings-36.
# Purpose: keycloak realm settings 36 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1
kind: Job
metadata:
name: keycloak-realm-settings-38
name: keycloak-realm-settings-36
namespace: sso
spec:
suspend: true
@ -64,7 +64,7 @@ spec:
- name: KEYCLOAK_REALM
value: atlas
- name: KEYCLOAK_SMTP_HOST
value: smtp.postmarkapp.com
value: mail.bstein.dev
- name: KEYCLOAK_SMTP_PORT
value: "587"
- name: KEYCLOAK_SMTP_FROM

View File

@ -18,7 +18,6 @@ spec:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
maintenance.bstein.dev/restart-rev: "20260207-2"
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "maintenance"
vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db"
@ -106,7 +105,7 @@ spec:
node-role.kubernetes.io/worker: "true"
containers:
- name: ariadne
image: registry.bstein.dev/bstein/ariadne:latest
image: registry.bstein.dev/bstein/ariadne:0.1.0-0
imagePullPolicy: Always
command: ["/bin/sh", "-c"]
args:
@ -286,7 +285,7 @@ spec:
- name: ARIADNE_SCHEDULE_MAILU_SYNC
value: "30 4 * * *"
- name: ARIADNE_SCHEDULE_NEXTCLOUD_SYNC
value: "*/15 * * * *"
value: "0 5 * * *"
- name: ARIADNE_SCHEDULE_NEXTCLOUD_CRON
value: "*/5 * * * *"
- name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE
@ -294,11 +293,11 @@ spec:
- name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC
value: "0 * * * *"
- name: ARIADNE_SCHEDULE_WGER_USER_SYNC
value: "*/15 * * * *"
value: "0 5 * * *"
- name: ARIADNE_SCHEDULE_WGER_ADMIN
value: "15 3 * * *"
- name: ARIADNE_SCHEDULE_FIREFLY_USER_SYNC
value: "*/15 * * * *"
value: "0 6 * * *"
- name: ARIADNE_SCHEDULE_FIREFLY_CRON
value: "0 3 * * *"
- name: ARIADNE_SCHEDULE_POD_CLEANER
@ -306,11 +305,11 @@ spec:
- name: ARIADNE_SCHEDULE_OPENSEARCH_PRUNE
value: "23 3 * * *"
- name: ARIADNE_SCHEDULE_IMAGE_SWEEPER
value: "30 4 * * *"
value: "30 4 * * 0"
- name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH
value: "*/15 * * * *"
value: "0 * * * *"
- name: ARIADNE_SCHEDULE_VAULT_OIDC
value: "*/15 * * * *"
value: "0 * * * *"
- name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME
value: "*/5 * * * *"
- name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE
@ -331,8 +330,6 @@ spec:
value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428
- name: ARIADNE_CLUSTER_STATE_VM_TIMEOUT_SEC
value: "5"
- name: ARIADNE_ALERTMANAGER_URL
value: http://alertmanager.monitoring.svc.cluster.local
- name: OPENSEARCH_URL
value: http://opensearch-master.logging.svc.cluster.local:9200
- name: OPENSEARCH_LIMIT_BYTES

View File

@ -29,29 +29,6 @@ rules:
- get
- list
- watch
- apiGroups: ["apps"]
resources:
- deployments
- statefulsets
- daemonsets
verbs:
- get
- list
- watch
- apiGroups: ["longhorn.io"]
resources:
- volumes
verbs:
- get
- list
- watch
- apiGroups: [""]
resources:
- events
verbs:
- get
- list
- watch
- apiGroups: [""]
resources:
- pods/exec
@ -79,17 +56,3 @@ roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: ariadne-job-spawner
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: ariadne-auth-delegator
subjects:
- kind: ServiceAccount
name: ariadne
namespace: maintenance
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: system:auth-delegator

View File

@ -21,26 +21,3 @@ spec:
policy:
semver:
range: ">=0.1.0-0"
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: soteria
namespace: maintenance
spec:
image: registry.bstein.dev/bstein/soteria
interval: 1m0s
secretRef:
name: harbor-regcred
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: soteria
namespace: maintenance
spec:
imageRepositoryRef:
name: soteria
policy:
semver:
range: ">=0.1.0-0"

View File

@ -5,7 +5,6 @@ resources:
- namespace.yaml
- image.yaml
- secretproviderclass.yaml
- soteria-configmap.yaml
- vault-serviceaccount.yaml
- vault-sync-deployment.yaml
- ariadne-serviceaccount.yaml
@ -14,12 +13,9 @@ resources:
- k3s-traefik-cleanup-rbac.yaml
- node-nofile-serviceaccount.yaml
- pod-cleaner-rbac.yaml
- soteria-serviceaccount.yaml
- soteria-rbac.yaml
- ariadne-deployment.yaml
- oneoffs/ariadne-migrate-job.yaml
- ariadne-service.yaml
- soteria-deployment.yaml
- disable-k3s-traefik-daemonset.yaml
- oneoffs/k3s-traefik-cleanup-job.yaml
- node-nofile-daemonset.yaml
@ -28,12 +24,9 @@ resources:
- node-image-sweeper-serviceaccount.yaml
- node-image-sweeper-daemonset.yaml
- image-sweeper-cronjob.yaml
- soteria-service.yaml
images:
- name: registry.bstein.dev/bstein/ariadne
newTag: 0.1.0-22 # {"$imagepolicy": "maintenance:ariadne:tag"}
- name: registry.bstein.dev/bstein/soteria
newTag: 0.1.0-11 # {"$imagepolicy": "maintenance:soteria:tag"}
newTag: 0.1.0-59 # {"$imagepolicy": "maintenance:ariadne:tag"}
configMapGenerator:
- name: disable-k3s-traefik-script
namespace: maintenance

View File

@ -1,10 +0,0 @@
# services/maintenance/soteria-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: soteria
namespace: maintenance
data:
SOTERIA_BACKUP_DRIVER: "longhorn"
SOTERIA_LONGHORN_URL: "http://longhorn-backend.longhorn-system.svc:9500"
SOTERIA_LONGHORN_BACKUP_MODE: "incremental"

View File

@ -1,73 +0,0 @@
# services/maintenance/soteria-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: soteria
namespace: maintenance
spec:
replicas: 1
revisionHistoryLimit: 3
selector:
matchLabels:
app: soteria
template:
metadata:
labels:
app: soteria
spec:
serviceAccountName: soteria
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 90
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5"]
- weight: 50
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi4"]
containers:
- name: soteria
image: registry.bstein.dev/bstein/soteria:latest
imagePullPolicy: Always
ports:
- name: http
containerPort: 8080
envFrom:
- configMapRef:
name: soteria
livenessProbe:
httpGet:
path: /healthz
port: http
initialDelaySeconds: 5
periodSeconds: 10
timeoutSeconds: 2
readinessProbe:
httpGet:
path: /readyz
port: http
initialDelaySeconds: 2
periodSeconds: 5
timeoutSeconds: 2
resources:
requests:
cpu: 50m
memory: 64Mi
limits:
cpu: 200m
memory: 256Mi
securityContext:
allowPrivilegeEscalation: false
runAsNonRoot: true
runAsUser: 65532
capabilities:
drop: ["ALL"]

View File

@ -1,22 +0,0 @@
# services/maintenance/soteria-rbac.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: soteria
rules:
- apiGroups: [""]
resources: ["persistentvolumeclaims", "persistentvolumes"]
verbs: ["get", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: soteria
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: soteria
subjects:
- kind: ServiceAccount
name: soteria
namespace: maintenance

View File

@ -1,14 +0,0 @@
# services/maintenance/soteria-service.yaml
# In-cluster Service for soteria: port 80 forwarded to the container's
# named "http" port (8080 in the soteria Deployment).
apiVersion: v1
kind: Service
metadata:
  name: soteria
  namespace: maintenance
spec:
  type: ClusterIP
  selector:
    app: soteria
  ports:
    - name: http
      port: 80
      targetPort: http

View File

@ -1,8 +0,0 @@
# services/maintenance/soteria-serviceaccount.yaml
# ServiceAccount used by the soteria Deployment; carries the registry
# credential so pods can pull from the private Harbor registry.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: soteria
  namespace: maintenance
imagePullSecrets:
  - name: harbor-regcred

View File

@ -20,7 +20,7 @@
},
"targets": [
{
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) 
(label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -89,7 +89,7 @@
},
"targets": [
{
"expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
"expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
"refId": "A",
"legendFormat": "{{namespace}}"
}

View File

@ -1901,7 +1901,7 @@
},
"targets": [
{
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) 
(label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"refId": "A",
"legendFormat": "{{namespace}}"
}

View File

@ -145,7 +145,7 @@ data:
model:
intervalMs: 60000
maxDataPoints: 43200
expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
          expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
legendFormat: '{{instance}}'
datasource:
type: prometheus
@ -286,56 +286,8 @@ data:
summary: "node-image-sweeper not fully ready"
labels:
severity: warning
- uid: maint-ariadne-image-sweeper-stale
title: "Ariadne image sweeper stale (schedule >8d)"
condition: C
for: "5m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
expr: time() - ariadne_schedule_last_success_timestamp_seconds{task="schedule.image_sweeper"}
intervalMs: 60000
maxDataPoints: 43200
legendFormat: '{{task}}'
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [691200]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Error
annotations:
summary: "Ariadne image sweeper stale >8d since last success"
labels:
severity: warning
- uid: maint-cron-stale
title: "Maintenance CronJobs stale (legacy disabled)"
title: "Maintenance CronJobs stale (>3h since success)"
condition: C
for: "5m"
data:
@ -345,10 +297,10 @@ data:
to: 0
datasourceUid: atlas-vm
model:
expr: vector(0)
expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) and on(cronjob) (kube_cronjob_spec_suspend{namespace="maintenance",cronjob="image-sweeper"} == 0)
intervalMs: 60000
maxDataPoints: 43200
legendFormat: legacy
legendFormat: '{{cronjob}}'
datasource:
type: prometheus
uid: atlas-vm
@ -369,118 +321,17 @@ data:
type: threshold
conditions:
- evaluator:
params: [1]
params: [10800]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: OK
annotations:
summary: "Legacy cronjob alert disabled"
labels:
severity: info
- orgId: 1
name: ariadne
folder: Alerts
interval: 1m
rules:
- uid: ariadne-schedule-error
title: "Ariadne schedule task failed"
condition: C
for: "10m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
expr: max by (task) (ariadne_schedule_last_status{task=~"schedule\\..+"})
intervalMs: 60000
maxDataPoints: 43200
legendFormat: '{{task}}'
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [1]
type: lt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
noDataState: NoData
execErrState: Error
annotations:
summary: "Ariadne schedule failed ({{ $labels.task }})"
labels:
severity: warning
- uid: ariadne-scheduler-stalled
title: "Ariadne scheduler behind (>15m)"
condition: C
for: "10m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
expr: time() - ariadne_schedule_next_run_timestamp_seconds{task=~"schedule\\..+"}
intervalMs: 60000
maxDataPoints: 43200
legendFormat: '{{task}}'
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [900]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Error
annotations:
summary: "Ariadne scheduler behind for {{ $labels.task }}"
summary: "Maintenance cronjob stale >3h since last success"
labels:
severity: warning
- orgId: 1
@ -501,7 +352,7 @@ data:
model:
intervalMs: 60000
maxDataPoints: 43200
expr: postmark_outbound_bounce_rate{window="1d"}
expr: POSTMARK_OUTBOUND_BOUNCE_RATE{window="1d"}
legendFormat: bounce 1d
datasource:
type: prometheus
@ -549,7 +400,7 @@ data:
model:
intervalMs: 60000
maxDataPoints: 43200
expr: min_over_time(max by (instance) (postmark_api_up)[5m])
expr: POSTMARK_API_UP
legendFormat: api up
datasource:
type: prometheus

View File

@ -29,7 +29,7 @@ data:
},
"targets": [
{
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) 
(label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -98,7 +98,7 @@ data:
},
"targets": [
{
"expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
"expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
"refId": "A",
"legendFormat": "{{namespace}}"
}

View File

@ -1910,7 +1910,7 @@ data:
},
"targets": [
{
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) 
(label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"refId": "A",
"legendFormat": "{{namespace}}"
}

View File

@ -286,7 +286,7 @@ spec:
podAnnotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "monitoring"
monitoring.bstein.dev/restart-rev: "4"
monitoring.bstein.dev/restart-rev: "1"
vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
{{ with secret "kv/data/atlas/monitoring/grafana-admin" }}

View File

@ -43,12 +43,6 @@ spec:
value: /var/run/secrets/vault-token-reviewer/token
- name: VAULT_K8S_ROLE_TTL
value: 1h
- name: VAULT_K8S_BOUND_AUDIENCES
value: "https://kubernetes.default.svc,https://kubernetes.default.svc.cluster.local,k3s"
- name: VAULT_K8S_ISSUER
value: https://kubernetes.default.svc.cluster.local
- name: VAULT_K8S_DISABLE_ISS_VALIDATION
value: "false"
volumeMounts:
- name: k8s-auth-config-script
mountPath: /scripts

View File

@ -53,8 +53,6 @@ ensure_token
k8s_host="https://${KUBERNETES_SERVICE_HOST}:443"
k8s_ca="$(cat /var/run/secrets/kubernetes.io/serviceaccount/ca.crt)"
k8s_token="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
k8s_issuer="${VAULT_K8S_ISSUER:-}"
disable_iss_validation="${VAULT_K8S_DISABLE_ISS_VALIDATION:-true}"
role_ttl="${VAULT_K8S_ROLE_TTL:-1h}"
token_reviewer_jwt="${VAULT_K8S_TOKEN_REVIEWER_JWT:-}"
@ -70,36 +68,11 @@ if ! vault_cmd auth list -format=json | grep -q '"kubernetes/"'; then
vault_cmd auth enable kubernetes
fi
ensure_default_policy_login() {
default_policy="$(vault_cmd policy read default)"
if printf '%s' "${default_policy}" | grep -q 'auth/kubernetes/login'; then
return
fi
log "updating default policy to allow kubernetes login"
default_policy="${default_policy}
path \"auth/kubernetes/login\" {
capabilities = [\"create\", \"update\"]
}
"
printf '%s\n' "${default_policy}" | vault_cmd policy write default -
}
log "configuring kubernetes auth"
if [ -n "${k8s_issuer}" ]; then
vault_cmd write auth/kubernetes/config \
token_reviewer_jwt="${token_reviewer_jwt}" \
kubernetes_host="${k8s_host}" \
kubernetes_ca_cert="${k8s_ca}" \
issuer="${k8s_issuer}" \
disable_iss_validation="${disable_iss_validation}"
else
vault_cmd write auth/kubernetes/config \
token_reviewer_jwt="${token_reviewer_jwt}" \
kubernetes_host="${k8s_host}" \
kubernetes_ca_cert="${k8s_ca}"
fi
ensure_default_policy_login
vault_cmd write auth/kubernetes/config \
token_reviewer_jwt="${token_reviewer_jwt}" \
kubernetes_host="${k8s_host}" \
kubernetes_ca_cert="${k8s_ca}"
write_raw_policy() {
name="$1"
@ -114,7 +87,6 @@ write_policy_and_role() {
service_accounts="$3"
read_paths="$4"
write_paths="$5"
audiences="${VAULT_K8S_BOUND_AUDIENCES:-}"
policy_body=""
for path in ${read_paths}; do
@ -137,42 +109,11 @@ path \"kv/metadata/atlas/${path}\" {
}
"
done
if [ "${role}" = "maintenance" ]; then
policy_body="${policy_body}
path \"sys/auth\" {
capabilities = [\"read\"]
}
path \"sys/auth/*\" {
capabilities = [\"create\", \"update\", \"read\", \"sudo\"]
}
path \"auth/kubernetes/*\" {
capabilities = [\"create\", \"update\", \"read\"]
}
path \"auth/oidc/*\" {
capabilities = [\"create\", \"update\", \"read\"]
}
path \"sys/policies/acl\" {
capabilities = [\"list\"]
}
path \"sys/policies/acl/*\" {
capabilities = [\"create\", \"update\", \"read\"]
}
"
fi
log "writing policy ${role}"
printf '%s\n' "${policy_body}" | vault_cmd policy write "${role}" -
log "writing role ${role}"
if [ -n "${audiences}" ]; then
vault_cmd write "auth/kubernetes/role/${role}" \
bound_service_account_audiences="${audiences}" \
bound_service_account_names="${service_accounts}" \
bound_service_account_namespaces="${namespace}" \
policies="${role}" \
ttl="${role_ttl}"
return
fi
vault_cmd write "auth/kubernetes/role/${role}" \
bound_service_account_names="${service_accounts}" \
bound_service_account_namespaces="${namespace}" \
@ -277,8 +218,6 @@ write_policy_and_role "nextcloud" "nextcloud" "nextcloud-vault" \
"nextcloud/* shared/keycloak-admin shared/postmark-relay" ""
write_policy_and_role "comms" "comms" "comms-vault,atlasbot" \
"comms/* shared/chat-ai-keys-runtime shared/harbor-pull" ""
write_policy_and_role "ai" "ai" "atlasbot" \
"comms/* shared/chat-ai-keys-runtime shared/harbor-pull" ""
write_policy_and_role "jenkins" "jenkins" "jenkins,jenkins-vault-sync" \
"jenkins/* shared/harbor-pull" ""
write_policy_and_role "monitoring" "monitoring" "monitoring-vault-sync" \
@ -292,7 +231,7 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \
write_policy_and_role "health" "health" "health-vault-sync" \
"health/*" ""
write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \
"maintenance/ariadne-db maintenance/soteria-restic portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin vault/vault-oidc-config shared/harbor-pull" ""
"maintenance/ariadne-db portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin vault/vault-oidc-config shared/harbor-pull" ""
write_policy_and_role "finance" "finance" "finance-vault" \
"finance/* shared/postmark-relay" ""
write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \