Compare commits

...

414 Commits

Author SHA1 Message Date
6e4cafa3df maintenance: harden metis recovery and fix harbor rollout 2026-03-31 14:51:49 -03:00
41021c472b maintenance/jenkins: align Metis ingress, sentinel push, and CI job 2026-03-31 14:21:53 -03:00
17afb0bb55 maintenance: add Metis service and sentinel manifests 2026-03-31 14:07:17 -03:00
1e0e73a28f monitoring: combine Ariadne and Metis tests 2026-03-31 13:54:04 -03:00
af01a620c3 monitoring: roll grafana to apply latest alert rules 2026-03-30 18:41:21 -03:00
0edc513e2e monitoring: raise rootfs warning threshold to 85 percent 2026-03-30 18:40:59 -03:00
3659c9c07b maintenance: unblock sweeper rollouts on degraded nodes 2026-03-30 18:39:05 -03:00
11d58dccb7 maintenance: run image sweeper periodically for sd safety 2026-03-30 18:36:25 -03:00
5bcff5f405 monitoring: tame email noise and harden postmark alerts 2026-03-30 18:32:22 -03:00
f5dcea860e atlasbot: wire context and timeout fallbacks 2026-03-30 16:55:19 -03:00
a1e90f4600 atlasbot: wire quick smart genius modes 2026-03-30 16:51:23 -03:00
f04f032721 longhorn: avoid webhook deadlock and forced image pulls 2026-03-30 10:16:42 -03:00
083999c84c comms: harden matrix auth ingress routes for MAS 2026-03-30 08:21:19 -03:00
dc62a84e2e flux: keep feature branch tracking until main push is available 2026-03-30 07:57:13 -03:00
31ffaedf2a flux: target main branch for sync and image automation 2026-03-30 07:48:47 -03:00
b2d1dc4e3f flux: return sync and image automation branches to master 2026-03-30 07:48:09 -03:00
flux-bot
271a941d89 chore(atlasbot): automated image update 2026-03-30 10:47:00 +00:00
flux-bot
fa30a2cade chore(atlasbot): automated image update 2026-03-30 07:10:35 +00:00
f71d0bc3f3 atlasbot: switch quick mode to 7b fast model 2026-03-30 04:07:08 -03:00
flux-bot
19a3207eac chore(atlasbot): automated image update 2026-03-30 07:04:35 +00:00
2d5107f7e2 bstein-dev-home: deploy backend image 0.1.1-123 2026-03-30 03:54:39 -03:00
a091ea75a3 atlasbot: deploy matrix timeout fix image 0.1.2-103 2026-03-30 03:51:30 -03:00
95dabf5df8 atlasbot: disable ollama retries for strict quick budgets 2026-03-30 03:50:59 -03:00
flux-bot
311cec8adf chore(bstein-dev-home): automated image update 2026-03-30 06:46:11 +00:00
flux-bot
b18e355412 chore(atlasbot): automated image update 2026-03-30 06:45:32 +00:00
flux-bot
80057210fc chore(bstein-dev-home): automated image update 2026-03-30 06:38:10 +00:00
flux-bot
7a1e99a95e chore(bstein-dev-home): automated image update 2026-03-30 06:34:10 +00:00
flux-bot
ace86ad736 chore(bstein-dev-home): automated image update 2026-03-30 06:29:09 +00:00
flux-bot
2a4deb6dd1 chore(atlasbot): automated image update 2026-03-30 06:25:30 +00:00
flux-bot
eee5456921 chore(atlasbot): automated image update 2026-03-30 05:55:27 +00:00
f86d3a4c00 atlasbot: cap quick runtime and expose genius model to portal 2026-03-30 02:53:06 -03:00
a6b77c68f0 maintenance: grant ariadne auth-delegator 2026-02-08 09:55:20 -03:00
9599b4c975 ariadne: use vault-admin role for vault config 2026-02-07 22:34:10 -03:00
df96c06fa2 ariadne: run image sweeper daily 2026-02-07 11:11:41 -03:00
e575e6cb1e gitea: prefer rpi5 nodes 2026-02-07 11:07:02 -03:00
flux-bot
bca66c5d71 chore(maintenance): automated image update 2026-02-07 13:56:49 +00:00
b2affe091d maintenance: align vault role env 2026-02-07 10:51:20 -03:00
flux-bot
6c7f2112c2 chore(atlasbot): automated image update 2026-02-07 13:50:38 +00:00
a4874163ec infra: bias gitea/monerod placement, bump synapse ensure job 2026-02-07 10:48:48 -03:00
079f8efbb9 comms: run synapse admin ensure (admin flag) 2026-02-07 10:30:34 -03:00
95228b75ab comms: ensure synapse admin flag; ariadne vault role 2026-02-07 10:28:55 -03:00
9e75bf0b42 ariadne: accelerate schedules for alert clearing 2026-02-07 03:23:42 -03:00
b2841985ef comms: re-suspend synapse admin job 2026-02-07 03:19:42 -03:00
9553995ba5 comms: run synapse admin ensure 2026-02-07 03:16:44 -03:00
e840777668 vault: allow maintenance auth sync 2026-02-07 03:13:53 -03:00
718a1ca312 crypto: run xmrig only on rpi5 2026-02-06 23:34:31 -03:00
55f0347b70 comms: suspend synapse admin ensure 2026-02-06 20:21:01 -03:00
f77e13b2cb comms: run synapse admin ensure with python image 2026-02-06 20:13:02 -03:00
fd2b10d00d comms: run synapse admin ensure 2026-02-06 20:01:38 -03:00
4209299a40 jenkins: add dind cache pvc 2026-02-06 20:00:01 -03:00
1804ff06c6 gitea: avoid longhorn nodes 2026-02-06 19:33:55 -03:00
4b5913827d maintenance: pivot soteria to longhorn 2026-02-06 18:38:29 -03:00
80548a2e82 longhorn: add b2 backup target 2026-02-06 18:28:37 -03:00
flux-bot
29756b1e62 chore(maintenance): automated image update 2026-02-06 21:27:42 +00:00
4bc91c40f6 maintenance: restore soteria job node selector 2026-02-06 04:19:36 -03:00
1260d18cdf maintenance: pin soteria jobs to titan-24 for backup 2026-02-06 04:15:58 -03:00
47efd0be06 maintenance: pin soteria jobs to arm64 workers 2026-02-06 04:10:55 -03:00
flux-bot
fa410c8f1e chore(maintenance): automated image update 2026-02-06 07:10:04 +00:00
0ed75718c2 maintenance: remove restic init job 2026-02-06 03:50:30 -03:00
50ff59a33b maintenance: add restic init job 2026-02-06 03:48:45 -03:00
flux-bot
9d9bcd1988 chore(maintenance): automated image update 2026-02-05 18:56:27 +00:00
flux-bot
c96749bab6 chore(maintenance): automated image update 2026-02-05 18:45:20 +00:00
5e239accbd maintenance: schedule soteria on rpi workers 2026-02-05 15:30:09 -03:00
flux-bot
c50298c8fe chore(bstein-dev-home): automated image update 2026-02-05 18:24:54 +00:00
flux-bot
3fcab34b7d chore(maintenance): automated image update 2026-02-05 18:24:44 +00:00
e223ef8e76 harbor: route registry traffic via core 2026-02-05 15:23:42 -03:00
7f72683242 harbor: wire registryctl notification auth 2026-02-05 15:17:54 -03:00
eeb8475848 harbor: fix registry notification URL 2026-02-05 15:00:43 -03:00
839b79696c harbor: restore registry notifications env 2026-02-05 14:50:53 -03:00
920f146efb harbor: enable registry notifications 2026-02-05 14:44:09 -03:00
flux-bot
c2c5474bc8 chore(atlasbot): automated image update 2026-02-05 17:38:26 +00:00
flux-bot
eab7ed5cff chore(maintenance): automated image update 2026-02-05 17:04:24 +00:00
flux-bot
22eb1a1159 chore(maintenance): automated image update 2026-02-05 16:32:49 +00:00
d7c1ecd098 maintenance: move soteria image to bstein 2026-02-05 13:12:03 -03:00
flux-bot
96288c9fdd chore(atlasbot): automated image update 2026-02-05 15:58:19 +00:00
flux-bot
a71bf7d9d5 chore(atlasbot): automated image update 2026-02-05 01:26:05 +00:00
533baa6d0c atlasbot: set genius model env 2026-02-04 19:39:43 -03:00
flux-bot
cee353e305 chore(atlasbot): automated image update 2026-02-04 22:15:47 +00:00
flux-bot
436d24ea70 chore(atlasbot): automated image update 2026-02-04 21:45:45 +00:00
flux-bot
6fb80e37e8 chore(atlasbot): automated image update 2026-02-04 21:39:45 +00:00
flux-bot
132e73100f chore(atlasbot): automated image update 2026-02-04 19:08:32 +00:00
flux-bot
fe8cc40903 chore(atlasbot): automated image update 2026-02-04 18:09:26 +00:00
flux-bot
947a43e630 chore(atlasbot): automated image update 2026-02-04 18:03:26 +00:00
flux-bot
31679b59f5 chore(atlasbot): automated image update 2026-02-04 17:56:26 +00:00
flux-bot
77b81e1e9a chore(atlasbot): automated image update 2026-02-04 17:49:23 +00:00
flux-bot
6523e45b3f chore(atlasbot): automated image update 2026-02-04 17:30:22 +00:00
flux-bot
49414c6cca chore(atlasbot): automated image update 2026-02-04 17:23:23 +00:00
flux-bot
6efa280e9d chore(atlasbot): automated image update 2026-02-04 17:20:23 +00:00
flux-bot
ff81cfdb82 chore(atlasbot): automated image update 2026-02-04 17:14:21 +00:00
flux-bot
c4b0250321 chore(atlasbot): automated image update 2026-02-04 17:07:21 +00:00
flux-bot
c1a8aa43d6 chore(atlasbot): automated image update 2026-02-04 17:00:21 +00:00
flux-bot
0275adb5b7 chore(atlasbot): automated image update 2026-02-04 16:53:20 +00:00
flux-bot
663143660b chore(atlasbot): automated image update 2026-02-04 16:45:19 +00:00
flux-bot
cb25cf7571 chore(atlasbot): automated image update 2026-02-04 16:39:18 +00:00
flux-bot
33127dde26 chore(atlasbot): automated image update 2026-02-04 14:03:05 +00:00
flux-bot
dc214cee79 chore(atlasbot): automated image update 2026-02-04 03:27:09 +00:00
flux-bot
4395986b0c chore(atlasbot): automated image update 2026-02-04 03:01:07 +00:00
flux-bot
fba7fe9029 chore(atlasbot): automated image update 2026-02-04 02:54:06 +00:00
flux-bot
8ecc8dd548 chore(atlasbot): automated image update 2026-02-04 02:42:05 +00:00
flux-bot
672a559e52 chore(atlasbot): automated image update 2026-02-04 02:30:04 +00:00
flux-bot
0dedf4083e chore(atlasbot): automated image update 2026-02-04 01:54:01 +00:00
flux-bot
bf8b99e365 chore(maintenance): automated image update 2026-02-04 01:51:59 +00:00
flux-bot
a33ad1c073 chore(atlasbot): automated image update 2026-02-04 01:27:59 +00:00
flux-bot
be90638fac chore(atlasbot): automated image update 2026-02-04 01:09:57 +00:00
flux-bot
3bc6d29f54 chore(atlasbot): automated image update 2026-02-04 00:55:56 +00:00
flux-bot
4e88c55e57 chore(atlasbot): automated image update 2026-02-04 00:42:56 +00:00
flux-bot
b8c94d5870 chore(atlasbot): automated image update 2026-02-04 00:37:55 +00:00
flux-bot
7f83d2f936 chore(atlasbot): automated image update 2026-02-04 00:34:55 +00:00
flux-bot
d42aa42d8a chore(atlasbot): automated image update 2026-02-04 00:19:53 +00:00
flux-bot
86f512fa1a chore(atlasbot): automated image update 2026-02-03 22:41:45 +00:00
flux-bot
16e2b19ea9 chore(atlasbot): automated image update 2026-02-03 22:06:41 +00:00
flux-bot
a1cb07c6d6 chore(atlasbot): automated image update 2026-02-03 20:18:32 +00:00
flux-bot
558d24ad6b chore(atlasbot): automated image update 2026-02-03 19:56:31 +00:00
flux-bot
160218a4ae chore(atlasbot): automated image update 2026-02-03 19:29:28 +00:00
flux-bot
2e361e620e chore(atlasbot): automated image update 2026-02-03 18:04:21 +00:00
flux-bot
fcd0ea9872 chore(atlasbot): automated image update 2026-02-03 17:53:20 +00:00
flux-bot
75826b0e5e chore(atlasbot): automated image update 2026-02-03 17:42:19 +00:00
flux-bot
71ddd03899 chore(atlasbot): automated image update 2026-02-03 17:34:18 +00:00
flux-bot
2d3a0b0184 chore(atlasbot): automated image update 2026-02-03 17:16:17 +00:00
flux-bot
c7fb848a62 chore(atlasbot): automated image update 2026-02-03 15:15:07 +00:00
flux-bot
c643c965b8 chore(atlasbot): automated image update 2026-02-03 15:05:06 +00:00
flux-bot
618be5ce01 chore(atlasbot): automated image update 2026-02-03 14:57:06 +00:00
flux-bot
ac049e6bb9 chore(atlasbot): automated image update 2026-02-03 14:51:05 +00:00
flux-bot
50108afc57 chore(atlasbot): automated image update 2026-02-03 14:40:04 +00:00
flux-bot
1f74a29445 chore(atlasbot): automated image update 2026-02-03 14:15:01 +00:00
flux-bot
08bc5f4b82 chore(atlasbot): automated image update 2026-02-03 14:07:01 +00:00
flux-bot
c208314506 chore(atlasbot): automated image update 2026-02-03 13:43:59 +00:00
flux-bot
763e5ff9e9 chore(atlasbot): automated image update 2026-02-03 13:22:57 +00:00
flux-bot
5ecb42cfef chore(atlasbot): automated image update 2026-02-03 13:08:56 +00:00
flux-bot
102d8e56ff chore(atlasbot): automated image update 2026-02-03 13:04:56 +00:00
flux-bot
ac96c5482f chore(atlasbot): automated image update 2026-02-03 12:56:55 +00:00
flux-bot
71aa60c696 chore(atlasbot): automated image update 2026-02-03 12:32:53 +00:00
flux-bot
d7582da21b chore(atlasbot): automated image update 2026-02-03 07:33:28 +00:00
flux-bot
4bf3773eb3 chore(atlasbot): automated image update 2026-02-03 06:31:22 +00:00
flux-bot
895ea49dc5 chore(atlasbot): automated image update 2026-02-03 06:07:21 +00:00
flux-bot
f355f6dd6a chore(atlasbot): automated image update 2026-02-03 04:57:14 +00:00
9f87e61f4a atlasbot: raise llm call caps 2026-02-03 01:55:21 -03:00
flux-bot
9a2890c45c chore(atlasbot): automated image update 2026-02-03 03:29:07 +00:00
flux-bot
ad74a45e76 chore(atlasbot): automated image update 2026-02-03 03:26:07 +00:00
fda4860d67 jenkins(atlasbot): set main branch 2026-02-02 23:12:13 -03:00
9f8a0f94d2 jenkins(atlasbot): use main branch 2026-02-02 23:10:42 -03:00
51d12791ca jenkins(atlasbot): track main branch 2026-02-02 22:25:56 -03:00
9fb36f23cd ci(atlasbot): add Jenkins job and image automation 2026-02-02 20:25:47 -03:00
flux-bot
1a2fe05808 chore(atlasbot): automated image update 2026-02-02 21:04:06 +00:00
flux-bot
0c5ec895ee chore(atlasbot): automated image update 2026-02-02 20:22:02 +00:00
7c87e177e9 vault: add default k8s audience 2026-02-02 17:15:35 -03:00
flux-bot
5e6d2a938f chore(atlasbot): automated image update 2026-02-02 20:08:02 +00:00
flux-bot
09070c2cc6 chore(atlasbot): automated image update 2026-02-02 19:53:00 +00:00
flux-bot
5dd30d8802 chore(atlasbot): automated image update 2026-02-02 18:13:52 +00:00
flux-bot
f302cb2448 chore(atlasbot): automated image update 2026-02-02 18:04:51 +00:00
c0a231fd91 atlasbot: bump image to 0.1.0-133 2026-02-02 14:58:38 -03:00
flux-bot
87f8a6d2c0 chore(atlasbot): automated image update 2026-02-02 17:56:53 +00:00
flux-bot
78a0867215 chore(atlasbot): automated image update 2026-02-02 17:56:48 +00:00
b0da9080c7 atlasbot: bump image to 0.1.0-132 2026-02-02 14:56:24 -03:00
8e3feeeaac atlasbot: bump image to 0.1.0-131 2026-02-02 14:54:36 -03:00
6f2ecdb364 atlasbot: bump image to 0.1.0-130 2026-02-02 14:48:34 -03:00
a5e168e55f atlasbot: bump image to 0.1.0-129 2026-02-02 14:41:22 -03:00
flux-bot
87dc1209b1 chore(atlasbot): automated image update 2026-02-02 17:32:49 +00:00
f86845053e atlasbot: disable queue for testing 2026-02-02 14:24:09 -03:00
flux-bot
c04c5ab048 chore(atlasbot): automated image update 2026-02-02 17:13:47 +00:00
flux-bot
ec3bdb7225 chore(atlasbot): automated image update 2026-02-02 16:55:46 +00:00
flux-bot
4b68809bb9 chore(atlasbot): automated image update 2026-02-02 16:45:45 +00:00
flux-bot
661bc6ac7d chore(atlasbot): automated image update 2026-02-02 16:38:44 +00:00
a9ee943344 atlasbot: bump image to 0.1.0-123 2026-02-02 13:30:34 -03:00
826df7d960 atlasbot: bump image to 0.1.0-122 2026-02-02 13:21:28 -03:00
flux-bot
8dfe124212 chore(atlasbot): automated image update 2026-02-02 16:10:42 +00:00
flux-bot
a3bef857f9 chore(atlasbot): automated image update 2026-02-02 15:57:41 +00:00
flux-bot
ed766d7a02 chore(atlasbot): automated image update 2026-02-02 15:47:40 +00:00
4295913056 atlasbot: bump image to 0.1.0-118 2026-02-02 12:39:24 -03:00
flux-bot
e3dfa2c0ea chore(atlasbot): automated image update 2026-02-02 15:20:38 +00:00
flux-bot
6bf8181677 chore(atlasbot): automated image update 2026-02-02 15:17:37 +00:00
d67f3d6fca jenkins: reload jcasc for soteria 2026-02-02 12:11:07 -03:00
flux-bot
41a0363fbc chore(atlasbot): automated image update 2026-02-02 15:09:37 +00:00
a609e230f2 atlasbot: bump image to 0.1.0-114 2026-02-02 12:05:58 -03:00
flux-bot
37342bfe4a chore(atlasbot): automated image update 2026-02-02 15:01:36 +00:00
a509354067 atlasbot: bump image to 0.1.0-112 2026-02-02 11:52:59 -03:00
flux-bot
fb14516674 chore(atlasbot): automated image update 2026-02-02 14:49:35 +00:00
60c80cc86f atlasbot: bump image to 0.1.0-110 2026-02-02 11:42:03 -03:00
flux-bot
7b8ea36554 chore(atlasbot): automated image update 2026-02-02 14:36:35 +00:00
49224375a0 atlasbot: bump image to 0.1.0-108 2026-02-02 11:23:53 -03:00
7d7ddd52dc atlasbot: bump image to 0.1.0-107 2026-02-02 11:14:54 -03:00
cd7043c7f1 jenkins: add soteria pipeline job 2026-02-02 11:01:22 -03:00
fb82a038e9 atlasbot: bump image to 0.1.0-106 2026-02-02 11:00:18 -03:00
93bcea5893 add ai harbor regcred sync 2026-02-02 10:08:46 -03:00
0ba8578416 bump atlasbot image 2026-02-02 10:05:06 -03:00
86475b8bdf track atlasbot knowledge index 2026-02-02 09:48:40 -03:00
f19eaf3b6b move atlasbot to ai namespace 2026-02-02 09:46:50 -03:00
flux-bot
e537180f1f chore(comms): automated image update 2026-02-02 06:03:16 +00:00
flux-bot
8298ed5c16 chore(comms): automated image update 2026-02-02 05:59:16 +00:00
flux-bot
152a28bd09 chore(comms): automated image update 2026-02-02 05:59:04 +00:00
7e02cccbe8 comms: bump atlasbot to 0.1.0-103 2026-02-02 02:58:44 -03:00
flux-bot
e60b1594c0 chore(comms): automated image update 2026-02-02 05:49:15 +00:00
flux-bot
87b2b37918 chore(comms): automated image update 2026-02-02 05:46:15 +00:00
flux-bot
a1249b3e00 chore(comms): automated image update 2026-02-02 05:45:54 +00:00
5000d1f76b comms: bump atlasbot to 0.1.0-101 2026-02-02 02:45:33 -03:00
flux-bot
584625b893 chore(comms): automated image update 2026-02-02 05:39:14 +00:00
95f4ecc4e0 comms: bump atlasbot to 0.1.0-99 2026-02-02 02:16:31 -03:00
240e04f9a2 comms: bump atlasbot to 0.1.0-98 2026-02-02 02:09:57 -03:00
449b8fed64 comms: bump atlasbot to 0.1.0-97 2026-02-02 02:03:50 -03:00
flux-bot
f6d655bb0c chore(comms): automated image update 2026-02-02 05:02:11 +00:00
4fa1b6e84c comms: bump atlasbot to 0.1.0-96 2026-02-02 01:57:58 -03:00
168efd78f7 comms: bump atlasbot to 0.1.0-95 2026-02-02 01:54:41 -03:00
e0bd11fa57 comms: bump atlasbot to 0.1.0-94 2026-02-02 01:45:52 -03:00
3f43299c92 comms: bump atlasbot to 0.1.0-93 2026-02-02 01:38:59 -03:00
645790f404 comms: bump atlasbot to 0.1.0-92 2026-02-01 18:46:01 -03:00
f11f6a4e62 comms: bump atlasbot to 0.1.0-91 2026-02-01 18:42:00 -03:00
flux-bot
c559253a31 chore(comms): automated image update 2026-02-01 21:37:32 +00:00
flux-bot
a3619ce215 chore(comms): automated image update 2026-02-01 21:33:32 +00:00
flux-bot
398fb7b797 chore(comms): automated image update 2026-02-01 21:25:31 +00:00
b30e6af95d comms: bump atlasbot to 0.1.0-87 2026-02-01 18:05:00 -03:00
flux-bot
4fd79b4708 chore(comms): automated image update 2026-02-01 20:55:29 +00:00
f23da3aea5 comms: bump atlasbot to 0.1.0-85 2026-02-01 17:48:24 -03:00
flux-bot
d951ae5061 chore(comms): automated image update 2026-02-01 20:43:28 +00:00
dfe9916e91 comms: bump atlasbot to 0.1.0-83 2026-02-01 14:45:58 -03:00
flux-bot
036c758547 chore(comms): automated image update 2026-02-01 17:39:12 +00:00
382a6e49ee comms: bump atlasbot to 0.1.0-81 2026-02-01 14:34:43 -03:00
93e7449509 comms: bump atlasbot to 0.1.0-80 2026-02-01 14:28:34 -03:00
58d1c168ff comms: bump atlasbot to 0.1.0-79 2026-02-01 14:07:57 -03:00
flux-bot
889400cdbf chore(comms): automated image update 2026-02-01 15:41:02 +00:00
flux-bot
e06066a327 chore(comms): automated image update 2026-02-01 15:36:01 +00:00
138f8c4407 comms: bump atlasbot image 2026-02-01 12:25:31 -03:00
33569aff99 vault: fix k8s auth env indent 2026-02-01 12:20:04 -03:00
3e2f56da7d vault: set kubernetes issuer 2026-02-01 12:18:57 -03:00
flux-bot
0914ba3509 chore(comms): automated image update 2026-02-01 15:01:58 +00:00
flux-bot
865a979424 chore(comms): automated image update 2026-02-01 14:55:58 +00:00
flux-bot
5dfc3ed259 chore(comms): automated image update 2026-02-01 14:55:52 +00:00
b479364017 comms: bump atlasbot image 2026-02-01 11:55:26 -03:00
flux-bot
00d8f852a3 chore(comms): automated image update 2026-02-01 14:47:57 +00:00
flux-bot
2d7f744284 chore(comms): automated image update 2026-02-01 14:18:55 +00:00
5f1b1a6cd0 vault: set k8s auth audiences 2026-02-01 11:17:02 -03:00
flux-bot
e966961dbe chore(comms): automated image update 2026-02-01 13:58:53 +00:00
7ffb0aba5d atlasbot: bump to 0.1.0-70 2026-02-01 10:37:29 -03:00
flux-bot
e80a439725 chore(comms): automated image update 2026-02-01 08:40:26 +00:00
flux-bot
8a22825796 chore(comms): automated image update 2026-02-01 08:40:09 +00:00
1fabd4ce2f atlasbot: bump to 0.1.0-69 2026-02-01 05:39:54 -03:00
759ac5ef90 comms: bump atlasbot image 2026-02-01 05:31:07 -03:00
flux-bot
bc971cce92 chore(comms): automated image update 2026-02-01 08:23:24 +00:00
flux-bot
069f6b4983 chore(comms): automated image update 2026-02-01 08:18:24 +00:00
64cfd5180d comms: bump atlasbot image 2026-02-01 05:12:59 -03:00
flux-bot
8a087fb16d chore(comms): automated image update 2026-02-01 08:10:23 +00:00
flux-bot
652c3a28a3 chore(comms): automated image update 2026-02-01 07:55:22 +00:00
flux-bot
141c54ccf3 chore(comms): automated image update 2026-02-01 07:49:21 +00:00
flux-bot
0f8529c7c5 chore(comms): automated image update 2026-02-01 07:46:21 +00:00
flux-bot
dafba36768 chore(comms): automated image update 2026-02-01 07:38:20 +00:00
4d5e9552e3 comms: bump atlasbot to 0.1.0-59 2026-02-01 04:32:01 -03:00
ddf1d41fd3 comms: bump atlasbot to 0.1.0-58 2026-02-01 04:25:12 -03:00
flux-bot
49e630f7fd chore(comms): automated image update 2026-02-01 07:17:18 +00:00
flux-bot
b7a81d28d1 chore(comms): automated image update 2026-02-01 06:39:16 +00:00
109c00bc3c comms: bump atlasbot to 0.1.0-55 2026-02-01 02:08:54 -03:00
flux-bot
c9ad055b4c chore(comms): automated image update 2026-02-01 05:07:08 +00:00
10498c659b comms: bump atlasbot to 0.1.0-54 2026-02-01 01:51:26 -03:00
flux-bot
978bd8e595 chore(comms): automated image update 2026-02-01 04:51:06 +00:00
259552ac28 comms: bump atlasbot to 0.1.0-53 2026-02-01 01:39:09 -03:00
flux-bot
7f2ded5244 chore(comms): automated image update 2026-02-01 04:39:05 +00:00
e4c370b983 comms: bump atlasbot to 0.1.0-52 2026-02-01 01:29:30 -03:00
flux-bot
7dfc98b6d6 chore(comms): automated image update 2026-02-01 04:29:04 +00:00
cb60c64bce comms: bump atlasbot to 0.1.0-51 2026-02-01 01:15:18 -03:00
flux-bot
091f095893 chore(comms): automated image update 2026-02-01 04:15:03 +00:00
5b389d12df comms(atlasbot): bump image to 0.1.0-50 2026-01-31 22:30:04 -03:00
flux-bot
ae88bc8484 chore(comms): automated image update 2026-02-01 01:28:49 +00:00
529576e082 comms: bump atlasbot image 2026-01-31 21:40:11 -03:00
flux-bot
a7ffaa3213 chore(maintenance): automated image update 2026-02-01 00:39:49 +00:00
flux-bot
e478f1c74d chore(comms): automated image update 2026-02-01 00:39:45 +00:00
2480b6cecc comms: disable atlasbot queue for tests 2026-01-31 18:21:39 -03:00
bbe27f963d comms: bump atlasbot to 0.1.0-48 2026-01-31 18:14:55 -03:00
flux-bot
c5da854cef chore(comms): automated image update 2026-01-31 21:14:27 +00:00
0319707fff atlasbot: make node counts explicit 2026-01-31 16:44:50 -03:00
4f8d8f1f25 atlasbot: prioritize high-priority subquestions 2026-01-31 16:38:54 -03:00
5448ff3f55 atlasbot: expand chunk summaries 2026-01-31 16:35:02 -03:00
b6c2d1416e atlasbot: enable debug pipeline logging 2026-01-31 16:30:05 -03:00
flux-bot
152e1d88f4 chore(comms): automated image update 2026-01-31 19:29:18 +00:00
86e9dc289f atlasbot: bump to 0.1.0-43 2026-01-31 14:24:13 -03:00
flux-bot
c4b7198c46 chore(comms): automated image update 2026-01-31 17:21:08 +00:00
f8a12be2ec atlasbot: bump image to 0.1.0-42 2026-01-31 14:15:41 -03:00
flux-bot
c9ec5126cd chore(comms): automated image update 2026-01-31 17:15:07 +00:00
flux-bot
c66db7c18f chore(maintenance): automated image update 2026-01-31 16:42:06 +00:00
flux-bot
de47ab76a5 chore(maintenance): automated image update 2026-01-31 16:39:06 +00:00
c788512d59 atlasbot: bump image to 0.1.0-41 2026-01-31 13:26:44 -03:00
flux-bot
ae25ccb6f2 chore(comms): automated image update 2026-01-31 16:25:03 +00:00
flux-bot
e27f4cfc68 chore(comms): automated image update 2026-01-31 11:08:36 +00:00
50e06b4a13 atlasbot: bump image to 0.1.0-40 2026-01-31 08:08:21 -03:00
934d6e7a3b comms: fix atlasbot image indentation 2026-01-31 07:17:58 -03:00
flux-bot
25654a731e chore(comms): automated image update 2026-01-31 10:12:32 +00:00
4aecadb3de atlasbot: bump image to 0.1.0-39 2026-01-31 07:11:56 -03:00
3b79a82c71 atlasbot: bump image to 0.1.0-38 2026-01-31 06:18:58 -03:00
flux-bot
04b263dc2d chore(comms): automated image update 2026-01-31 09:18:28 +00:00
93841d9de7 maintenance: add soteria service 2026-01-31 03:35:39 -03:00
bb294c6d21 atlasbot: bump image to 0.1.0-37 2026-01-31 03:20:44 -03:00
flux-bot
64962f8863 chore(comms): automated image update 2026-01-31 06:20:12 +00:00
bcb4c05b14 ariadne: add alertmanager url 2026-01-30 21:57:05 -03:00
flux-bot
d00a09fb58 chore(maintenance): automated image update 2026-01-31 00:54:47 +00:00
flux-bot
a22ff047f7 chore(maintenance): automated image update 2026-01-31 00:40:46 +00:00
flux-bot
fef5d7d26a chore(maintenance): automated image update 2026-01-30 23:54:41 +00:00
fa60fa124c comms: suspend mas-local-users-ensure 2026-01-30 17:46:46 -03:00
30c1192978 comms: bump mas-local-users-ensure job 2026-01-30 17:44:42 -03:00
644be2c575 comms: bump comms-secrets-ensure job 2026-01-30 17:42:28 -03:00
29d1bf9f4e comms: run mas-local-users-ensure job (retry) 2026-01-30 17:37:42 -03:00
9bdab331b6 comms: suspend mas-local-users-ensure job 2026-01-30 17:33:55 -03:00
8f49ac2d63 comms: run mas-local-users-ensure job 2026-01-30 17:29:29 -03:00
flux-bot
43b9cd27ed chore(maintenance): automated image update 2026-01-30 20:18:24 +00:00
580ac4950b comms: add atlas-genius bot 2026-01-30 17:17:59 -03:00
flux-bot
d677e83423 chore(comms): automated image update 2026-01-30 20:07:20 +00:00
flux-bot
bff55a6dc7 chore(bstein-dev-home): automated image update 2026-01-30 20:05:30 +00:00
flux-bot
0465658ba7 chore(bstein-dev-home): automated image update 2026-01-30 20:02:30 +00:00
flux-bot
3e484ba726 chore(comms): automated image update 2026-01-30 19:53:19 +00:00
flux-bot
088bb3b435 chore(comms): automated image update 2026-01-30 19:42:22 +00:00
flux-bot
e81bad9d47 chore(maintenance): automated image update 2026-01-30 13:21:48 +00:00
3f11a065a3 atlasbot: support quick/smart Matrix accounts 2026-01-30 10:21:07 -03:00
flux-bot
ec6375f31d chore(maintenance): automated image update 2026-01-30 05:19:07 +00:00
flux-bot
5a8360ed97 chore(maintenance): automated image update 2026-01-30 03:15:56 +00:00
flux-bot
9e75f82d43 chore(comms): automated image update 2026-01-29 23:54:42 +00:00
flux-bot
7ac26eb0dd chore(maintenance): automated image update 2026-01-29 19:56:19 +00:00
00d2f6a61f comms: bump atlasbot to 0.1.0-32 2026-01-29 16:51:43 -03:00
flux-bot
687ca2c22d chore(comms): automated image update 2026-01-29 19:50:22 +00:00
52281ca2ec comms: bump atlasbot to 0.1.0-31 2026-01-29 16:09:15 -03:00
flux-bot
8850e9fdf1 chore(comms): automated image update 2026-01-29 19:08:18 +00:00
a253993451 comms: bump atlasbot to 0.1.0-30 2026-01-29 14:56:59 -03:00
flux-bot
aeff2bbe73 chore(comms): automated image update 2026-01-29 17:55:12 +00:00
39616b2435 comms: bump atlasbot 0.1.0-29 2026-01-29 14:18:51 -03:00
flux-bot
b3d8674499 chore(maintenance): automated image update 2026-01-29 16:43:04 +00:00
3ca0fb352d sso: suspend execute-actions email test job 2026-01-29 13:41:41 -03:00
f7ea7d57e9 sso: send execute-actions email to robotuser 2026-01-29 13:40:45 -03:00
flux-bot
a418844f61 chore(maintenance): automated image update 2026-01-29 16:35:03 +00:00
96d914d02c comms: bump atlasbot to 0.1.0-28 2026-01-29 13:33:39 -03:00
e6c031829a sso: suspend keycloak oneoff jobs 2026-01-29 13:30:10 -03:00
ebfb19c34e sso: rerun execute-actions email test 2026-01-29 13:28:32 -03:00
4fedec3999 sso: set keycloak smtp to postmark 2026-01-29 13:27:28 -03:00
55f78f2eb7 sso: rerun execute-actions email test 2026-01-29 13:23:59 -03:00
ab5ef933d8 sso: run keycloak execute-actions email test 2026-01-29 13:21:40 -03:00
3e23109229 sso: suspend realm settings job 2026-01-29 13:20:11 -03:00
d18c06ad31 sso: rerun keycloak realm settings 2026-01-29 13:10:31 -03:00
292a6b7e04 monitoring: stabilize alert queries 2026-01-29 13:07:55 -03:00
flux-bot
d7fd5682f3 chore(maintenance): automated image update 2026-01-29 16:07:01 +00:00
bedab04b22 atlasbot: bump to 0.1.0-27 2026-01-29 13:06:37 -03:00
6d7a32ce11 atlasbot: align to installed qwen model 2026-01-29 10:25:57 -03:00
87ded58aca atlasbot: align models and bump image 2026-01-29 10:17:38 -03:00
flux-bot
5f30ab73bf chore(comms): automated image update 2026-01-29 13:16:50 +00:00
flux-bot
3f2d2e5fdb chore(maintenance): automated image update 2026-01-29 13:16:46 +00:00
flux-bot
f55e9a6043 chore(comms): automated image update 2026-01-29 12:23:45 +00:00
flux-bot
7de15db57a chore(comms): automated image update 2026-01-29 11:47:42 +00:00
flux-bot
265f809f8f chore(maintenance): automated image update 2026-01-29 11:43:38 +00:00
flux-bot
e4d19fc5b4 chore(comms): automated image update 2026-01-29 11:42:41 +00:00
flux-bot
d10eace338 chore(maintenance): automated image update 2026-01-29 10:45:37 +00:00
78afc97db2 atlasbot: bump image and allow longhorn read 2026-01-29 07:45:24 -03:00
flux-bot
3c0d4d0f4f chore(comms): automated image update 2026-01-29 10:44:37 +00:00
flux-bot
d73d6d7c01 chore(comms): automated image update 2026-01-29 09:21:30 +00:00
flux-bot
af02ee7abf chore(comms): automated image update 2026-01-29 09:16:59 +00:00
630a596cb6 atlasbot: bump image tag 2026-01-29 06:16:43 -03:00
flux-bot
d2729138b6 chore(maintenance): automated image update 2026-01-29 09:12:26 +00:00
a6fbcc8669 maintenance(ariadne): allow apps/events, bump image tag 2026-01-29 06:09:36 -03:00
flux-bot
d91d632496 chore(maintenance): automated image update 2026-01-29 09:01:41 +00:00
flux-bot
3a9949a24d chore(comms): automated image update 2026-01-29 08:01:25 +00:00
b045506516 vault: allow kubernetes auth login 2026-01-29 02:22:51 -03:00
flux-bot
3f24de03d1 chore(maintenance): automated image update 2026-01-29 04:58:20 +00:00
flux-bot
a3ffcb2ea1 chore(comms): automated image update 2026-01-29 04:58:10 +00:00
flux-bot
314a922109 chore(comms): automated image update 2026-01-29 04:56:21 +00:00
flux-bot
2ed4762fab chore(maintenance): automated image update 2026-01-29 04:56:05 +00:00
1c6d572559 images: bump ariadne and atlasbot 2026-01-29 01:55:07 -03:00
flux-bot
58cc15a7e0 chore(comms): automated image update 2026-01-29 01:35:52 +00:00
flux-bot
3da28531fd chore(maintenance): automated image update 2026-01-29 01:35:03 +00:00
flux-bot
58f818cebc chore(maintenance): automated image update 2026-01-28 23:47:54 +00:00
flux-bot
cff7ec922e chore(comms): automated image update 2026-01-28 23:46:43 +00:00
flux-bot
a49f0580da chore(maintenance): automated image update 2026-01-28 23:43:54 +00:00
flux-bot
10d4f015b2 chore(maintenance): automated image update 2026-01-28 23:36:54 +00:00
flux-bot
669849b883 chore(maintenance): automated image update 2026-01-28 23:31:53 +00:00
flux-bot
9ce9470677 chore(comms): automated image update 2026-01-28 22:59:41 +00:00
c3555d59f7 monitoring: fix GPU share attribution 2026-01-28 19:08:53 -03:00
28af553498 monitoring: de-dupe ariadne schedule alert 2026-01-28 18:45:15 -03:00
d42385de3e comms: suspend synapse admin ensure job 2026-01-28 18:39:28 -03:00
6104035474 maintenance: restart ariadne after synapse token update 2026-01-28 18:37:49 -03:00
dabf043ce6 comms: force admin token to use othrys-seeder 2026-01-28 18:35:28 -03:00
9b8ef436c8 comms: fix vault_put indentation 2026-01-28 18:31:48 -03:00
8cf24a6c96 comms: source admin token from seeder access tokens 2026-01-28 18:29:49 -03:00
2797464b45 comms: mint synapse admin token with syt_ prefix 2026-01-28 18:20:37 -03:00
320cf901ba comms: rerun synapse admin ensure with device 2026-01-28 18:17:24 -03:00
5bb0fc126e comms: ensure synapse device for admin token 2026-01-28 18:10:55 -03:00
1b8271ed61 maintenance: restart ariadne after synapse token 2026-01-28 17:59:25 -03:00
fab030e9c0 comms: rotate invalid synapse admin token 2026-01-28 17:57:39 -03:00
be6b65cedb comms: rerun synapse admin ensure job 2026-01-28 17:54:53 -03:00
cbed39bd64 comms: run synapse admin ensure job 2026-01-28 17:50:01 -03:00
445622e936 comms: use bundled synapse admin ensure image 2026-01-28 17:47:58 -03:00
17e28d2891 maintenance: restart ariadne to reload secrets 2026-01-28 17:31:25 -03:00
8325827c41 comms: suspend synapse admin ensure job 2026-01-28 17:29:07 -03:00
7c7ed38ead comms: fix synapse admin ensure vault login 2026-01-28 17:27:39 -03:00
5d2fb32ff8 comms: rebuild synapse admin ensure job 2026-01-28 17:25:34 -03:00
flux-bot
b62a5ba3fb chore(maintenance): automated image update 2026-01-28 20:21:37 +00:00
359445ab43 comms: run synapse admin ensure job 2026-01-28 17:19:55 -03:00
4d1382cfc9 maintenance: track ariadne latest image 2026-01-28 14:04:58 -03:00
b66c7de5fd monitoring: avoid ariadne alert title conflict 2026-01-28 14:02:12 -03:00
3d4e5bdde1 monitoring: disable legacy cron alert 2026-01-28 13:58:28 -03:00
f37baf2447 monitoring: restart grafana to reload alerts 2026-01-28 13:53:33 -03:00
ad3d8d75c9 monitoring: reuse maint-cron uid for ariadne alert 2026-01-28 13:52:12 -03:00
4ecfdcef7c monitoring: restart grafana for ariadne alerts 2026-01-28 13:49:41 -03:00
flux-bot
63ae3e3f6f chore(comms): automated image update 2026-01-28 16:49:09 +00:00
eab2ce50b1 monitoring: alert on ariadne schedules 2026-01-28 13:47:54 -03:00
flux-bot
523db13be0 chore(maintenance): automated image update 2026-01-28 16:47:19 +00:00
6a3f8cffe1 comms: fix MAS job indentation 2026-01-28 13:25:51 -03:00
80a0f424cd comms: tolerate MAS login rate limits 2026-01-28 13:23:25 -03:00
8e9d85ccd7 comms: stop seeding atlas bots in synapse job 2026-01-28 13:18:44 -03:00
85abd589d4 comms: inject quick/smart bot creds for MAS job 2026-01-28 13:12:02 -03:00
flux-bot
bfbd707293 chore(bstein-dev-home): automated image update 2026-01-28 16:07:02 +00:00
flux-bot
526a895775 chore(bstein-dev-home): automated image update 2026-01-28 16:06:02 +00:00
38e1eba112 comms: add atlas quick/smart bots 2026-01-28 13:01:09 -03:00
flux-bot
f9e6cabe6d chore(comms): automated image update 2026-01-28 15:59:05 +00:00
36bb695c15 monitoring: fix grafana pod annotation indent 2026-01-28 12:37:42 -03:00
flux-bot
b449b65244 chore(comms): automated image update 2026-01-28 15:35:02 +00:00
1a9651914e monitoring: restart grafana after alert fix 2026-01-28 12:32:56 -03:00
flux-bot
9e5be20983 chore(comms): automated image update 2026-01-28 15:32:23 +00:00
d55bc98bbe monitoring: fix postmark alert metrics 2026-01-28 12:31:33 -03:00
flux-bot
46d677f5e7 chore(comms): automated image update 2026-01-28 15:22:49 +00:00
ef63b0f9f3 feat: add nats platform kustomization 2026-01-28 12:15:39 -03:00
111ae84255 chore: move flux sync to feature/atlasbot 2026-01-28 12:12:23 -03:00
d78a3c2550 comms: allow atlasbot to pull harbor images 2026-01-28 11:54:11 -03:00
fb89158622 atlasbot: move to service image and add nats queue infra 2026-01-28 11:52:37 -03:00
94 changed files with 8133 additions and 383 deletions

1
.gitignore vendored
View File

@ -2,6 +2,7 @@
!README.md !README.md
!knowledge/**/*.md !knowledge/**/*.md
!services/comms/knowledge/**/*.md !services/comms/knowledge/**/*.md
!services/atlasbot/knowledge/**/*.md
__pycache__/ __pycache__/
*.py[cod] *.py[cod]
.pytest_cache .pytest_cache

View File

@ -0,0 +1,26 @@
# clusters/atlas/flux-system/applications/atlasbot/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: atlasbot
namespace: ai
spec:
interval: 1m0s
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
git:
checkout:
ref:
branch: feature/atlasbot
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(atlasbot): automated image update"
push:
branch: feature/atlasbot
update:
strategy: Setters
path: services/atlasbot

View File

@ -0,0 +1,17 @@
# clusters/atlas/flux-system/applications/atlasbot/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: atlasbot
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: flux-system
path: ./services/atlasbot
targetNamespace: ai
timeout: 2m
dependsOn:
- name: ai-llm

View File

@ -13,14 +13,14 @@ spec:
git: git:
checkout: checkout:
ref: ref:
branch: feature/ariadne branch: feature/atlasbot
commit: commit:
author: author:
email: ops@bstein.dev email: ops@bstein.dev
name: flux-bot name: flux-bot
messageTemplate: "chore(bstein-dev-home): automated image update" messageTemplate: "chore(bstein-dev-home): automated image update"
push: push:
branch: feature/ariadne branch: feature/atlasbot
update: update:
strategy: Setters strategy: Setters
path: services/bstein-dev-home path: services/bstein-dev-home

View File

@ -0,0 +1,26 @@
# clusters/atlas/flux-system/applications/comms/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: comms
namespace: comms
spec:
interval: 1m0s
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
git:
checkout:
ref:
branch: feature/atlasbot
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(comms): automated image update"
push:
branch: feature/atlasbot
update:
strategy: Setters
path: services/comms

View File

@ -6,6 +6,9 @@ resources:
- vault/kustomization.yaml - vault/kustomization.yaml
- vaultwarden/kustomization.yaml - vaultwarden/kustomization.yaml
- comms/kustomization.yaml - comms/kustomization.yaml
- comms/image-automation.yaml
- atlasbot/kustomization.yaml
- atlasbot/image-automation.yaml
- crypto/kustomization.yaml - crypto/kustomization.yaml
- monerod/kustomization.yaml - monerod/kustomization.yaml
- pegasus/kustomization.yaml - pegasus/kustomization.yaml

View File

@ -9,7 +9,7 @@ metadata:
spec: spec:
interval: 1m0s interval: 1m0s
ref: ref:
branch: feature/ariadne branch: feature/atlasbot
secretRef: secretRef:
name: flux-system-gitea name: flux-system-gitea
url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git

View File

@ -16,5 +16,6 @@ resources:
- longhorn/kustomization.yaml - longhorn/kustomization.yaml
- longhorn-ui/kustomization.yaml - longhorn-ui/kustomization.yaml
- postgres/kustomization.yaml - postgres/kustomization.yaml
- nats/kustomization.yaml
- ../platform/vault-csi/kustomization.yaml - ../platform/vault-csi/kustomization.yaml
- ../platform/vault-injector/kustomization.yaml - ../platform/vault-injector/kustomization.yaml

View File

@ -13,14 +13,14 @@ spec:
git: git:
checkout: checkout:
ref: ref:
branch: feature/ariadne branch: feature/atlasbot
commit: commit:
author: author:
email: ops@bstein.dev email: ops@bstein.dev
name: flux-bot name: flux-bot
messageTemplate: "chore(maintenance): automated image update" messageTemplate: "chore(maintenance): automated image update"
push: push:
branch: feature/ariadne branch: feature/atlasbot
update: update:
strategy: Setters strategy: Setters
path: services/maintenance path: services/maintenance

View File

@ -0,0 +1,21 @@
# clusters/atlas/flux-system/platform/nats/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: nats
namespace: flux-system
spec:
interval: 10m
path: ./infrastructure/nats
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: nats
healthChecks:
- apiVersion: apps/v1
kind: StatefulSet
name: nats
namespace: nats
wait: true

View File

@ -0,0 +1,3 @@
FROM python:3.11-slim
RUN pip install --no-cache-dir psycopg2-binary bcrypt

View File

@ -6,6 +6,7 @@ resources:
- ../modules/profiles/atlas-ha - ../modules/profiles/atlas-ha
- coredns-custom.yaml - coredns-custom.yaml
- coredns-deployment.yaml - coredns-deployment.yaml
- longhorn-node-taints.yaml
- ntp-sync-daemonset.yaml - ntp-sync-daemonset.yaml
- ../sources/cert-manager/letsencrypt.yaml - ../sources/cert-manager/letsencrypt.yaml
- ../sources/cert-manager/letsencrypt-prod.yaml - ../sources/cert-manager/letsencrypt-prod.yaml

View File

@ -0,0 +1,40 @@
# infrastructure/core/longhorn-node-taints.yaml
apiVersion: v1
kind: Node
metadata:
name: titan-13
spec:
taints:
- key: longhorn
value: "true"
effect: PreferNoSchedule
---
apiVersion: v1
kind: Node
metadata:
name: titan-15
spec:
taints:
- key: longhorn
value: "true"
effect: PreferNoSchedule
---
apiVersion: v1
kind: Node
metadata:
name: titan-17
spec:
taints:
- key: longhorn
value: "true"
effect: PreferNoSchedule
---
apiVersion: v1
kind: Node
metadata:
name: titan-19
spec:
taints:
- key: longhorn
value: "true"
effect: PreferNoSchedule

View File

@ -0,0 +1,10 @@
# infrastructure/longhorn/core/backup-target.yaml
apiVersion: longhorn.io/v1beta2
kind: BackupTarget
metadata:
name: default
namespace: longhorn-system
spec:
backupTargetURL: "s3://atlas-soteria@us-west-004/"
credentialSecret: longhorn-backup-b2
pollInterval: 5m0s

View File

@ -6,6 +6,39 @@ metadata:
namespace: longhorn-system namespace: longhorn-system
spec: spec:
interval: 30m interval: 30m
postRenderers:
- kustomize:
patches:
- target:
kind: Service
name: longhorn-conversion-webhook
namespace: longhorn-system
patch: |
- op: add
path: /spec/publishNotReadyAddresses
value: true
- target:
kind: Service
name: longhorn-admission-webhook
namespace: longhorn-system
patch: |
- op: add
path: /spec/publishNotReadyAddresses
value: true
- target:
kind: DaemonSet
name: longhorn-manager
namespace: longhorn-system
patch: |
- op: replace
path: /spec/template/spec/containers/0/readinessProbe/httpGet/path
value: /v1/healthz
- op: replace
path: /spec/template/spec/containers/0/readinessProbe/httpGet/port
value: 9500
- op: replace
path: /spec/template/spec/containers/0/readinessProbe/httpGet/scheme
value: HTTP
chart: chart:
spec: spec:
chart: longhorn chart: longhorn
@ -34,7 +67,7 @@ spec:
createSecret: false createSecret: false
registrySecret: longhorn-registry registrySecret: longhorn-registry
image: image:
pullPolicy: Always pullPolicy: IfNotPresent
longhorn: longhorn:
engine: engine:
repository: registry.bstein.dev/infra/longhorn-engine repository: registry.bstein.dev/infra/longhorn-engine
@ -77,4 +110,4 @@ spec:
repository: registry.bstein.dev/infra/longhorn-livenessprobe repository: registry.bstein.dev/infra/longhorn-livenessprobe
tag: v2.16.0 tag: v2.16.0
defaultSettings: defaultSettings:
systemManagedPodsImagePullPolicy: Always systemManagedPodsImagePullPolicy: IfNotPresent

View File

@ -6,6 +6,7 @@ resources:
- vault-serviceaccount.yaml - vault-serviceaccount.yaml
- secretproviderclass.yaml - secretproviderclass.yaml
- vault-sync-deployment.yaml - vault-sync-deployment.yaml
- backup-target.yaml
- helmrelease.yaml - helmrelease.yaml
- longhorn-settings-ensure-job.yaml - longhorn-settings-ensure-job.yaml

View File

@ -13,9 +13,27 @@ spec:
- objectName: "harbor-pull__dockerconfigjson" - objectName: "harbor-pull__dockerconfigjson"
secretPath: "kv/data/atlas/shared/harbor-pull" secretPath: "kv/data/atlas/shared/harbor-pull"
secretKey: "dockerconfigjson" secretKey: "dockerconfigjson"
- objectName: "longhorn_backup__AWS_ACCESS_KEY_ID"
secretPath: "kv/data/atlas/longhorn/backup-b2"
secretKey: "AWS_ACCESS_KEY_ID"
- objectName: "longhorn_backup__AWS_SECRET_ACCESS_KEY"
secretPath: "kv/data/atlas/longhorn/backup-b2"
secretKey: "AWS_SECRET_ACCESS_KEY"
- objectName: "longhorn_backup__AWS_ENDPOINTS"
secretPath: "kv/data/atlas/longhorn/backup-b2"
secretKey: "AWS_ENDPOINTS"
secretObjects: secretObjects:
- secretName: longhorn-registry - secretName: longhorn-registry
type: kubernetes.io/dockerconfigjson type: kubernetes.io/dockerconfigjson
data: data:
- objectName: harbor-pull__dockerconfigjson - objectName: harbor-pull__dockerconfigjson
key: .dockerconfigjson key: .dockerconfigjson
- secretName: longhorn-backup-b2
type: Opaque
data:
- objectName: longhorn_backup__AWS_ACCESS_KEY_ID
key: AWS_ACCESS_KEY_ID
- objectName: longhorn_backup__AWS_SECRET_ACCESS_KEY
key: AWS_SECRET_ACCESS_KEY
- objectName: longhorn_backup__AWS_ENDPOINTS
key: AWS_ENDPOINTS

View File

@ -0,0 +1,17 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: nats-config
namespace: nats
labels:
app: nats
component: config
annotations:
description: "NATS JetStream configuration"
data:
nats.conf: |
jetstream {
store_dir: /data
max_mem_store: 128MB
max_file_store: 1GB
}

View File

@ -0,0 +1,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- configmap.yaml
- service.yaml
- statefulset.yaml

View File

@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: nats

View File

@ -0,0 +1,17 @@
apiVersion: v1
kind: Service
metadata:
name: nats
namespace: nats
labels:
app: nats
spec:
selector:
app: nats
ports:
- name: client
port: 4222
targetPort: 4222
- name: monitoring
port: 8222
targetPort: 8222

View File

@ -0,0 +1,54 @@
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: nats
namespace: nats
labels:
app: nats
spec:
serviceName: nats
replicas: 1
selector:
matchLabels:
app: nats
template:
metadata:
labels:
app: nats
spec:
containers:
- name: nats
image: nats:2.10.18
args:
- "-c"
- "/etc/nats/nats.conf"
ports:
- name: client
containerPort: 4222
- name: monitoring
containerPort: 8222
volumeMounts:
- name: config
mountPath: /etc/nats
- name: data
mountPath: /data
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
volumes:
- name: config
configMap:
name: nats-config
volumeClaimTemplates:
- metadata:
name: data
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 2Gi

View File

@ -47,6 +47,7 @@ PERCENT_THRESHOLDS = {
} }
NAMESPACE_CPU_WINDOW = "1m" NAMESPACE_CPU_WINDOW = "1m"
GPU_RESOURCE_REGEX = r"nvidia[.]com/gpu.*|nvidia_com_gpu.*"
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Cluster metadata # Cluster metadata
@ -235,13 +236,16 @@ def gpu_util_by_hostname():
def gpu_node_labels(): def gpu_node_labels():
return 'kube_node_labels{label_accelerator=~".+"} or kube_node_labels{label_jetson="true"}' return (
f'(max by (node) (kube_node_status_allocatable{{resource=~"{GPU_RESOURCE_REGEX}"}} > bool 0))'
' or kube_node_labels{label_jetson="true"}'
)
def gpu_requests_by_namespace_node(scope_var): def gpu_requests_by_namespace_node(scope_var):
return ( return (
"sum by (namespace,node) (" "sum by (namespace,node) ("
f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} ' f'kube_pod_container_resource_requests{{resource=~"{GPU_RESOURCE_REGEX}",{scope_var}}} '
"* on(namespace,pod) group_left(node) kube_pod_info " "* on(namespace,pod) group_left(node) kube_pod_info "
f"* on(node) group_left() ({gpu_node_labels()})" f"* on(node) group_left() ({gpu_node_labels()})"
")" ")"
@ -253,7 +257,7 @@ def gpu_usage_by_namespace(scope_var):
total_by_node = f"sum by (node) ({requests_by_ns})" total_by_node = f"sum by (node) ({requests_by_ns})"
return ( return (
"sum by (namespace) (" "sum by (namespace) ("
f"({requests_by_ns}) / clamp_min({total_by_node}, 1) " f"({requests_by_ns}) / on(node) group_left() clamp_min({total_by_node}, 1) "
f"* on(node) group_left() ({gpu_util_by_node()})" f"* on(node) group_left() ({gpu_util_by_node()})"
")" ")"
) )
@ -419,16 +423,17 @@ ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = (
"(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600" "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600"
) )
ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}' TEST_REPO_SELECTOR = 'repo=~"ariadne|metis"'
ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}' TEST_CI_COVERAGE = f'ariadne_ci_coverage_percent{{{TEST_REPO_SELECTOR}}}'
ARIADNE_TEST_SUCCESS_RATE = ( TEST_CI_TESTS = f'ariadne_ci_tests_total{{{TEST_REPO_SELECTOR}}}'
TEST_SUCCESS_RATE = (
"100 * " "100 * "
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[30d])) ' f'sum(max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result="passed"}}[30d])) '
"/ clamp_min(" "/ clamp_min("
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[30d])), 1)' f'sum(max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"passed|failed|error"}}[30d])), 1)'
) )
ARIADNE_TEST_FAILURES_24H = ( TEST_FAILURES_24H = (
'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))' f'sum by (result) (max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"failed|error"}}[24h]))'
) )
POSTGRES_CONN_USED = ( POSTGRES_CONN_USED = (
'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") ' 'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") '
@ -1290,23 +1295,25 @@ def build_overview():
}, },
} }
) )
panels.append( test_success = timeseries_panel(
timeseries_panel(
42, 42,
"Ariadne Test Success Rate", "Platform Test Success Rate",
ARIADNE_TEST_SUCCESS_RATE, TEST_SUCCESS_RATE,
{"h": 6, "w": 6, "x": 12, "y": 14}, {"h": 6, "w": 6, "x": 12, "y": 14},
unit="percent", unit="percent",
max_value=100, max_value=100,
legend=None, legend=None,
legend_display="list", legend_display="list",
) )
test_success["description"] = (
"Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. "
"Add new test series there first so they roll up here."
) )
panels.append( panels.append(test_success)
bargauge_panel( test_failures = bargauge_panel(
43, 43,
"Tests with Failures (24h)", "Platform Tests with Failures (24h)",
ARIADNE_TEST_FAILURES_24H, TEST_FAILURES_24H,
{"h": 6, "w": 6, "x": 18, "y": 14}, {"h": 6, "w": 6, "x": 18, "y": 14},
unit="none", unit="none",
instant=True, instant=True,
@ -1331,7 +1338,10 @@ def build_overview():
], ],
}, },
) )
test_failures["description"] = (
"This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query."
) )
panels.append(test_failures)
cpu_scope = "$namespace_scope_cpu" cpu_scope = "$namespace_scope_cpu"
gpu_scope = "$namespace_scope_gpu" gpu_scope = "$namespace_scope_gpu"
@ -2649,29 +2659,31 @@ def build_jobs_dashboard():
legend="{{status}}", legend="{{status}}",
) )
) )
panels.append( coverage_panel = stat_panel(
stat_panel(
17, 17,
"Ariadne CI Coverage (%)", "Platform CI Coverage (%)",
ARIADNE_CI_COVERAGE, TEST_CI_COVERAGE,
{"h": 6, "w": 4, "x": 8, "y": 11}, {"h": 6, "w": 4, "x": 8, "y": 11},
unit="percent", unit="percent",
decimals=1, decimals=1,
instant=True, instant=True,
legend="{{branch}}", legend="{{branch}}",
) )
) coverage_panel["description"] = "Internal source panel for Atlas Overview automation test rollups."
panels.append( panels.append(coverage_panel)
table_panel( tests_panel = table_panel(
18, 18,
"Ariadne CI Tests (latest)", "Platform CI Tests (latest)",
ARIADNE_CI_TESTS, TEST_CI_TESTS,
{"h": 6, "w": 12, "x": 12, "y": 11}, {"h": 6, "w": 12, "x": 12, "y": 11},
unit="none", unit="none",
transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}], transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}],
instant=True, instant=True,
) )
tests_panel["description"] = (
"Atlas Overview test panels depend on these internal repo-tagged CI series."
) )
panels.append(tests_panel)
return { return {
"uid": "atlas-jobs", "uid": "atlas-jobs",

View File

@ -539,9 +539,9 @@ def main() -> int:
help="Write generated files (otherwise just print a summary).", help="Write generated files (otherwise just print a summary).",
) )
ap.add_argument( ap.add_argument(
"--sync-comms", "--sync-atlasbot",
action="store_true", action="store_true",
help="Mirror rendered knowledge into services/comms/knowledge for atlasbot.", help="Mirror rendered knowledge into services/atlasbot/knowledge for atlasbot.",
) )
args = ap.parse_args() args = ap.parse_args()
@ -632,10 +632,10 @@ def main() -> int:
print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}") print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}")
print(f"Wrote {metrics_json_path.relative_to(REPO_ROOT)}") print(f"Wrote {metrics_json_path.relative_to(REPO_ROOT)}")
if args.sync_comms: if args.sync_atlasbot:
comms_dir = REPO_ROOT / "services" / "comms" / "knowledge" atlasbot_dir = REPO_ROOT / "services" / "atlasbot" / "knowledge"
_sync_tree(out_dir, comms_dir) _sync_tree(out_dir, atlasbot_dir)
print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {comms_dir.relative_to(REPO_ROOT)}") print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {atlasbot_dir.relative_to(REPO_ROOT)}")
return 0 return 0

View File

@ -3,7 +3,7 @@ apiVersion: apps/v1
kind: Deployment kind: Deployment
metadata: metadata:
name: atlasbot name: atlasbot
namespace: comms namespace: ai
labels: labels:
app: atlasbot app: atlasbot
spec: spec:
@ -18,7 +18,7 @@ spec:
annotations: annotations:
checksum/atlasbot-configmap: manual-atlasbot-101 checksum/atlasbot-configmap: manual-atlasbot-101
vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "comms" vault.hashicorp.com/role: "ai"
vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
vault.hashicorp.com/agent-inject-template-turn-secret: | vault.hashicorp.com/agent-inject-template-turn-secret: |
{{- with secret "kv/data/atlas/comms/turn-shared-secret" -}}{{ .Data.data.TURN_STATIC_AUTH_SECRET }}{{- end -}} {{- with secret "kv/data/atlas/comms/turn-shared-secret" -}}{{ .Data.data.TURN_STATIC_AUTH_SECRET }}{{- end -}}
@ -28,6 +28,15 @@ spec:
vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime" vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-pass: | vault.hashicorp.com/agent-inject-template-bot-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}} {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-bot-quick-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-quick-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-quick-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-bot-smart-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-smart-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-smart-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-bot-genius-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-genius-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-genius-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-seeder-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime" vault.hashicorp.com/agent-inject-secret-seeder-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-seeder-pass: | vault.hashicorp.com/agent-inject-template-seeder-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "seeder-password" }}{{- end -}} {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "seeder-password" }}{{- end -}}
@ -58,17 +67,17 @@ spec:
hardware: rpi5 hardware: rpi5
containers: containers:
- name: atlasbot - name: atlasbot
image: python:3.11-slim image: registry.bstein.dev/bstein/atlasbot:0.1.0-55
command: ["/bin/sh","-c"] command: ["/bin/sh","-c"]
args: args:
- | - |
. /vault/scripts/comms_vault_env.sh . /vault/scripts/atlasbot_vault_env.sh
exec python /app/bot.py exec python -m atlasbot.main
env: env:
- name: MATRIX_BASE - name: MATRIX_BASE
value: http://othrys-synapse-matrix-synapse:8008 value: http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008
- name: AUTH_BASE - name: AUTH_BASE
value: http://matrix-authentication-service:8080 value: http://matrix-authentication-service.comms.svc.cluster.local:8080
- name: KB_DIR - name: KB_DIR
value: /kb value: /kb
- name: VM_URL - name: VM_URL
@ -76,27 +85,69 @@ spec:
- name: ARIADNE_STATE_URL - name: ARIADNE_STATE_URL
value: http://ariadne.maintenance.svc.cluster.local/api/internal/cluster/state value: http://ariadne.maintenance.svc.cluster.local/api/internal/cluster/state
- name: BOT_USER - name: BOT_USER
value: atlasbot value: atlas-smart
- name: BOT_USER_QUICK
value: atlas-quick
- name: BOT_USER_SMART
value: atlas-smart
- name: BOT_USER_GENIUS
value: atlas-genius
- name: BOT_MENTIONS - name: BOT_MENTIONS
value: atlasbot,aatlasbot,atlas_quick,atlas_smart value: atlas-quick,atlas-smart,atlas-genius
- name: OLLAMA_URL - name: OLLAMA_URL
value: http://ollama.ai.svc.cluster.local:11434 value: http://ollama.ai.svc.cluster.local:11434
- name: OLLAMA_MODEL - name: OLLAMA_MODEL
value: qwen2.5:14b-instruct
- name: ATLASBOT_MODEL_FAST
value: qwen2.5:14b-instruct-q4_0 value: qwen2.5:14b-instruct-q4_0
- name: ATLASBOT_MODEL_DEEP - name: ATLASBOT_MODEL_FAST
value: qwen2.5:14b-instruct value: qwen2.5-coder:7b-instruct-q4_0
- name: ATLASBOT_MODEL_SMART
value: qwen2.5:14b-instruct-q4_0
- name: ATLASBOT_MODEL_GENIUS
value: qwen2.5:14b-instruct-q4_0
- name: OLLAMA_FALLBACK_MODEL - name: OLLAMA_FALLBACK_MODEL
value: qwen2.5:14b-instruct-q4_0 value: qwen2.5:14b-instruct-q4_0
- name: OLLAMA_TIMEOUT_SEC - name: OLLAMA_TIMEOUT_SEC
value: "600" value: "600"
- name: OLLAMA_RETRIES
value: "0"
- name: ATLASBOT_THINKING_INTERVAL_SEC - name: ATLASBOT_THINKING_INTERVAL_SEC
value: "120" value: "30"
- name: ATLASBOT_QUICK_TIME_BUDGET_SEC
value: "15"
- name: ATLASBOT_SMART_TIME_BUDGET_SEC
value: "45"
- name: ATLASBOT_GENIUS_TIME_BUDGET_SEC
value: "180"
- name: ATLASBOT_SNAPSHOT_TTL_SEC - name: ATLASBOT_SNAPSHOT_TTL_SEC
value: "30" value: "30"
- name: ATLASBOT_HTTP_PORT - name: ATLASBOT_HTTP_PORT
value: "8090" value: "8090"
- name: ATLASBOT_STATE_DB
value: /data/atlasbot_state.db
- name: ATLASBOT_QUEUE_ENABLED
value: "false"
- name: ATLASBOT_DEBUG_PIPELINE
value: "true"
- name: ATLASBOT_NATS_URL
value: nats://nats.nats.svc.cluster.local:4222
- name: ATLASBOT_NATS_STREAM
value: atlasbot
- name: ATLASBOT_NATS_SUBJECT
value: atlasbot.requests
- name: ATLASBOT_FAST_MAX_ANGLES
value: "2"
- name: ATLASBOT_SMART_MAX_ANGLES
value: "5"
- name: ATLASBOT_FAST_MAX_CANDIDATES
value: "2"
- name: ATLASBOT_SMART_MAX_CANDIDATES
value: "6"
- name: ATLASBOT_FAST_LLM_CALLS_MAX
value: "8"
- name: ATLASBOT_SMART_LLM_CALLS_MAX
value: "24"
- name: ATLASBOT_GENIUS_LLM_CALLS_MAX
value: "72"
ports: ports:
- name: http - name: http
containerPort: 8090 containerPort: 8090
@ -108,19 +159,15 @@ spec:
cpu: 500m cpu: 500m
memory: 512Mi memory: 512Mi
volumeMounts: volumeMounts:
- name: code
mountPath: /app/bot.py
subPath: bot.py
- name: kb - name: kb
mountPath: /kb mountPath: /kb
readOnly: true readOnly: true
- name: vault-scripts - name: vault-scripts
mountPath: /vault/scripts mountPath: /vault/scripts
readOnly: true readOnly: true
- name: atlasbot-state
mountPath: /data
volumes: volumes:
- name: code
configMap:
name: atlasbot
- name: kb - name: kb
configMap: configMap:
name: atlas-kb name: atlas-kb
@ -139,5 +186,7 @@ spec:
path: diagrams/atlas-http.mmd path: diagrams/atlas-http.mmd
- name: vault-scripts - name: vault-scripts
configMap: configMap:
name: comms-vault-env name: atlasbot-vault-env
defaultMode: 0555 defaultMode: 0555
- name: atlasbot-state
emptyDir: {}

View File

@ -3,7 +3,9 @@ apiVersion: v1
kind: ServiceAccount kind: ServiceAccount
metadata: metadata:
name: atlasbot name: atlasbot
namespace: comms namespace: ai
imagePullSecrets:
- name: harbor-regcred
--- ---
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole kind: ClusterRole
@ -43,5 +45,4 @@ roleRef:
subjects: subjects:
- kind: ServiceAccount - kind: ServiceAccount
name: atlasbot name: atlasbot
namespace: comms namespace: ai

View File

@ -2,7 +2,7 @@ apiVersion: v1
kind: Service kind: Service
metadata: metadata:
name: atlasbot name: atlasbot
namespace: comms namespace: ai
labels: labels:
app: atlasbot app: atlasbot
spec: spec:

View File

@ -0,0 +1,26 @@
# services/atlasbot/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: atlasbot
namespace: ai
spec:
interval: 1m0s
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
git:
checkout:
ref:
branch: feature/atlasbot
commit:
author:
name: flux-bot
email: ops@bstein.dev
messageTemplate: "chore(atlasbot): automated image update"
push:
branch: feature/atlasbot
update:
path: services/atlasbot
strategy: Setters

View File

@ -0,0 +1,23 @@
# services/comms/image.yaml
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
name: atlasbot
namespace: ai
spec:
image: registry.bstein.dev/bstein/atlasbot
interval: 1m0s
secretRef:
name: harbor-regcred
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
name: atlasbot
namespace: ai
spec:
imageRepositoryRef:
name: atlasbot
policy:
semver:
range: ">=0.1.0-0"

View File

@ -0,0 +1,22 @@
Atlas Knowledge Base (KB)
This folder is the source-of-truth “memory” for Atlas/Titan assistants (and for humans). It is designed to be:
- Accurate (grounded in GitOps + read-only cluster tools)
- Maintainable (small docs + deterministic generators)
- Safe (no secrets; refer to Secret/Vault paths by name only)
Layout
- `knowledge/runbooks/`: human-written docs (short, chunkable Markdown).
- `knowledge/catalog/`: generated machine-readable facts (YAML/JSON).
- `knowledge/diagrams/`: generated Mermaid diagrams (`.mmd`) derived from the catalog.
Regeneration
- Update manifests/docs, then regenerate generated artifacts:
- `python scripts/knowledge_render_atlas.py --write`
Authoring rules
- Never include secret values. Prefer `secretRef` names or Vault paths like `kv/atlas/...`.
- Prefer stable identifiers: Kubernetes `namespace/name`, DNS hostnames, Flux kustomization paths.
- Keep each runbook small; one topic per file; use headings.
- When in doubt, link to the exact file path in this repo that configures the behavior.

View File

@ -0,0 +1,8 @@
{
"counts": {
"helmrelease_host_hints": 19,
"http_endpoints": 45,
"services": 47,
"workloads": 74
}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,234 @@
flowchart LR
host_auth_bstein_dev["auth.bstein.dev"]
svc_sso_oauth2_proxy["sso/oauth2-proxy (Service)"]
host_auth_bstein_dev --> svc_sso_oauth2_proxy
wl_sso_oauth2_proxy["sso/oauth2-proxy (Deployment)"]
svc_sso_oauth2_proxy --> wl_sso_oauth2_proxy
host_bstein_dev["bstein.dev"]
svc_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Service)"]
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_frontend
wl_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Deployment)"]
svc_bstein_dev_home_bstein_dev_home_frontend --> wl_bstein_dev_home_bstein_dev_home_frontend
svc_comms_matrix_wellknown["comms/matrix-wellknown (Service)"]
host_bstein_dev --> svc_comms_matrix_wellknown
wl_comms_matrix_wellknown["comms/matrix-wellknown (Deployment)"]
svc_comms_matrix_wellknown --> wl_comms_matrix_wellknown
svc_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Service)"]
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend
wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"]
svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend
host_budget_bstein_dev["budget.bstein.dev"]
svc_finance_actual_budget["finance/actual-budget (Service)"]
host_budget_bstein_dev --> svc_finance_actual_budget
wl_finance_actual_budget["finance/actual-budget (Deployment)"]
svc_finance_actual_budget --> wl_finance_actual_budget
host_call_live_bstein_dev["call.live.bstein.dev"]
svc_comms_element_call["comms/element-call (Service)"]
host_call_live_bstein_dev --> svc_comms_element_call
wl_comms_element_call["comms/element-call (Deployment)"]
svc_comms_element_call --> wl_comms_element_call
host_chat_ai_bstein_dev["chat.ai.bstein.dev"]
svc_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Service)"]
host_chat_ai_bstein_dev --> svc_bstein_dev_home_chat_ai_gateway
wl_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Deployment)"]
svc_bstein_dev_home_chat_ai_gateway --> wl_bstein_dev_home_chat_ai_gateway
host_ci_bstein_dev["ci.bstein.dev"]
svc_jenkins_jenkins["jenkins/jenkins (Service)"]
host_ci_bstein_dev --> svc_jenkins_jenkins
wl_jenkins_jenkins["jenkins/jenkins (Deployment)"]
svc_jenkins_jenkins --> wl_jenkins_jenkins
host_cloud_bstein_dev["cloud.bstein.dev"]
svc_nextcloud_nextcloud["nextcloud/nextcloud (Service)"]
host_cloud_bstein_dev --> svc_nextcloud_nextcloud
wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"]
svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud
host_health_bstein_dev["health.bstein.dev"]
svc_health_wger["health/wger (Service)"]
host_health_bstein_dev --> svc_health_wger
wl_health_wger["health/wger (Deployment)"]
svc_health_wger --> wl_health_wger
host_kit_live_bstein_dev["kit.live.bstein.dev"]
svc_comms_livekit_token_service["comms/livekit-token-service (Service)"]
host_kit_live_bstein_dev --> svc_comms_livekit_token_service
wl_comms_livekit_token_service["comms/livekit-token-service (Deployment)"]
svc_comms_livekit_token_service --> wl_comms_livekit_token_service
svc_comms_livekit["comms/livekit (Service)"]
host_kit_live_bstein_dev --> svc_comms_livekit
wl_comms_livekit["comms/livekit (Deployment)"]
svc_comms_livekit --> wl_comms_livekit
host_live_bstein_dev["live.bstein.dev"]
host_live_bstein_dev --> svc_comms_matrix_wellknown
svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"]
host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
host_live_bstein_dev --> svc_comms_matrix_guest_register
wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
host_live_bstein_dev --> svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
host_logs_bstein_dev["logs.bstein.dev"]
svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"]
host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs
wl_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Deployment)"]
svc_logging_oauth2_proxy_logs --> wl_logging_oauth2_proxy_logs
host_longhorn_bstein_dev["longhorn.bstein.dev"]
svc_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Service)"]
host_longhorn_bstein_dev --> svc_longhorn_system_oauth2_proxy_longhorn
wl_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Deployment)"]
svc_longhorn_system_oauth2_proxy_longhorn --> wl_longhorn_system_oauth2_proxy_longhorn
host_mail_bstein_dev["mail.bstein.dev"]
svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"]
host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front
host_matrix_live_bstein_dev["matrix.live.bstein.dev"]
host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service
host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown
host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register
host_monero_bstein_dev["monero.bstein.dev"]
svc_crypto_monerod["crypto/monerod (Service)"]
host_monero_bstein_dev --> svc_crypto_monerod
wl_crypto_monerod["crypto/monerod (Deployment)"]
svc_crypto_monerod --> wl_crypto_monerod
host_money_bstein_dev["money.bstein.dev"]
svc_finance_firefly["finance/firefly (Service)"]
host_money_bstein_dev --> svc_finance_firefly
wl_finance_firefly["finance/firefly (Deployment)"]
svc_finance_firefly --> wl_finance_firefly
host_notes_bstein_dev["notes.bstein.dev"]
svc_outline_outline["outline/outline (Service)"]
host_notes_bstein_dev --> svc_outline_outline
wl_outline_outline["outline/outline (Deployment)"]
svc_outline_outline --> wl_outline_outline
host_office_bstein_dev["office.bstein.dev"]
svc_nextcloud_collabora["nextcloud/collabora (Service)"]
host_office_bstein_dev --> svc_nextcloud_collabora
wl_nextcloud_collabora["nextcloud/collabora (Deployment)"]
svc_nextcloud_collabora --> wl_nextcloud_collabora
host_pegasus_bstein_dev["pegasus.bstein.dev"]
svc_jellyfin_pegasus["jellyfin/pegasus (Service)"]
host_pegasus_bstein_dev --> svc_jellyfin_pegasus
wl_jellyfin_pegasus["jellyfin/pegasus (Deployment)"]
svc_jellyfin_pegasus --> wl_jellyfin_pegasus
host_scm_bstein_dev["scm.bstein.dev"]
svc_gitea_gitea["gitea/gitea (Service)"]
host_scm_bstein_dev --> svc_gitea_gitea
wl_gitea_gitea["gitea/gitea (Deployment)"]
svc_gitea_gitea --> wl_gitea_gitea
host_secret_bstein_dev["secret.bstein.dev"]
svc_vault_vault["vault/vault (Service)"]
host_secret_bstein_dev --> svc_vault_vault
wl_vault_vault["vault/vault (StatefulSet)"]
svc_vault_vault --> wl_vault_vault
host_sso_bstein_dev["sso.bstein.dev"]
svc_sso_keycloak["sso/keycloak (Service)"]
host_sso_bstein_dev --> svc_sso_keycloak
wl_sso_keycloak["sso/keycloak (Deployment)"]
svc_sso_keycloak --> wl_sso_keycloak
host_stream_bstein_dev["stream.bstein.dev"]
svc_jellyfin_jellyfin["jellyfin/jellyfin (Service)"]
host_stream_bstein_dev --> svc_jellyfin_jellyfin
wl_jellyfin_jellyfin["jellyfin/jellyfin (Deployment)"]
svc_jellyfin_jellyfin --> wl_jellyfin_jellyfin
host_tasks_bstein_dev["tasks.bstein.dev"]
svc_planka_planka["planka/planka (Service)"]
host_tasks_bstein_dev --> svc_planka_planka
wl_planka_planka["planka/planka (Deployment)"]
svc_planka_planka --> wl_planka_planka
host_vault_bstein_dev["vault.bstein.dev"]
svc_vaultwarden_vaultwarden_service["vaultwarden/vaultwarden-service (Service)"]
host_vault_bstein_dev --> svc_vaultwarden_vaultwarden_service
wl_vaultwarden_vaultwarden["vaultwarden/vaultwarden (Deployment)"]
svc_vaultwarden_vaultwarden_service --> wl_vaultwarden_vaultwarden
subgraph bstein_dev_home[bstein-dev-home]
svc_bstein_dev_home_bstein_dev_home_frontend
wl_bstein_dev_home_bstein_dev_home_frontend
svc_bstein_dev_home_bstein_dev_home_backend
wl_bstein_dev_home_bstein_dev_home_backend
svc_bstein_dev_home_chat_ai_gateway
wl_bstein_dev_home_chat_ai_gateway
end
subgraph comms[comms]
svc_comms_matrix_wellknown
wl_comms_matrix_wellknown
svc_comms_element_call
wl_comms_element_call
svc_comms_livekit_token_service
wl_comms_livekit_token_service
svc_comms_livekit
wl_comms_livekit
svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register
wl_comms_matrix_guest_register
svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service
end
subgraph crypto[crypto]
svc_crypto_monerod
wl_crypto_monerod
end
subgraph finance[finance]
svc_finance_actual_budget
wl_finance_actual_budget
svc_finance_firefly
wl_finance_firefly
end
subgraph gitea[gitea]
svc_gitea_gitea
wl_gitea_gitea
end
subgraph health[health]
svc_health_wger
wl_health_wger
end
subgraph jellyfin[jellyfin]
svc_jellyfin_pegasus
wl_jellyfin_pegasus
svc_jellyfin_jellyfin
wl_jellyfin_jellyfin
end
subgraph jenkins[jenkins]
svc_jenkins_jenkins
wl_jenkins_jenkins
end
subgraph logging[logging]
svc_logging_oauth2_proxy_logs
wl_logging_oauth2_proxy_logs
end
subgraph longhorn_system[longhorn-system]
svc_longhorn_system_oauth2_proxy_longhorn
wl_longhorn_system_oauth2_proxy_longhorn
end
subgraph mailu_mailserver[mailu-mailserver]
svc_mailu_mailserver_mailu_front
end
subgraph nextcloud[nextcloud]
svc_nextcloud_nextcloud
wl_nextcloud_nextcloud
svc_nextcloud_collabora
wl_nextcloud_collabora
end
subgraph outline[outline]
svc_outline_outline
wl_outline_outline
end
subgraph planka[planka]
svc_planka_planka
wl_planka_planka
end
subgraph sso[sso]
svc_sso_oauth2_proxy
wl_sso_oauth2_proxy
svc_sso_keycloak
wl_sso_keycloak
end
subgraph vault[vault]
svc_vault_vault
wl_vault_vault
end
subgraph vaultwarden[vaultwarden]
svc_vaultwarden_vaultwarden_service
wl_vaultwarden_vaultwarden
end

View File

@ -0,0 +1,29 @@
# services/atlasbot/kustomization.yaml
# Kustomize entry point for the atlasbot service; everything lands in the
# `ai` namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: ai
resources:
  - atlasbot-deployment.yaml
  - atlasbot-service.yaml
  - atlasbot-rbac.yaml
  - secretproviderclass.yaml
  - vault-sync-deployment.yaml
  - image.yaml              # Flux ImageRepository + ImagePolicy
  - image-automation.yaml   # Flux ImageUpdateAutomation (writes newTag below)
images:
  # The tag is rewritten automatically by Flux image automation via the
  # $imagepolicy marker comment; do not edit it by hand.
  - name: registry.bstein.dev/bstein/atlasbot
    newTag: 0.1.2-106 # {"$imagepolicy": "ai:atlasbot:tag"}
configMapGenerator:
  # Vault env bootstrap script (scripts/atlasbot_vault_env.sh) mounted into
  # the bot pod.
  - name: atlasbot-vault-env
    files:
      - atlasbot_vault_env.sh=scripts/atlasbot_vault_env.sh
    options:
      # Stable name so the Deployment can reference it without a hash suffix.
      disableNameSuffixHash: true
  # Knowledge-base bundle: generated catalog/runbooks/diagram plus the index
  # (see knowledge/ README for the regeneration workflow).
  - name: atlas-kb
    files:
      - INDEX.md=knowledge/INDEX.md
      - atlas.json=knowledge/catalog/atlas.json
      - atlas-summary.json=knowledge/catalog/atlas-summary.json
      - metrics.json=knowledge/catalog/metrics.json
      - runbooks.json=knowledge/catalog/runbooks.json
      - atlas-http.mmd=knowledge/diagrams/atlas-http.mmd

View File

@ -0,0 +1,44 @@
#!/usr/bin/env sh
# Resolve atlasbot runtime credentials from the Vault-agent/CSI rendered files
# under /vault/secrets and export them as environment variables for the bot.
# Never echoes secret values.
set -eu

vault_dir="/vault/secrets"

# Print a *required* secret with all CR/LF characters stripped.
# If the file is missing, tr fails and (via set -e) the script aborts —
# see the NOTE below about keeping assignment separate from export.
read_secret() {
  tr -d '\r\n' < "${vault_dir}/$1"
}

# Print an *optional* secret (CR/LF stripped), or the empty string when the
# file does not exist.
read_optional() {
  if [ -f "${vault_dir}/$1" ]; then
    tr -d '\r\n' < "${vault_dir}/$1"
  else
    printf ''
  fi
}

# NOTE: required secrets are assigned first and exported on a separate line.
# `export VAR="$(cmd)"` would return export's own (zero) status and mask a
# failing command substitution (ShellCheck SC2155), so under `set -e` a
# missing required secret would silently export an empty value instead of
# aborting. Plain `VAR="$(cmd)"` propagates the substitution's exit status.
TURN_STATIC_AUTH_SECRET="$(read_secret turn-secret)"
export TURN_STATIC_AUTH_SECRET
export TURN_PASSWORD="${TURN_STATIC_AUTH_SECRET}"

LIVEKIT_API_SECRET="$(read_secret livekit-primary)"
export LIVEKIT_API_SECRET
export LIVEKIT_SECRET="${LIVEKIT_API_SECRET}"

BOT_PASS="$(read_secret bot-pass)"
export BOT_PASS
# Per-mode bot passwords are optional; smart falls back to the base password
# and genius falls back to smart. Quick intentionally stays empty when unset
# (downstream treats an empty quick password as "quick mode not provisioned").
export BOT_PASS_QUICK="$(read_optional bot-quick-pass)"
export BOT_PASS_SMART="$(read_optional bot-smart-pass)"
export BOT_PASS_GENIUS="$(read_optional bot-genius-pass)"
if [ -z "${BOT_PASS_SMART}" ]; then
  export BOT_PASS_SMART="${BOT_PASS}"
fi
if [ -z "${BOT_PASS_GENIUS}" ]; then
  export BOT_PASS_GENIUS="${BOT_PASS_SMART}"
fi

SEEDER_PASS="$(read_secret seeder-pass)"
export SEEDER_PASS
CHAT_API_KEY="$(read_secret chat-matrix)"
export CHAT_API_KEY
CHAT_API_HOMEPAGE="$(read_secret chat-homepage)"
export CHAT_API_HOMEPAGE

# MAS admin secret is consumed as a file path, not a value.
export MAS_ADMIN_CLIENT_SECRET_FILE="${vault_dir}/mas-admin-secret"

PGPASSWORD="$(read_secret synapse-db-pass)"
export PGPASSWORD
MAS_DB_PASSWORD="$(read_secret mas-db-pass)"
export MAS_DB_PASSWORD
MATRIX_SHARED_SECRET="$(read_secret mas-matrix-shared)"
export MATRIX_SHARED_SECRET
KEYCLOAK_CLIENT_SECRET="$(read_secret mas-kc-secret)"
export KEYCLOAK_CLIENT_SECRET

View File

@ -0,0 +1,21 @@
# services/atlasbot/secretproviderclass.yaml
# Secrets Store CSI SecretProviderClass: fetches the Harbor pull credential
# (dockerconfigjson) from Vault KV and mirrors it into the harbor-regcred
# Kubernetes Secret consumed by image pulls and the Flux ImageRepository.
apiVersion: secrets-store.csi.x-k8s.io/v1
kind: SecretProviderClass
metadata:
  name: atlasbot-vault
  namespace: ai
spec:
  provider: vault
  parameters:
    vaultAddress: "http://vault.vault.svc.cluster.local:8200"
    roleName: "ai"  # Vault Kubernetes-auth role used by the mounting pod's SA
    objects: |
      - objectName: "harbor-pull__dockerconfigjson"
        secretPath: "kv/data/atlas/shared/harbor-pull"
        secretKey: "dockerconfigjson"
  # Sync the fetched object into a real Secret of type dockerconfigjson so
  # it can be referenced as an imagePullSecret / secretRef.
  secretObjects:
    - secretName: harbor-regcred
      type: kubernetes.io/dockerconfigjson
      data:
        - objectName: harbor-pull__dockerconfigjson
          key: .dockerconfigjson

View File

@ -0,0 +1,34 @@
# services/atlasbot/vault-sync-deployment.yaml
# Keeper deployment: its only job is to mount the atlasbot-vault
# SecretProviderClass so the secrets-store CSI driver materializes and keeps
# the synced harbor-regcred Secret alive (secretObjects are only created
# while at least one pod mounts the volume). The container just sleeps.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: atlasbot-vault-sync
  namespace: ai
spec:
  replicas: 1
  selector:
    matchLabels:
      app: atlasbot-vault-sync
  template:
    metadata:
      labels:
        app: atlasbot-vault-sync
    spec:
      # Uses the atlasbot SA, which maps to the Vault "ai" role.
      serviceAccountName: atlasbot
      containers:
        - name: sync
          image: alpine:3.20
          command: ["/bin/sh", "-c"]
          args:
            - "sleep infinity"  # idle; the CSI mount below does the work
          volumeMounts:
            - name: vault-secrets
              mountPath: /vault/secrets
              readOnly: true
      volumes:
        - name: vault-secrets
          csi:
            driver: secrets-store.csi.k8s.io
            readOnly: true
            volumeAttributes:
              secretProviderClass: atlasbot-vault

View File

@ -68,7 +68,13 @@ spec:
- name: AI_CHAT_TIMEOUT_SEC - name: AI_CHAT_TIMEOUT_SEC
value: "480" value: "480"
- name: AI_ATLASBOT_ENDPOINT - name: AI_ATLASBOT_ENDPOINT
value: http://atlasbot.comms.svc.cluster.local:8090/v1/answer value: http://atlasbot.ai.svc.cluster.local:8090/v1/answer
- name: AI_ATLASBOT_MODEL_FAST
value: qwen2.5-coder:7b-instruct-q4_0
- name: AI_ATLASBOT_MODEL_SMART
value: qwen2.5:14b-instruct
- name: AI_ATLASBOT_MODEL_GENIUS
value: qwen2.5:14b-instruct
- name: AI_ATLASBOT_TIMEOUT_SEC - name: AI_ATLASBOT_TIMEOUT_SEC
value: "30" value: "30"
- name: AI_NODE_NAME - name: AI_NODE_NAME

View File

@ -20,9 +20,9 @@ resources:
- ingress.yaml - ingress.yaml
images: images:
- name: registry.bstein.dev/bstein/bstein-dev-home-frontend - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"} newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
- name: registry.bstein.dev/bstein/bstein-dev-home-backend - name: registry.bstein.dev/bstein/bstein-dev-home-backend
newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"} newTag: 0.1.1-123 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
configMapGenerator: configMapGenerator:
- name: chat-ai-gateway - name: chat-ai-gateway
namespace: bstein-dev-home namespace: bstein-dev-home

View File

@ -13,10 +13,7 @@ resources:
- element-call-deployment.yaml - element-call-deployment.yaml
- guest-register-deployment.yaml - guest-register-deployment.yaml
- guest-register-service.yaml - guest-register-service.yaml
- atlasbot-deployment.yaml
- atlasbot-service.yaml
- wellknown.yaml - wellknown.yaml
- atlasbot-rbac.yaml
- mas-secrets-ensure-rbac.yaml - mas-secrets-ensure-rbac.yaml
- comms-secrets-ensure-rbac.yaml - comms-secrets-ensure-rbac.yaml
- mas-db-ensure-rbac.yaml - mas-db-ensure-rbac.yaml
@ -43,7 +40,6 @@ resources:
- livekit-ingress.yaml - livekit-ingress.yaml
- livekit-middlewares.yaml - livekit-middlewares.yaml
- matrix-ingress.yaml - matrix-ingress.yaml
configMapGenerator: configMapGenerator:
- name: comms-vault-env - name: comms-vault-env
files: files:
@ -60,21 +56,8 @@ configMapGenerator:
- server.py=scripts/guest-register/server.py - server.py=scripts/guest-register/server.py
options: options:
disableNameSuffixHash: true disableNameSuffixHash: true
- name: atlasbot
files:
- bot.py=scripts/atlasbot/bot.py
options:
disableNameSuffixHash: true
- name: othrys-element-host-config - name: othrys-element-host-config
files: files:
- 20-host-config.sh=scripts/element-host-config.sh - 20-host-config.sh=scripts/element-host-config.sh
options: options:
disableNameSuffixHash: true disableNameSuffixHash: true
- name: atlas-kb
files:
- INDEX.md=knowledge/INDEX.md
- atlas.json=knowledge/catalog/atlas.json
- atlas-summary.json=knowledge/catalog/atlas-summary.json
- metrics.json=knowledge/catalog/metrics.json
- runbooks.json=knowledge/catalog/runbooks.json
- atlas-http.mmd=knowledge/diagrams/atlas-http.mmd

View File

@ -7,6 +7,7 @@ metadata:
kubernetes.io/ingress.class: traefik kubernetes.io/ingress.class: traefik
traefik.ingress.kubernetes.io/router.entrypoints: websecure traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.tls: "true" traefik.ingress.kubernetes.io/router.tls: "true"
traefik.ingress.kubernetes.io/router.priority: "120"
cert-manager.io/cluster-issuer: letsencrypt cert-manager.io/cluster-issuer: letsencrypt
spec: spec:
ingressClassName: traefik ingressClassName: traefik
@ -43,6 +44,13 @@ spec:
name: matrix-authentication-service name: matrix-authentication-service
port: port:
number: 8080 number: 8080
- path: /_matrix/client/r0/login
pathType: Prefix
backend:
service:
name: matrix-authentication-service
port:
number: 8080
- path: /_matrix/client/v3/logout - path: /_matrix/client/v3/logout
pathType: Exact pathType: Exact
backend: backend:
@ -57,6 +65,41 @@ spec:
name: matrix-authentication-service name: matrix-authentication-service
port: port:
number: 8080 number: 8080
- path: /account
pathType: Prefix
backend:
service:
name: matrix-authentication-service
port:
number: 8080
- path: /authorize
pathType: Prefix
backend:
service:
name: matrix-authentication-service
port:
number: 8080
- path: /oauth2
pathType: Prefix
backend:
service:
name: matrix-authentication-service
port:
number: 8080
- path: /.well-known/openid-configuration
pathType: Exact
backend:
service:
name: matrix-authentication-service
port:
number: 8080
- path: /.well-known/oauth-authorization-server
pathType: Exact
backend:
service:
name: matrix-authentication-service
port:
number: 8080
- path: /_matrix - path: /_matrix
pathType: Prefix pathType: Prefix
backend: backend:
@ -102,6 +145,13 @@ spec:
name: matrix-authentication-service name: matrix-authentication-service
port: port:
number: 8080 number: 8080
- path: /_matrix/client/r0/login
pathType: Prefix
backend:
service:
name: matrix-authentication-service
port:
number: 8080
- path: /_matrix/client/v3/logout - path: /_matrix/client/v3/logout
pathType: Exact pathType: Exact
backend: backend:

View File

@ -1,12 +1,12 @@
# services/comms/oneoffs/comms-secrets-ensure-job.yaml # services/comms/oneoffs/comms-secrets-ensure-job.yaml
# One-off job for comms/comms-secrets-ensure-7. # One-off job for comms/comms-secrets-ensure-8.
# Purpose: comms secrets ensure 7 (see container args/env in this file). # Purpose: comms secrets ensure 8 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true. # Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously. # Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1 apiVersion: batch/v1
kind: Job kind: Job
metadata: metadata:
name: comms-secrets-ensure-7 name: comms-secrets-ensure-8
namespace: comms namespace: comms
spec: spec:
suspend: true suspend: true
@ -87,6 +87,9 @@ spec:
ensure_key "comms/synapse-redis" "redis-password" >/dev/null ensure_key "comms/synapse-redis" "redis-password" >/dev/null
ensure_key "comms/synapse-macaroon" "macaroon_secret_key" >/dev/null ensure_key "comms/synapse-macaroon" "macaroon_secret_key" >/dev/null
ensure_key "comms/atlasbot-credentials-runtime" "bot-password" >/dev/null ensure_key "comms/atlasbot-credentials-runtime" "bot-password" >/dev/null
ensure_key "comms/atlasbot-credentials-runtime" "bot-quick-password" >/dev/null
ensure_key "comms/atlasbot-credentials-runtime" "bot-smart-password" >/dev/null
ensure_key "comms/atlasbot-credentials-runtime" "bot-genius-password" >/dev/null
ensure_key "comms/atlasbot-credentials-runtime" "seeder-password" >/dev/null ensure_key "comms/atlasbot-credentials-runtime" "seeder-password" >/dev/null
SYN_PASS="$(ensure_key "comms/synapse-db" "POSTGRES_PASSWORD")" SYN_PASS="$(ensure_key "comms/synapse-db" "POSTGRES_PASSWORD")"

View File

@ -1,12 +1,12 @@
# services/comms/oneoffs/mas-local-users-ensure-job.yaml # services/comms/oneoffs/mas-local-users-ensure-job.yaml
# One-off job for comms/mas-local-users-ensure-18. # One-off job for comms/mas-local-users-ensure-19.
# Purpose: mas local users ensure 18 (see container args/env in this file). # Purpose: mas local users ensure 18 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true. # Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously. # Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1 apiVersion: batch/v1
kind: Job kind: Job
metadata: metadata:
name: mas-local-users-ensure-18 name: mas-local-users-ensure-19
namespace: comms namespace: comms
spec: spec:
suspend: true suspend: true
@ -27,6 +27,12 @@ spec:
vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime" vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-pass: | vault.hashicorp.com/agent-inject-template-bot-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}} {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-bot-quick-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-quick-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-quick-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-bot-smart-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-bot-smart-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-smart-password" }}{{- end -}}
vault.hashicorp.com/agent-inject-secret-seeder-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime" vault.hashicorp.com/agent-inject-secret-seeder-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
vault.hashicorp.com/agent-inject-template-seeder-pass: | vault.hashicorp.com/agent-inject-template-seeder-pass: |
{{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "seeder-password" }}{{- end -}} {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "seeder-password" }}{{- end -}}
@ -92,7 +98,13 @@ spec:
- name: SEEDER_USER - name: SEEDER_USER
value: othrys-seeder value: othrys-seeder
- name: BOT_USER - name: BOT_USER
value: atlasbot value: atlas-smart
- name: BOT_USER_QUICK
value: atlas-quick
- name: BOT_USER_SMART
value: atlas-smart
- name: BOT_USER_GENIUS
value: atlas-genius
command: command:
- /bin/sh - /bin/sh
- -c - -c
@ -225,11 +237,27 @@ spec:
}, },
timeout=30, timeout=30,
) )
if r.status_code == 429:
return False
if r.status_code != 200: if r.status_code != 200:
raise RuntimeError(f"login failed for {username}: {r.status_code} {r.text}") raise RuntimeError(f"login failed for {username}: {r.status_code} {r.text}")
return True
wait_for_service(MAS_ADMIN_API_BASE) wait_for_service(MAS_ADMIN_API_BASE)
token = admin_token() token = admin_token()
bot_quick = os.environ.get("BOT_USER_QUICK", "")
bot_smart = os.environ.get("BOT_USER_SMART", "")
bot_genius = os.environ.get("BOT_USER_GENIUS", "")
bot_quick_pass = os.environ.get("BOT_PASS_QUICK", "")
bot_smart_pass = os.environ.get("BOT_PASS_SMART", "")
bot_genius_pass = os.environ.get("BOT_PASS_GENIUS", "") or bot_smart_pass
ensure_user(token, os.environ["SEEDER_USER"], os.environ["SEEDER_PASS"]) ensure_user(token, os.environ["SEEDER_USER"], os.environ["SEEDER_PASS"])
ensure_user(token, os.environ["BOT_USER"], os.environ["BOT_PASS"]) ensure_user(token, os.environ["BOT_USER"], os.environ["BOT_PASS"])
if bot_quick and bot_quick_pass:
ensure_user(token, bot_quick, bot_quick_pass)
if bot_smart and bot_smart_pass:
ensure_user(token, bot_smart, bot_smart_pass)
if bot_genius and bot_genius_pass:
ensure_user(token, bot_genius, bot_genius_pass)
PY PY

View File

@ -1,15 +1,15 @@
# services/comms/oneoffs/synapse-admin-ensure-job.yaml # services/comms/oneoffs/synapse-admin-ensure-job.yaml
# One-off job for comms/synapse-admin-ensure-3. # One-off job for comms/synapse-admin-ensure-15.
# Purpose: synapse admin ensure 3 (see container args/env in this file). # Purpose: synapse admin ensure 15 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true. # Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously. # Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1 apiVersion: batch/v1
kind: Job kind: Job
metadata: metadata:
name: synapse-admin-ensure-3 name: synapse-admin-ensure-15
namespace: comms namespace: comms
spec: spec:
suspend: true suspend: false
backoffLimit: 0 backoffLimit: 0
ttlSecondsAfterFinished: 3600 ttlSecondsAfterFinished: 3600
template: template:
@ -32,7 +32,8 @@ spec:
values: ["arm64"] values: ["arm64"]
containers: containers:
- name: ensure - name: ensure
image: python:3.11-slim image: python:3.12-slim
imagePullPolicy: Always
env: env:
- name: VAULT_ADDR - name: VAULT_ADDR
value: http://vault.vault.svc.cluster.local:8200 value: http://vault.vault.svc.cluster.local:8200
@ -45,22 +46,20 @@ spec:
- -c - -c
- | - |
set -euo pipefail set -euo pipefail
pip install --no-cache-dir psycopg2-binary bcrypt python -m pip install --no-cache-dir psycopg2-binary
python - <<'PY' python - <<'PY'
import json import json
import os import os
import secrets
import string
import time
import urllib.error import urllib.error
import urllib.parse
import urllib.request import urllib.request
import bcrypt
import psycopg2 import psycopg2
VAULT_ADDR = os.environ.get("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200").rstrip("/") VAULT_ADDR = os.environ.get("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200").rstrip("/")
VAULT_ROLE = os.environ.get("VAULT_ROLE", "comms-secrets") VAULT_ROLE = os.environ.get("VAULT_ROLE", "comms-secrets")
SA_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token" SA_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token"
SYNAPSE_ADMIN_URL = os.environ.get("SYNAPSE_ADMIN_URL", "").rstrip("/")
PGHOST = "postgres-service.postgres.svc.cluster.local" PGHOST = "postgres-service.postgres.svc.cluster.local"
PGPORT = 5432 PGPORT = 5432
PGDATABASE = "synapse" PGDATABASE = "synapse"
@ -113,48 +112,15 @@ spec:
with urllib.request.urlopen(req, timeout=30) as resp: with urllib.request.urlopen(req, timeout=30) as resp:
resp.read() resp.read()
def random_password(length: int = 32) -> str:
alphabet = string.ascii_letters + string.digits
return "".join(secrets.choice(alphabet) for _ in range(length))
def ensure_admin_creds(token: str) -> dict: def ensure_admin_creds(token: str) -> dict:
data = vault_get(token, "comms/synapse-admin") data = vault_get(token, "comms/synapse-admin")
username = (data.get("username") or "").strip() or "synapse-admin" username = "othrys-seeder"
password = (data.get("password") or "").strip() if data.get("username") != username:
if not password:
password = random_password()
data["username"] = username data["username"] = username
data["password"] = password data.pop("access_token", None)
vault_put(token, "comms/synapse-admin", data) vault_put(token, "comms/synapse-admin", data)
return data return data
def ensure_user(cur, cols, user_id, password, admin):
now_ms = int(time.time() * 1000)
values = {
"name": user_id,
"password_hash": bcrypt.hashpw(password.encode(), bcrypt.gensalt()).decode(),
"creation_ts": now_ms,
}
def add_flag(name, flag):
if name not in cols:
return
if cols[name]["type"] in ("smallint", "integer"):
values[name] = int(flag)
else:
values[name] = bool(flag)
add_flag("admin", admin)
add_flag("deactivated", False)
add_flag("shadow_banned", False)
add_flag("is_guest", False)
columns = list(values.keys())
placeholders = ", ".join(["%s"] * len(columns))
updates = ", ".join([f"{col}=EXCLUDED.{col}" for col in columns if col != "name"])
query = f"INSERT INTO users ({', '.join(columns)}) VALUES ({placeholders}) ON CONFLICT (name) DO UPDATE SET {updates};"
cur.execute(query, [values[c] for c in columns])
def get_cols(cur): def get_cols(cur):
cur.execute( cur.execute(
""" """
@ -172,30 +138,40 @@ spec:
} }
return cols return cols
def ensure_access_token(cur, user_id, token_value): def admin_token_valid(token: str, user_id: str) -> bool:
cur.execute("SELECT COALESCE(MAX(id), 0) + 1 FROM access_tokens") if not token or not SYNAPSE_ADMIN_URL:
token_id = cur.fetchone()[0] return False
cur.execute( encoded = urllib.parse.quote(user_id, safe="")
""" url = f"{SYNAPSE_ADMIN_URL}/_synapse/admin/v2/users/{encoded}"
INSERT INTO access_tokens (id, user_id, token, device_id, valid_until_ms) req = urllib.request.Request(url, headers={"Authorization": f"Bearer {token}"})
VALUES (%s, %s, %s, %s, NULL) try:
ON CONFLICT (token) DO NOTHING with urllib.request.urlopen(req, timeout=30) as resp:
""", resp.read()
(token_id, user_id, token_value, "ariadne-admin"), return True
) except urllib.error.HTTPError as exc:
if exc.code == 404:
return True
if exc.code in (401, 403):
return False
raise
vault_token = vault_login() vault_token = vault_login()
admin_data = ensure_admin_creds(vault_token) admin_data = ensure_admin_creds(vault_token)
if admin_data.get("access_token"): user_id = f"@{admin_data['username']}:live.bstein.dev"
log("synapse admin token already present") existing_token = admin_data.get("access_token")
if existing_token and admin_token_valid(existing_token, user_id):
log("synapse admin token already present and valid")
raise SystemExit(0) raise SystemExit(0)
if existing_token:
log("synapse admin token invalid; rotating")
admin_data.pop("access_token", None)
vault_put(vault_token, "comms/synapse-admin", admin_data)
synapse_db = vault_get(vault_token, "comms/synapse-db") synapse_db = vault_get(vault_token, "comms/synapse-db")
pg_password = synapse_db.get("POSTGRES_PASSWORD") pg_password = synapse_db.get("POSTGRES_PASSWORD")
if not pg_password: if not pg_password:
raise RuntimeError("synapse db password missing") raise RuntimeError("synapse db password missing")
user_id = f"@{admin_data['username']}:live.bstein.dev"
conn = psycopg2.connect( conn = psycopg2.connect(
host=PGHOST, host=PGHOST,
port=PGPORT, port=PGPORT,
@ -203,17 +179,34 @@ spec:
user=PGUSER, user=PGUSER,
password=pg_password, password=pg_password,
) )
token_value = secrets.token_urlsafe(32)
try: try:
with conn: with conn:
with conn.cursor() as cur: with conn.cursor() as cur:
cols = get_cols(cur) cols = get_cols(cur)
ensure_user(cur, cols, user_id, admin_data["password"], True) if "admin" not in cols:
ensure_access_token(cur, user_id, token_value) raise RuntimeError("users.admin column missing")
cur.execute(
"UPDATE users SET admin = TRUE WHERE name = %s",
(user_id,),
)
cur.execute(
"""
SELECT token FROM access_tokens
WHERE user_id = %s AND valid_until_ms IS NULL
ORDER BY id DESC LIMIT 1
""",
(user_id,),
)
row = cur.fetchone()
if not row:
raise RuntimeError(f"no access token found for {user_id}")
token_value = row[0]
finally: finally:
conn.close() conn.close()
admin_data["access_token"] = token_value admin_data["access_token"] = token_value
vault_put(vault_token, "comms/synapse-admin", admin_data) vault_put(vault_token, "comms/synapse-admin", admin_data)
if not admin_token_valid(token_value, user_id):
raise RuntimeError("synapse admin token validation failed")
log("synapse admin token stored") log("synapse admin token stored")
PY PY

View File

@ -82,8 +82,6 @@ spec:
value: synapse value: synapse
- name: SEEDER_USER - name: SEEDER_USER
value: othrys-seeder value: othrys-seeder
- name: BOT_USER
value: atlasbot
command: command:
- /bin/sh - /bin/sh
- -c - -c
@ -141,10 +139,8 @@ spec:
cur.execute(query, [values[c] for c in columns]) cur.execute(query, [values[c] for c in columns])
seeder_user = os.environ["SEEDER_USER"] seeder_user = os.environ["SEEDER_USER"]
bot_user = os.environ["BOT_USER"]
server = "live.bstein.dev" server = "live.bstein.dev"
seeder_id = f"@{seeder_user}:{server}" seeder_id = f"@{seeder_user}:{server}"
bot_id = f"@{bot_user}:{server}"
conn = psycopg2.connect( conn = psycopg2.connect(
host=os.environ["PGHOST"], host=os.environ["PGHOST"],
@ -158,7 +154,6 @@ spec:
with conn.cursor() as cur: with conn.cursor() as cur:
cols = get_cols(cur) cols = get_cols(cur)
upsert_user(cur, cols, seeder_id, os.environ["SEEDER_PASS"], True) upsert_user(cur, cols, seeder_id, os.environ["SEEDER_PASS"], True)
upsert_user(cur, cols, bot_id, os.environ["BOT_PASS"], False)
finally: finally:
conn.close() conn.close()
PY PY

View File

@ -76,7 +76,7 @@ spec:
- name: SEEDER_USER - name: SEEDER_USER
value: othrys-seeder value: othrys-seeder
- name: BOT_USER - name: BOT_USER
value: atlasbot value: atlas-smart
command: command:
- /bin/sh - /bin/sh
- -c - -c

View File

@ -11,14 +11,21 @@ from urllib import error, parse, request
BASE = os.environ.get("MATRIX_BASE", "http://othrys-synapse-matrix-synapse:8008") BASE = os.environ.get("MATRIX_BASE", "http://othrys-synapse-matrix-synapse:8008")
AUTH_BASE = os.environ.get("AUTH_BASE", "http://matrix-authentication-service:8080") AUTH_BASE = os.environ.get("AUTH_BASE", "http://matrix-authentication-service:8080")
USER = os.environ["BOT_USER"] BOT_USER = os.environ["BOT_USER"]
PASSWORD = os.environ["BOT_PASS"] BOT_PASS = os.environ["BOT_PASS"]
BOT_USER_QUICK = os.environ.get("BOT_USER_QUICK", "").strip()
BOT_PASS_QUICK = os.environ.get("BOT_PASS_QUICK", "").strip()
BOT_USER_SMART = os.environ.get("BOT_USER_SMART", "").strip()
BOT_PASS_SMART = os.environ.get("BOT_PASS_SMART", "").strip()
BOT_USER_GENIUS = os.environ.get("BOT_USER_GENIUS", "").strip()
BOT_PASS_GENIUS = os.environ.get("BOT_PASS_GENIUS", "").strip()
ROOM_ALIAS = "#othrys:live.bstein.dev" ROOM_ALIAS = "#othrys:live.bstein.dev"
OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/") OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/")
MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5:14b-instruct") MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5:14b-instruct")
MODEL_FAST = os.environ.get("ATLASBOT_MODEL_FAST", "") MODEL_FAST = os.environ.get("ATLASBOT_MODEL_FAST", "")
MODEL_DEEP = os.environ.get("ATLASBOT_MODEL_DEEP", "") MODEL_SMART = os.environ.get("ATLASBOT_MODEL_SMART", os.environ.get("ATLASBOT_MODEL_DEEP", "")).strip()
MODEL_GENIUS = os.environ.get("ATLASBOT_MODEL_GENIUS", MODEL_SMART).strip()
FALLBACK_MODEL = os.environ.get("OLLAMA_FALLBACK_MODEL", "") FALLBACK_MODEL = os.environ.get("OLLAMA_FALLBACK_MODEL", "")
API_KEY = os.environ.get("CHAT_API_KEY", "") API_KEY = os.environ.get("CHAT_API_KEY", "")
OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480")) OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480"))
@ -31,7 +38,7 @@ VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitor
ARIADNE_STATE_URL = os.environ.get("ARIADNE_STATE_URL", "") ARIADNE_STATE_URL = os.environ.get("ARIADNE_STATE_URL", "")
ARIADNE_STATE_TOKEN = os.environ.get("ARIADNE_STATE_TOKEN", "") ARIADNE_STATE_TOKEN = os.environ.get("ARIADNE_STATE_TOKEN", "")
BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{USER},atlas") BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{BOT_USER},atlas")
SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev") SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev")
MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500")) MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500"))
@ -39,6 +46,9 @@ MAX_TOOL_CHARS = int(os.environ.get("ATLASBOT_MAX_TOOL_CHARS", "2500"))
MAX_FACTS_CHARS = int(os.environ.get("ATLASBOT_MAX_FACTS_CHARS", "8000")) MAX_FACTS_CHARS = int(os.environ.get("ATLASBOT_MAX_FACTS_CHARS", "8000"))
MAX_CONTEXT_CHARS = int(os.environ.get("ATLASBOT_MAX_CONTEXT_CHARS", "12000")) MAX_CONTEXT_CHARS = int(os.environ.get("ATLASBOT_MAX_CONTEXT_CHARS", "12000"))
THINKING_INTERVAL_SEC = int(os.environ.get("ATLASBOT_THINKING_INTERVAL_SEC", "120")) THINKING_INTERVAL_SEC = int(os.environ.get("ATLASBOT_THINKING_INTERVAL_SEC", "120"))
QUICK_TIME_BUDGET_SEC = float(os.environ.get("ATLASBOT_QUICK_TIME_BUDGET_SEC", "15"))
SMART_TIME_BUDGET_SEC = float(os.environ.get("ATLASBOT_SMART_TIME_BUDGET_SEC", "45"))
GENIUS_TIME_BUDGET_SEC = float(os.environ.get("ATLASBOT_GENIUS_TIME_BUDGET_SEC", "180"))
OLLAMA_RETRIES = int(os.environ.get("ATLASBOT_OLLAMA_RETRIES", "2")) OLLAMA_RETRIES = int(os.environ.get("ATLASBOT_OLLAMA_RETRIES", "2"))
OLLAMA_SERIALIZE = os.environ.get("ATLASBOT_OLLAMA_SERIALIZE", "true").lower() != "false" OLLAMA_SERIALIZE = os.environ.get("ATLASBOT_OLLAMA_SERIALIZE", "true").lower() != "false"
@ -380,27 +390,104 @@ def _strip_bot_mention(text: str) -> str:
return cleaned or text.strip() return cleaned or text.strip()
def _detect_mode_from_body(body: str, *, default: str = "deep") -> str: def _detect_mode_from_body(body: str, *, default: str = "smart") -> str:
lower = normalize_query(body or "") lower = normalize_query(body or "")
if "atlas_quick" in lower or "atlas-quick" in lower: if "atlas_quick" in lower or "atlas-quick" in lower:
return "fast" return "fast"
if "atlas_smart" in lower or "atlas-smart" in lower: if "atlas_smart" in lower or "atlas-smart" in lower:
return "deep" return "smart"
if "atlas_genius" in lower or "atlas-genius" in lower:
return "genius"
if lower.startswith("quick ") or lower.startswith("fast "): if lower.startswith("quick ") or lower.startswith("fast "):
return "fast" return "fast"
if lower.startswith("smart ") or lower.startswith("deep "): if lower.startswith("smart "):
return "deep" return "smart"
if lower.startswith("genius ") or lower.startswith("deep "):
return "genius"
return default return default
def _detect_mode(
content: dict[str, Any],
body: str,
*,
default: str = "smart",
account_user: str = "",
) -> str:
mode = _detect_mode_from_body(body, default=default)
mentions = content.get("m.mentions", {})
user_ids = mentions.get("user_ids", [])
if isinstance(user_ids, list):
normalized = {normalize_user_id(uid).lower() for uid in user_ids if isinstance(uid, str)}
if BOT_USER_QUICK and normalize_user_id(BOT_USER_QUICK).lower() in normalized:
return "fast"
if BOT_USER_SMART and normalize_user_id(BOT_USER_SMART).lower() in normalized:
return "smart"
if BOT_USER_GENIUS and normalize_user_id(BOT_USER_GENIUS).lower() in normalized:
return "genius"
if BOT_USER and normalize_user_id(BOT_USER).lower() in normalized:
return "smart"
if account_user and BOT_USER_QUICK and normalize_user_id(account_user) == normalize_user_id(BOT_USER_QUICK):
return "fast"
if account_user and BOT_USER_SMART and normalize_user_id(account_user) == normalize_user_id(BOT_USER_SMART):
return "smart"
if account_user and BOT_USER_GENIUS and normalize_user_id(account_user) == normalize_user_id(BOT_USER_GENIUS):
return "genius"
return mode
def _model_for_mode(mode: str) -> str: def _model_for_mode(mode: str) -> str:
if mode == "fast" and MODEL_FAST: if mode == "fast" and MODEL_FAST:
return MODEL_FAST return MODEL_FAST
if mode == "deep" and MODEL_DEEP: if mode == "smart" and MODEL_SMART:
return MODEL_DEEP return MODEL_SMART
if mode == "genius" and MODEL_GENIUS:
return MODEL_GENIUS
if mode == "deep" and MODEL_SMART:
return MODEL_SMART
return MODEL return MODEL
def _normalize_mode(mode: str) -> str:
normalized = (mode or "").strip().lower()
if normalized in {"quick", "fast"}:
return "fast"
if normalized in {"smart"}:
return "smart"
if normalized in {"genius", "deep"}:
return "genius"
return "smart"
def _mode_time_budget_sec(mode: str) -> float:
normalized = _normalize_mode(mode)
if normalized == "fast":
return max(1.0, QUICK_TIME_BUDGET_SEC)
if normalized == "smart":
return max(1.0, SMART_TIME_BUDGET_SEC)
if normalized == "genius":
return max(1.0, GENIUS_TIME_BUDGET_SEC)
return max(1.0, SMART_TIME_BUDGET_SEC)
def _mode_ollama_timeout_sec(mode: str) -> float:
normalized = _normalize_mode(mode)
budget = _mode_time_budget_sec(normalized)
if normalized == "fast":
return max(6.0, min(budget - 2.0, OLLAMA_TIMEOUT_SEC))
if normalized == "smart":
return max(12.0, min(budget - 5.0, OLLAMA_TIMEOUT_SEC))
if normalized == "genius":
return max(20.0, min(budget - 10.0, OLLAMA_TIMEOUT_SEC))
return max(12.0, min(budget - 5.0, OLLAMA_TIMEOUT_SEC))
def _mode_heartbeat_sec(mode: str) -> int:
normalized = _normalize_mode(mode)
budget = _mode_time_budget_sec(normalized)
return max(5, min(THINKING_INTERVAL_SEC, int(max(5.0, budget / 3.0))))
# Matrix HTTP helper. # Matrix HTTP helper.
def req(method: str, path: str, token: str | None = None, body=None, timeout=60, base: str | None = None): def req(method: str, path: str, token: str | None = None, body=None, timeout=60, base: str | None = None):
url = (base or BASE) + path url = (base or BASE) + path
@ -416,12 +503,12 @@ def req(method: str, path: str, token: str | None = None, body=None, timeout=60,
raw = resp.read() raw = resp.read()
return json.loads(raw.decode()) if raw else {} return json.loads(raw.decode()) if raw else {}
def login() -> str: def login(user: str, password: str) -> str:
login_user = normalize_user_id(USER) login_user = normalize_user_id(user)
payload = { payload = {
"type": "m.login.password", "type": "m.login.password",
"identifier": {"type": "m.id.user", "user": login_user}, "identifier": {"type": "m.id.user", "user": login_user},
"password": PASSWORD, "password": password,
} }
res = req("POST", "/_matrix/client/v3/login", body=payload, base=AUTH_BASE) res = req("POST", "/_matrix/client/v3/login", body=payload, base=AUTH_BASE)
return res["access_token"] return res["access_token"]
@ -2628,6 +2715,11 @@ def _append_history_context(context: str, history_lines: list[str]) -> str:
return combined return combined
def _merge_context_blocks(*blocks: str) -> str:
parts = [block.strip() for block in blocks if isinstance(block, str) and block.strip()]
return "\n\n".join(parts)
class ThoughtState: class ThoughtState:
def __init__(self, total_steps: int = 0): def __init__(self, total_steps: int = 0):
self._lock = threading.Lock() self._lock = threading.Lock()
@ -2985,6 +3077,7 @@ def _ollama_call_safe(
fallback: str, fallback: str,
system_override: str | None = None, system_override: str | None = None,
model: str | None = None, model: str | None = None,
timeout: float | None = None,
) -> str: ) -> str:
try: try:
return _ollama_call( return _ollama_call(
@ -2994,6 +3087,7 @@ def _ollama_call_safe(
use_history=False, use_history=False,
system_override=system_override, system_override=system_override,
model=model, model=model,
timeout=timeout,
) )
except Exception: except Exception:
return fallback return fallback
@ -3813,9 +3907,12 @@ def _open_ended_multi(
def _open_ended_total_steps(mode: str) -> int: def _open_ended_total_steps(mode: str) -> int:
if mode == "fast": normalized = _normalize_mode(mode)
if normalized == "fast":
return 2 return 2
return 9 if normalized == "smart":
return 3
return 4
def _fast_fact_lines( def _fast_fact_lines(
@ -4136,6 +4233,7 @@ def _open_ended_fast_single(
prompt: str, prompt: str,
*, *,
context: str, context: str,
fallback_context: str | None = None,
history_lines: list[str] | None = None, history_lines: list[str] | None = None,
state: ThoughtState | None = None, state: ThoughtState | None = None,
model: str, model: str,
@ -4143,24 +4241,26 @@ def _open_ended_fast_single(
if state: if state:
state.update("drafting", step=1, note="summarizing") state.update("drafting", step=1, note="summarizing")
working_context = _append_history_context(context, history_lines or []) if history_lines else context working_context = _append_history_context(context, history_lines or []) if history_lines else context
reply = _ollama_call( reply = _ollama_call_safe(
("atlasbot_fast", "atlasbot_fast"), ("atlasbot_fast", "atlasbot_fast"),
prompt, prompt,
context=working_context, context=working_context,
use_history=False, fallback="",
system_override=_open_ended_system(), system_override=_open_ended_system(),
model=model, model=model,
timeout=_mode_ollama_timeout_sec("fast"),
) )
if not _has_body_lines(reply): if not _has_body_lines(reply):
reply = _ollama_call( reply = _ollama_call_safe(
("atlasbot_fast", "atlasbot_fast"), ("atlasbot_fast", "atlasbot_fast"),
prompt + " Provide one clear sentence before the score lines.", prompt + " Provide one clear sentence before the score lines.",
context=working_context, context=working_context,
use_history=False, fallback="",
system_override=_open_ended_system(), system_override=_open_ended_system(),
model=model, model=model,
timeout=_mode_ollama_timeout_sec("fast"),
) )
fallback = _fallback_fact_answer(prompt, context) fallback = _fallback_fact_answer(prompt, fallback_context or context)
if fallback and (_is_quantitative_prompt(prompt) or not _has_body_lines(reply)): if fallback and (_is_quantitative_prompt(prompt) or not _has_body_lines(reply)):
reply = fallback reply = fallback
if not _has_body_lines(reply): if not _has_body_lines(reply):
@ -4177,6 +4277,7 @@ def _open_ended_fast(
fact_lines: list[str], fact_lines: list[str],
fact_meta: dict[str, dict[str, Any]], fact_meta: dict[str, dict[str, Any]],
history_lines: list[str], history_lines: list[str],
extra_context: str = "",
state: ThoughtState | None = None, state: ThoughtState | None = None,
) -> str: ) -> str:
model = _model_for_mode("fast") model = _model_for_mode("fast")
@ -4197,6 +4298,7 @@ def _open_ended_fast(
selected_pack = _fact_pack_text(selected_lines, selected_meta) selected_pack = _fact_pack_text(selected_lines, selected_meta)
if _needs_full_fact_pack(prompt) or not selected_lines: if _needs_full_fact_pack(prompt) or not selected_lines:
selected_pack = fact_pack selected_pack = fact_pack
model_context = _merge_context_blocks(selected_pack, extra_context)
if not subjective and _needs_full_fact_pack(prompt): if not subjective and _needs_full_fact_pack(prompt):
fallback = _fallback_fact_answer(prompt, fact_pack) fallback = _fallback_fact_answer(prompt, fact_pack)
if fallback: if fallback:
@ -4205,7 +4307,8 @@ def _open_ended_fast(
state.total_steps = _open_ended_total_steps("fast") state.total_steps = _open_ended_total_steps("fast")
return _open_ended_fast_single( return _open_ended_fast_single(
prompt, prompt,
context=selected_pack, context=model_context,
fallback_context=selected_pack,
history_lines=history_lines, history_lines=history_lines,
state=state, state=state,
model=model, model=model,
@ -4219,16 +4322,55 @@ def _open_ended_deep(
fact_lines: list[str], fact_lines: list[str],
fact_meta: dict[str, dict[str, Any]], fact_meta: dict[str, dict[str, Any]],
history_lines: list[str], history_lines: list[str],
mode: str,
extra_context: str = "",
state: ThoughtState | None = None, state: ThoughtState | None = None,
) -> str: ) -> str:
return _open_ended_multi( normalized = _normalize_mode(mode)
prompt, model = _model_for_mode(normalized)
fact_pack=fact_pack, subjective = _is_subjective_query(prompt)
fact_lines=fact_lines, primary_tags = _primary_tags_for_prompt(prompt)
fact_meta=fact_meta, focus_tags = _preferred_tags_for_prompt(prompt)
history_lines=history_lines, if not focus_tags and subjective:
state=state, focus_tags = set(_ALLOWED_INSIGHT_TAGS)
avoid_tags = _history_focus_tags(history_lines) if (subjective or _is_followup_query(prompt)) else set()
limit = 12 if normalized == "smart" else 18
selected_lines = _fast_fact_lines(
fact_lines,
fact_meta,
focus_tags=focus_tags,
avoid_tags=avoid_tags,
primary_tags=primary_tags,
limit=limit,
) )
selected_meta = _fact_pack_meta(selected_lines)
selected_pack = _fact_pack_text(selected_lines, selected_meta)
if _needs_full_fact_pack(prompt) or not selected_lines or normalized == "genius":
selected_pack = fact_pack
fallback = _fallback_fact_answer(prompt, selected_pack)
model_context = _merge_context_blocks(selected_pack, extra_context)
if not subjective and fallback:
if state:
state.update("done", step=_open_ended_total_steps(normalized))
return _ensure_scores(fallback)
if state:
state.update("drafting", step=1, note="synthesizing")
reply = _ollama_call_safe(
("atlasbot_deep", "atlasbot_deep"),
prompt,
context=_append_history_context(model_context, history_lines),
fallback="",
system_override=_open_ended_system(),
model=model,
timeout=_mode_ollama_timeout_sec(normalized),
)
if fallback and (_is_quantitative_prompt(prompt) or not _has_body_lines(reply)):
reply = fallback
if not _has_body_lines(reply):
reply = "I don't have enough data in the current snapshot to answer that."
if state:
state.update("done", step=_open_ended_total_steps(normalized))
return _ensure_scores(reply)
def open_ended_answer( def open_ended_answer(
@ -4240,6 +4382,7 @@ def open_ended_answer(
history_lines: list[str], history_lines: list[str],
mode: str, mode: str,
allow_tools: bool, allow_tools: bool,
context: str = "",
state: ThoughtState | None = None, state: ThoughtState | None = None,
) -> str: ) -> str:
lines = _fact_pack_lines(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads) lines = _fact_pack_lines(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads)
@ -4256,13 +4399,15 @@ def open_ended_answer(
return _ensure_scores("I don't have enough data to answer that.") return _ensure_scores("I don't have enough data to answer that.")
fact_meta = _fact_pack_meta(lines) fact_meta = _fact_pack_meta(lines)
fact_pack = _fact_pack_text(lines, fact_meta) fact_pack = _fact_pack_text(lines, fact_meta)
if mode == "fast": normalized = _normalize_mode(mode)
if normalized == "fast":
return _open_ended_fast( return _open_ended_fast(
prompt, prompt,
fact_pack=fact_pack, fact_pack=fact_pack,
fact_lines=lines, fact_lines=lines,
fact_meta=fact_meta, fact_meta=fact_meta,
history_lines=history_lines, history_lines=history_lines,
extra_context=context,
state=state, state=state,
) )
return _open_ended_deep( return _open_ended_deep(
@ -4271,6 +4416,8 @@ def open_ended_answer(
fact_lines=lines, fact_lines=lines,
fact_meta=fact_meta, fact_meta=fact_meta,
history_lines=history_lines, history_lines=history_lines,
extra_context=context,
mode=normalized,
state=state, state=state,
) )
@ -4292,6 +4439,7 @@ def _non_cluster_reply(prompt: str, *, history_lines: list[str], mode: str) -> s
use_history=False, use_history=False,
system_override=system, system_override=system,
model=model, model=model,
timeout=_mode_ollama_timeout_sec(mode),
) )
reply = re.sub(r"\bconfidence\s*:\s*(high|medium|low)\b\.?\s*", "", reply, flags=re.IGNORECASE).strip() reply = re.sub(r"\bconfidence\s*:\s*(high|medium|low)\b\.?\s*", "", reply, flags=re.IGNORECASE).strip()
return _ensure_scores(reply) return _ensure_scores(reply)
@ -4343,13 +4491,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
self._write_json(400, {"error": "missing_prompt"}) self._write_json(400, {"error": "missing_prompt"})
return return
cleaned = _strip_bot_mention(prompt) cleaned = _strip_bot_mention(prompt)
mode = str(payload.get("mode") or "deep").lower() mode = _normalize_mode(str(payload.get("mode") or "smart"))
if mode in ("quick", "fast"):
mode = "fast"
elif mode in ("smart", "deep"):
mode = "deep"
else:
mode = "deep"
snapshot = _snapshot_state() snapshot = _snapshot_state()
inventory = _snapshot_inventory(snapshot) or node_inventory_live() inventory = _snapshot_inventory(snapshot) or node_inventory_live()
workloads = _snapshot_workloads(snapshot) workloads = _snapshot_workloads(snapshot)
@ -4386,6 +4528,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
history_lines=history_lines, history_lines=history_lines,
mode=mode, mode=mode,
allow_tools=True, allow_tools=True,
context=context,
state=None, state=None,
) )
else: else:
@ -4640,6 +4783,7 @@ def _ollama_call(
use_history: bool = True, use_history: bool = True,
system_override: str | None = None, system_override: str | None = None,
model: str | None = None, model: str | None = None,
timeout: float | None = None,
) -> str: ) -> str:
system = system_override or ( system = system_override or (
"System: You are Atlas, the Titan lab assistant for Atlas/Othrys. " "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. "
@ -4673,6 +4817,7 @@ def _ollama_call(
messages.append({"role": "user", "content": prompt}) messages.append({"role": "user", "content": prompt})
model_name = model or MODEL model_name = model or MODEL
request_timeout = timeout if timeout is not None else OLLAMA_TIMEOUT_SEC
payload = {"model": model_name, "messages": messages, "stream": False} payload = {"model": model_name, "messages": messages, "stream": False}
headers = {"Content-Type": "application/json"} headers = {"Content-Type": "application/json"}
if API_KEY: if API_KEY:
@ -4683,13 +4828,13 @@ def _ollama_call(
lock.acquire() lock.acquire()
try: try:
try: try:
with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: with request.urlopen(r, timeout=request_timeout) as resp:
data = json.loads(resp.read().decode()) data = json.loads(resp.read().decode())
except error.HTTPError as exc: except error.HTTPError as exc:
if exc.code == 404 and FALLBACK_MODEL and FALLBACK_MODEL != payload["model"]: if exc.code == 404 and FALLBACK_MODEL and FALLBACK_MODEL != payload["model"]:
payload["model"] = FALLBACK_MODEL payload["model"] = FALLBACK_MODEL
r = request.Request(endpoint, data=json.dumps(payload).encode(), headers=headers) r = request.Request(endpoint, data=json.dumps(payload).encode(), headers=headers)
with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp: with request.urlopen(r, timeout=request_timeout) as resp:
data = json.loads(resp.read().decode()) data = json.loads(resp.read().decode())
else: else:
raise raise
@ -4714,6 +4859,7 @@ def ollama_reply(
fallback: str = "", fallback: str = "",
use_history: bool = True, use_history: bool = True,
model: str | None = None, model: str | None = None,
timeout: float | None = None,
) -> str: ) -> str:
last_error = None last_error = None
for attempt in range(max(1, OLLAMA_RETRIES + 1)): for attempt in range(max(1, OLLAMA_RETRIES + 1)):
@ -4724,6 +4870,7 @@ def ollama_reply(
context=context, context=context,
use_history=use_history, use_history=use_history,
model=model, model=model,
timeout=timeout,
) )
except Exception as exc: # noqa: BLE001 except Exception as exc: # noqa: BLE001
last_error = exc last_error = exc
@ -4744,11 +4891,13 @@ def ollama_reply_with_thinking(
fallback: str, fallback: str,
use_history: bool = True, use_history: bool = True,
model: str | None = None, model: str | None = None,
timeout: float | None = None,
) -> str: ) -> str:
result: dict[str, str] = {"reply": ""} result: dict[str, str] = {"reply": ""}
done = threading.Event() done = threading.Event()
def worker(): def worker():
try:
result["reply"] = ollama_reply( result["reply"] = ollama_reply(
hist_key, hist_key,
prompt, prompt,
@ -4756,7 +4905,9 @@ def ollama_reply_with_thinking(
fallback=fallback, fallback=fallback,
use_history=use_history, use_history=use_history,
model=model, model=model,
timeout=timeout,
) )
finally:
done.set() done.set()
thread = threading.Thread(target=worker, daemon=True) thread = threading.Thread(target=worker, daemon=True)
@ -4789,6 +4940,7 @@ def open_ended_with_thinking(
history_lines: list[str], history_lines: list[str],
mode: str, mode: str,
allow_tools: bool, allow_tools: bool,
context: str = "",
) -> str: ) -> str:
result: dict[str, str] = {"reply": ""} result: dict[str, str] = {"reply": ""}
done = threading.Event() done = threading.Event()
@ -4796,6 +4948,7 @@ def open_ended_with_thinking(
state = ThoughtState(total_steps=total_steps) state = ThoughtState(total_steps=total_steps)
def worker(): def worker():
try:
result["reply"] = open_ended_answer( result["reply"] = open_ended_answer(
prompt, prompt,
inventory=inventory, inventory=inventory,
@ -4804,15 +4957,17 @@ def open_ended_with_thinking(
history_lines=history_lines, history_lines=history_lines,
mode=mode, mode=mode,
allow_tools=allow_tools, allow_tools=allow_tools,
context=context,
state=state, state=state,
) )
finally:
done.set() done.set()
thread = threading.Thread(target=worker, daemon=True) thread = threading.Thread(target=worker, daemon=True)
thread.start() thread.start()
if not done.wait(2.0): if not done.wait(2.0):
send_msg(token, room, "Thinking…") send_msg(token, room, "Thinking…")
heartbeat = max(10, THINKING_INTERVAL_SEC) heartbeat = _mode_heartbeat_sec(mode)
next_heartbeat = time.monotonic() + heartbeat next_heartbeat = time.monotonic() + heartbeat
while not done.wait(max(0, next_heartbeat - time.monotonic())): while not done.wait(max(0, next_heartbeat - time.monotonic())):
send_msg(token, room, state.status_line()) send_msg(token, room, state.status_line())
@ -4820,7 +4975,7 @@ def open_ended_with_thinking(
thread.join(timeout=1) thread.join(timeout=1)
return result["reply"] or "Model backend is busy. Try again in a moment." return result["reply"] or "Model backend is busy. Try again in a moment."
def sync_loop(token: str, room_id: str): def sync_loop(token: str, room_id: str, *, account_user: str, default_mode: str):
since = None since = None
try: try:
res = req("GET", "/_matrix/client/v3/sync?timeout=0", token, timeout=10) res = req("GET", "/_matrix/client/v3/sync?timeout=0", token, timeout=10)
@ -4861,7 +5016,7 @@ def sync_loop(token: str, room_id: str):
if not body: if not body:
continue continue
sender = ev.get("sender", "") sender = ev.get("sender", "")
if sender == f"@{USER}:live.bstein.dev": if account_user and sender == normalize_user_id(account_user):
continue continue
mentioned = is_mentioned(content, body) mentioned = is_mentioned(content, body)
@ -4874,7 +5029,12 @@ def sync_loop(token: str, room_id: str):
cleaned_body = _strip_bot_mention(body) cleaned_body = _strip_bot_mention(body)
lower_body = cleaned_body.lower() lower_body = cleaned_body.lower()
mode = _detect_mode_from_body(body, default="deep" if is_dm else "deep") mode = _detect_mode(
content,
body,
default=_normalize_mode(default_mode),
account_user=account_user,
)
# Only do live cluster introspection in DMs. # Only do live cluster introspection in DMs.
allow_tools = is_dm allow_tools = is_dm
@ -4938,39 +5098,81 @@ def sync_loop(token: str, room_id: str):
snapshot=snapshot, snapshot=snapshot,
workloads=workloads, workloads=workloads,
history_lines=history[hist_key], history_lines=history[hist_key],
mode=mode if mode in ("fast", "deep") else "deep", mode=_normalize_mode(mode),
allow_tools=allow_tools, allow_tools=allow_tools,
context=context,
) )
else: else:
reply = _non_cluster_reply( reply = _non_cluster_reply(
cleaned_body, cleaned_body,
history_lines=history[hist_key], history_lines=history[hist_key],
mode=mode if mode in ("fast", "deep") else "deep", mode=_normalize_mode(mode),
) )
send_msg(token, rid, reply) send_msg(token, rid, reply)
history[hist_key].append(f"Atlas: {reply}") history[hist_key].append(f"Atlas: {reply}")
history[hist_key] = history[hist_key][-80:] history[hist_key] = history[hist_key][-80:]
def login_with_retry(): def login_with_retry(user: str, password: str):
last_err = None last_err = None
for attempt in range(10): for attempt in range(10):
try: try:
return login() return login(user, password)
except Exception as exc: # noqa: BLE001 except Exception as exc: # noqa: BLE001
last_err = exc last_err = exc
time.sleep(min(30, 2 ** attempt)) time.sleep(min(30, 2 ** attempt))
raise last_err raise last_err
def _bot_accounts() -> list[dict[str, str]]:
accounts: list[dict[str, str]] = []
def add(user: str, password: str, mode: str):
if not user or not password:
return
accounts.append({"user": user, "password": password, "mode": mode})
add(BOT_USER_SMART or BOT_USER, BOT_PASS_SMART or BOT_PASS, "smart")
if BOT_USER_QUICK and BOT_PASS_QUICK:
add(BOT_USER_QUICK, BOT_PASS_QUICK, "fast")
if BOT_USER_GENIUS and BOT_PASS_GENIUS:
add(BOT_USER_GENIUS, BOT_PASS_GENIUS, "genius")
if BOT_USER and BOT_PASS and all(acc["user"] != BOT_USER for acc in accounts):
add(BOT_USER, BOT_PASS, "smart")
seen: set[str] = set()
unique: list[dict[str, str]] = []
for acc in accounts:
uid = normalize_user_id(acc["user"]).lower()
if uid in seen:
continue
seen.add(uid)
unique.append(acc)
return unique
def main(): def main():
load_kb() load_kb()
_start_http_server() _start_http_server()
token = login_with_retry() accounts = _bot_accounts()
threads: list[threading.Thread] = []
for acc in accounts:
token = login_with_retry(acc["user"], acc["password"])
try: try:
room_id = resolve_alias(token, ROOM_ALIAS) room_id = resolve_alias(token, ROOM_ALIAS)
join_room(token, room_id) join_room(token, room_id)
except Exception: except Exception:
room_id = None room_id = None
sync_loop(token, room_id) thread = threading.Thread(
target=sync_loop,
args=(token, room_id),
kwargs={
"account_user": acc["user"],
"default_mode": acc["mode"],
},
daemon=True,
)
thread.start()
threads.append(thread)
for thread in threads:
thread.join()
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@ -7,6 +7,14 @@ read_secret() {
tr -d '\r\n' < "${vault_dir}/$1" tr -d '\r\n' < "${vault_dir}/$1"
} }
read_optional() {
if [ -f "${vault_dir}/$1" ]; then
tr -d '\r\n' < "${vault_dir}/$1"
else
printf ''
fi
}
export TURN_STATIC_AUTH_SECRET="$(read_secret turn-secret)" export TURN_STATIC_AUTH_SECRET="$(read_secret turn-secret)"
export TURN_PASSWORD="${TURN_STATIC_AUTH_SECRET}" export TURN_PASSWORD="${TURN_STATIC_AUTH_SECRET}"
@ -14,6 +22,15 @@ export LIVEKIT_API_SECRET="$(read_secret livekit-primary)"
export LIVEKIT_SECRET="${LIVEKIT_API_SECRET}" export LIVEKIT_SECRET="${LIVEKIT_API_SECRET}"
export BOT_PASS="$(read_secret bot-pass)" export BOT_PASS="$(read_secret bot-pass)"
export BOT_PASS_QUICK="$(read_optional bot-quick-pass)"
export BOT_PASS_SMART="$(read_optional bot-smart-pass)"
export BOT_PASS_GENIUS="$(read_optional bot-genius-pass)"
if [ -z "${BOT_PASS_SMART}" ]; then
export BOT_PASS_SMART="${BOT_PASS}"
fi
if [ -z "${BOT_PASS_GENIUS}" ]; then
export BOT_PASS_GENIUS="${BOT_PASS_SMART}"
fi
export SEEDER_PASS="$(read_secret seeder-pass)" export SEEDER_PASS="$(read_secret seeder-pass)"
export CHAT_API_KEY="$(read_secret chat-matrix)" export CHAT_API_KEY="$(read_secret chat-matrix)"

View File

@ -0,0 +1,164 @@
from __future__ import annotations
import importlib.util
import os
from pathlib import Path
from unittest import TestCase, mock
BOT_PATH = Path(__file__).resolve().parents[1] / "atlasbot" / "bot.py"
def load_bot_module():
env = {
"BOT_USER": "atlas-smart",
"BOT_PASS": "smart-pass",
"BOT_USER_QUICK": "atlas-quick",
"BOT_PASS_QUICK": "quick-pass",
"BOT_USER_SMART": "atlas-smart",
"BOT_PASS_SMART": "smart-pass",
"BOT_USER_GENIUS": "atlas-genius",
"BOT_PASS_GENIUS": "genius-pass",
"OLLAMA_URL": "http://ollama.invalid",
"OLLAMA_MODEL": "base-model",
"ATLASBOT_MODEL_FAST": "fast-model",
"ATLASBOT_MODEL_SMART": "smart-model",
"ATLASBOT_MODEL_GENIUS": "genius-model",
"ATLASBOT_QUICK_TIME_BUDGET_SEC": "15",
"ATLASBOT_SMART_TIME_BUDGET_SEC": "45",
"ATLASBOT_GENIUS_TIME_BUDGET_SEC": "180",
"KB_DIR": "",
"VM_URL": "http://vm.invalid",
"ARIADNE_STATE_URL": "",
"ARIADNE_STATE_TOKEN": "",
}
with mock.patch.dict(os.environ, env, clear=False):
spec = importlib.util.spec_from_file_location("atlasbot_bot", BOT_PATH)
module = importlib.util.module_from_spec(spec)
assert spec.loader is not None
spec.loader.exec_module(module)
return module
class AtlasbotModeTests(TestCase):
    """Tests for atlasbot's fast/smart/genius mode wiring.

    Each test re-imports bot.py via load_bot_module() so module-level state
    derived from environment variables is fresh and deterministic.
    """

    def setUp(self):
        # Fresh module per test; env is patched during import by the helper.
        self.bot = load_bot_module()

    def test_bot_accounts_include_genius_mode(self):
        # Every configured Matrix account must map to its reasoning mode.
        accounts = self.bot._bot_accounts()
        by_user = {account["user"]: account["mode"] for account in accounts}
        self.assertEqual(by_user["atlas-quick"], "fast")
        self.assertEqual(by_user["atlas-smart"], "smart")
        self.assertEqual(by_user["atlas-genius"], "genius")

    def test_objective_cluster_question_uses_fact_pack_without_llm(self):
        # An objective metric question must be answered from the fact pack
        # alone; _ollama_call raising AssertionError proves the LLM path is
        # never taken.
        fact_lines = [
            "hottest_cpu: longhorn-system (6.69)",
            "hottest_ram: longhorn-system (36.05 GB)",
        ]
        with (
            mock.patch.object(self.bot, "_fact_pack_lines", return_value=fact_lines),
            mock.patch.object(self.bot, "_ollama_call", side_effect=AssertionError("LLM should not be called")),
        ):
            reply = self.bot.open_ended_answer(
                "what is the hottest cpu node in titan lab currently?",
                inventory=[],
                snapshot=None,
                workloads=[],
                history_lines=[],
                mode="smart",
                allow_tools=True,
            )
        self.assertIn("longhorn-system", reply)
        self.assertIn("Confidence:", reply)

    def test_subjective_genius_answer_uses_genius_model(self):
        # Subjective questions in genius mode must route to the genius model,
        # stay within the 180s budget, and pass the injected context through.
        fact_lines = [
            "hottest_cpu: longhorn-system (6.69)",
            "worker_nodes: titan-01, titan-02, titan-03",
        ]
        captured: dict[str, object] = {}

        def fake_ollama_call(hist_key, prompt, *, context, use_history=True, system_override=None, model=None, timeout=None):
            # Record the call parameters so the assertions below can inspect
            # which model/timeout/context the bot selected.
            captured["model"] = model
            captured["timeout"] = timeout
            captured["context"] = context
            return "The worker spread stands out because Titan keeps meaningful capacity on the same cluster. Confidence: high"

        with (
            mock.patch.object(self.bot, "_fact_pack_lines", return_value=fact_lines),
            mock.patch.object(self.bot, "_ollama_call", side_effect=fake_ollama_call),
        ):
            reply = self.bot.open_ended_answer(
                "what stands out about titan lab?",
                inventory=[],
                snapshot=None,
                workloads=[],
                history_lines=[],
                mode="genius",
                allow_tools=True,
                context='Cluster snapshot (JSON): {"injected":true}',
            )
        self.assertIn("The worker spread stands out", reply)
        self.assertEqual(captured["model"], "genius-model")
        self.assertLessEqual(float(captured["timeout"]), 180.0)
        self.assertIn('Cluster snapshot (JSON): {"injected":true}', str(captured["context"]))

    def test_mode_timeouts_stay_within_budgets(self):
        # All three modes must use their configured model and respect the
        # per-mode time budgets patched in by load_bot_module().
        fact_lines = [
            "hottest_cpu: longhorn-system (6.69)",
            "worker_nodes: titan-01, titan-02, titan-03",
        ]
        seen: list[tuple[str, float]] = []

        def fake_ollama_call(hist_key, prompt, *, context, use_history=True, system_override=None, model=None, timeout=None):
            # Collect (model, timeout) per invocation, in call order.
            seen.append((str(model), float(timeout or 0)))
            return "Atlas has a clear standout because the worker spread is healthy. Confidence: high"

        with (
            mock.patch.object(self.bot, "_fact_pack_lines", return_value=fact_lines),
            mock.patch.object(self.bot, "_ollama_call", side_effect=fake_ollama_call),
        ):
            for mode in ("fast", "smart", "genius"):
                reply = self.bot.open_ended_answer(
                    "what stands out about titan lab?",
                    inventory=[],
                    snapshot=None,
                    workloads=[],
                    history_lines=[],
                    mode=mode,
                    allow_tools=True,
                )
                self.assertIn("Confidence:", reply)
        self.assertEqual([model for model, _ in seen], ["fast-model", "smart-model", "genius-model"])
        self.assertLessEqual(seen[0][1], 15.0)
        self.assertLessEqual(seen[1][1], 45.0)
        self.assertLessEqual(seen[2][1], 180.0)

    def test_llm_timeout_still_returns_a_conclusion(self):
        # Even when the LLM call times out, the bot must fall back to a
        # fact-based answer that still carries a confidence marker.
        fact_lines = [
            "worker_nodes: titan-01, titan-02, titan-03",
            "hottest_cpu: longhorn-system (6.69)",
        ]
        with (
            mock.patch.object(self.bot, "_fact_pack_lines", return_value=fact_lines),
            mock.patch.object(self.bot, "_ollama_call", side_effect=TimeoutError("simulated timeout")),
        ):
            reply = self.bot.open_ended_answer(
                "what stands out about the worker nodes?",
                inventory=[],
                snapshot=None,
                workloads=[],
                history_lines=[],
                mode="genius",
                allow_tools=True,
            )
        self.assertIn("worker nodes", reply.lower())
        self.assertIn("Confidence:", reply)

View File

@ -66,7 +66,7 @@ spec:
- name: SEEDER_USER - name: SEEDER_USER
value: othrys-seeder value: othrys-seeder
- name: BOT_USER - name: BOT_USER
value: atlasbot value: atlas-smart
command: command:
- /bin/sh - /bin/sh
- -c - -c

View File

@ -29,12 +29,18 @@ spec:
operator: In operator: In
values: ["rpi4","rpi5"] values: ["rpi4","rpi5"]
preferredDuringSchedulingIgnoredDuringExecution: preferredDuringSchedulingIgnoredDuringExecution:
- weight: 50 - weight: 80
preference: preference:
matchExpressions: matchExpressions:
- key: hardware - key: hardware
operator: In operator: In
values: ["rpi4"] values: ["rpi5"]
- weight: 60
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values: ["titan-12","titan-13","titan-15","titan-17","titan-19"]
containers: containers:
- name: monerod - name: monerod
image: registry.bstein.dev/crypto/monerod:0.18.4.1 image: registry.bstein.dev/crypto/monerod:0.18.4.1

View File

@ -23,7 +23,7 @@ spec:
- matchExpressions: - matchExpressions:
- key: hardware - key: hardware
operator: In operator: In
values: ["rpi4","rpi5"] values: ["rpi5"]
containers: containers:
- name: xmrig - name: xmrig
image: ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9 image: ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9

View File

@ -123,13 +123,22 @@ spec:
- key: hardware - key: hardware
operator: In operator: In
values: ["rpi4","rpi5"] values: ["rpi4","rpi5"]
- key: longhorn
operator: NotIn
values: ["true"]
preferredDuringSchedulingIgnoredDuringExecution: preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/hostname
operator: NotIn
values: ["titan-13","titan-15","titan-17","titan-19"]
- weight: 50 - weight: 50
preference: preference:
matchExpressions: matchExpressions:
- key: hardware - key: hardware
operator: In operator: In
values: ["rpi4"] values: ["rpi5"]
containers: containers:
- name: gitea - name: gitea
image: gitea/gitea:1.23 image: gitea/gitea:1.23

View File

@ -245,6 +245,17 @@ spec:
image: image:
repository: registry.bstein.dev/infra/harbor-registry repository: registry.bstein.dev/infra/harbor-registry
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-registry:tag"} tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-registry:tag"}
extraEnvVars:
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_NAME
value: harbor-core
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_URL
value: http://harbor-registry:8080/service/notifications
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_TIMEOUT
value: 5s
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_THRESHOLD
value: "5"
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_BACKOFF
value: 1s
controller: controller:
image: image:
repository: registry.bstein.dev/infra/harbor-registryctl repository: registry.bstein.dev/infra/harbor-registryctl
@ -263,6 +274,10 @@ spec:
export REGISTRY_HTTP_SECRET="{{ .Data.data.REGISTRY_HTTP_SECRET }}" export REGISTRY_HTTP_SECRET="{{ .Data.data.REGISTRY_HTTP_SECRET }}"
export REGISTRY_REDIS_PASSWORD="{{ .Data.data.REGISTRY_REDIS_PASSWORD }}" export REGISTRY_REDIS_PASSWORD="{{ .Data.data.REGISTRY_REDIS_PASSWORD }}"
{{ end }} {{ end }}
{{ with secret "kv/data/atlas/harbor/harbor-jobservice" }}
export JOBSERVICE_SECRET="{{ .Data.data.JOBSERVICE_SECRET }}"
export REGISTRY_NOTIFICATIONS_ENDPOINTS_0_HEADERS_Authorization="Harbor-Secret ${JOBSERVICE_SECRET}"
{{ end }}
vault.hashicorp.com/agent-inject-secret-harbor-registryctl-env.sh: "kv/data/atlas/harbor/harbor-registry" vault.hashicorp.com/agent-inject-secret-harbor-registryctl-env.sh: "kv/data/atlas/harbor/harbor-registry"
vault.hashicorp.com/agent-inject-template-harbor-registryctl-env.sh: | vault.hashicorp.com/agent-inject-template-harbor-registryctl-env.sh: |
{{ with secret "kv/data/atlas/harbor/harbor-core" }} {{ with secret "kv/data/atlas/harbor/harbor-core" }}
@ -397,10 +412,10 @@ spec:
patch: |- patch: |-
- op: replace - op: replace
path: /spec/rules/0/http/paths/2/backend/service/name path: /spec/rules/0/http/paths/2/backend/service/name
value: harbor-registry value: harbor-core
- op: replace - op: replace
path: /spec/rules/0/http/paths/2/backend/service/port/number path: /spec/rules/0/http/paths/2/backend/service/port/number
value: 5000 value: 80
- target: - target:
kind: Deployment kind: Deployment
name: harbor-jobservice name: harbor-jobservice
@ -422,8 +437,7 @@ spec:
- $patch: replace - $patch: replace
- name: VAULT_ENV_FILE - name: VAULT_ENV_FILE
value: /vault/secrets/harbor-jobservice-env.sh value: /vault/secrets/harbor-jobservice-env.sh
envFrom: envFrom: []
- $patch: replace
- configMapRef: - configMapRef:
name: harbor-jobservice-env name: harbor-jobservice-env
volumeMounts: volumeMounts:
@ -464,6 +478,16 @@ spec:
value: /vault/secrets/harbor-registry-env.sh value: /vault/secrets/harbor-registry-env.sh
- name: VAULT_COPY_FILES - name: VAULT_COPY_FILES
value: /vault/secrets/harbor-registry-htpasswd:/etc/registry/passwd value: /vault/secrets/harbor-registry-htpasswd:/etc/registry/passwd
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_NAME
value: harbor-core
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_URL
value: http://harbor-registry:8080/service/notifications
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_TIMEOUT
value: 5s
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_THRESHOLD
value: "5"
- name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_BACKOFF
value: 1s
envFrom: envFrom:
- $patch: replace - $patch: replace
volumeMounts: volumeMounts:

View File

@ -67,7 +67,7 @@ data:
url('https://scm.bstein.dev/bstein/harbor-arm-build.git') url('https://scm.bstein.dev/bstein/harbor-arm-build.git')
credentials('gitea-pat') credentials('gitea-pat')
} }
branches('*/master') branches('*/main')
} }
} }
} }
@ -108,7 +108,7 @@ data:
url('https://scm.bstein.dev/bstein/ci-demo.git') url('https://scm.bstein.dev/bstein/ci-demo.git')
credentials('gitea-pat') credentials('gitea-pat')
} }
branches('*/master') branches('*/main')
} }
} }
scriptPath('Jenkinsfile') scriptPath('Jenkinsfile')
@ -167,6 +167,110 @@ data:
} }
} }
} }
// CI pipeline for the metis repository.
// FIX: this block previously declared pipelineJob('metis') twice (once with
// an H/2 poll spec, once with H/5). The Job DSL plugin refuses to generate
// the same job name twice in one seed run, so the duplicate is collapsed
// into a single definition keeping the tighter H/2 polling cadence.
pipelineJob('metis') {
  properties {
    pipelineTriggers {
      triggers {
        // Poll SCM roughly every 2 minutes; 'H' spreads load across jobs.
        scmTrigger {
          scmpoll_spec('H/2 * * * *')
          ignorePostCommitHooks(false)
        }
      }
    }
  }
  definition {
    cpsScm {
      scm {
        git {
          remote {
            url('https://scm.bstein.dev/bstein/metis.git')
            credentials('gitea-pat')
          }
          // NOTE(review): sibling jobs (atlasbot, Soteria) track */main —
          // confirm whether the metis repo still uses a master branch.
          branches('*/master')
        }
      }
      scriptPath('Jenkinsfile')
    }
  }
}
// CI pipeline for the atlasbot repository.
pipelineJob('atlasbot') {
  properties {
    pipelineTriggers {
      triggers {
        // Poll SCM roughly every 2 minutes; 'H' spreads load across jobs.
        scmTrigger {
          scmpoll_spec('H/2 * * * *')
          ignorePostCommitHooks(false)
        }
      }
    }
  }
  definition {
    cpsScm {
      scm {
        git {
          remote {
            url('https://scm.bstein.dev/bstein/atlasbot.git')
            credentials('gitea-pat')
          }
          branches('*/main')
        }
      }
      // Pipeline definition lives in the repo root Jenkinsfile.
      scriptPath('Jenkinsfile')
    }
  }
}
// CI pipeline for the soteria repository.
// NOTE(review): job name 'Soteria' is capitalized unlike the other jobs
// (metis, atlasbot, data-prepper); renaming would drop build history, so
// the inconsistency is only flagged here.
pipelineJob('Soteria') {
  properties {
    pipelineTriggers {
      triggers {
        // Poll SCM roughly every 5 minutes.
        scmTrigger {
          scmpoll_spec('H/5 * * * *')
          ignorePostCommitHooks(false)
        }
      }
    }
  }
  definition {
    cpsScm {
      scm {
        git {
          remote {
            url('https://scm.bstein.dev/bstein/soteria.git')
            credentials('gitea-pat')
          }
          branches('*/main')
        }
      }
      // Pipeline definition lives in the repo root Jenkinsfile.
      scriptPath('Jenkinsfile')
    }
  }
}
pipelineJob('data-prepper') { pipelineJob('data-prepper') {
properties { properties {
pipelineTriggers { pipelineTriggers {

View File

@ -48,7 +48,7 @@ spec:
TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }} TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }}
GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME={{ .Data.data.git_notify_bstein_dev_home }} GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME={{ .Data.data.git_notify_bstein_dev_home }}
{{ end }} {{ end }}
bstein.dev/restarted-at: "2026-01-20T14:52:41Z" bstein.dev/restarted-at: "2026-02-02T15:10:33Z"
spec: spec:
serviceAccountName: jenkins serviceAccountName: jenkins
nodeSelector: nodeSelector:

View File

@ -0,0 +1,13 @@
# services/jenkins/dind-pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: jenkins-dind-cache
namespace: jenkins
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 30Gi
storageClassName: astreae

View File

@ -8,6 +8,7 @@ resources:
- vault-serviceaccount.yaml - vault-serviceaccount.yaml
- pvc.yaml - pvc.yaml
- cache-pvc.yaml - cache-pvc.yaml
- dind-pvc.yaml
- plugins-pvc.yaml - plugins-pvc.yaml
- configmap-jcasc.yaml - configmap-jcasc.yaml
- configmap-plugins.yaml - configmap-plugins.yaml

View File

@ -1,12 +1,12 @@
# services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml # services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml
# One-off job for sso/keycloak-portal-e2e-execute-actions-email-14. # One-off job for sso/keycloak-portal-e2e-execute-actions-email-18.
# Purpose: keycloak portal e2e execute actions email 14 (see container args/env in this file). # Purpose: keycloak portal e2e execute actions email 18 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true. # Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously. # Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1 apiVersion: batch/v1
kind: Job kind: Job
metadata: metadata:
name: keycloak-portal-e2e-execute-actions-email-14 name: keycloak-portal-e2e-execute-actions-email-18
namespace: sso namespace: sso
spec: spec:
suspend: true suspend: true
@ -70,7 +70,7 @@ spec:
- name: E2E_PROBE_USERNAME - name: E2E_PROBE_USERNAME
value: robotuser value: robotuser
- name: E2E_PROBE_EMAIL - name: E2E_PROBE_EMAIL
value: robotuser@bstein.dev value: brad.stein+robot@gmail.com
- name: EXECUTE_ACTIONS_CLIENT_ID - name: EXECUTE_ACTIONS_CLIENT_ID
value: bstein-dev-home value: bstein-dev-home
- name: EXECUTE_ACTIONS_REDIRECT_URI - name: EXECUTE_ACTIONS_REDIRECT_URI

View File

@ -1,12 +1,12 @@
# services/keycloak/oneoffs/realm-settings-job.yaml # services/keycloak/oneoffs/realm-settings-job.yaml
# One-off job for sso/keycloak-realm-settings-36. # One-off job for sso/keycloak-realm-settings-38.
# Purpose: keycloak realm settings 36 (see container args/env in this file). # Purpose: keycloak realm settings 38 (see container args/env in this file).
# Run by setting spec.suspend to false, reconcile, then set it back to true. # Run by setting spec.suspend to false, reconcile, then set it back to true.
# Safe to delete the finished Job/pod; it should not run continuously. # Safe to delete the finished Job/pod; it should not run continuously.
apiVersion: batch/v1 apiVersion: batch/v1
kind: Job kind: Job
metadata: metadata:
name: keycloak-realm-settings-36 name: keycloak-realm-settings-38
namespace: sso namespace: sso
spec: spec:
suspend: true suspend: true
@ -64,7 +64,7 @@ spec:
- name: KEYCLOAK_REALM - name: KEYCLOAK_REALM
value: atlas value: atlas
- name: KEYCLOAK_SMTP_HOST - name: KEYCLOAK_SMTP_HOST
value: mail.bstein.dev value: smtp.postmarkapp.com
- name: KEYCLOAK_SMTP_PORT - name: KEYCLOAK_SMTP_PORT
value: "587" value: "587"
- name: KEYCLOAK_SMTP_FROM - name: KEYCLOAK_SMTP_FROM

View File

@ -18,6 +18,7 @@ spec:
prometheus.io/scrape: "true" prometheus.io/scrape: "true"
prometheus.io/port: "8080" prometheus.io/port: "8080"
prometheus.io/path: "/metrics" prometheus.io/path: "/metrics"
maintenance.bstein.dev/restart-rev: "20260207-2"
vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "maintenance" vault.hashicorp.com/role: "maintenance"
vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db" vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db"
@ -105,7 +106,7 @@ spec:
node-role.kubernetes.io/worker: "true" node-role.kubernetes.io/worker: "true"
containers: containers:
- name: ariadne - name: ariadne
image: registry.bstein.dev/bstein/ariadne:0.1.0-0 image: registry.bstein.dev/bstein/ariadne:latest
imagePullPolicy: Always imagePullPolicy: Always
command: ["/bin/sh", "-c"] command: ["/bin/sh", "-c"]
args: args:
@ -285,7 +286,7 @@ spec:
- name: ARIADNE_SCHEDULE_MAILU_SYNC - name: ARIADNE_SCHEDULE_MAILU_SYNC
value: "30 4 * * *" value: "30 4 * * *"
- name: ARIADNE_SCHEDULE_NEXTCLOUD_SYNC - name: ARIADNE_SCHEDULE_NEXTCLOUD_SYNC
value: "0 5 * * *" value: "*/15 * * * *"
- name: ARIADNE_SCHEDULE_NEXTCLOUD_CRON - name: ARIADNE_SCHEDULE_NEXTCLOUD_CRON
value: "*/5 * * * *" value: "*/5 * * * *"
- name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE - name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE
@ -293,23 +294,23 @@ spec:
- name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC - name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC
value: "0 * * * *" value: "0 * * * *"
- name: ARIADNE_SCHEDULE_WGER_USER_SYNC - name: ARIADNE_SCHEDULE_WGER_USER_SYNC
value: "0 5 * * *" value: "*/15 * * * *"
- name: ARIADNE_SCHEDULE_WGER_ADMIN - name: ARIADNE_SCHEDULE_WGER_ADMIN
value: "15 3 * * *" value: "15 3 * * *"
- name: ARIADNE_SCHEDULE_FIREFLY_USER_SYNC - name: ARIADNE_SCHEDULE_FIREFLY_USER_SYNC
value: "0 6 * * *" value: "*/15 * * * *"
- name: ARIADNE_SCHEDULE_FIREFLY_CRON - name: ARIADNE_SCHEDULE_FIREFLY_CRON
value: "0 3 * * *" value: "0 3 * * *"
- name: ARIADNE_SCHEDULE_POD_CLEANER - name: ARIADNE_SCHEDULE_POD_CLEANER
value: "0 * * * *" value: "*/30 * * * *"
- name: ARIADNE_SCHEDULE_OPENSEARCH_PRUNE - name: ARIADNE_SCHEDULE_OPENSEARCH_PRUNE
value: "23 3 * * *" value: "23 3 * * *"
- name: ARIADNE_SCHEDULE_IMAGE_SWEEPER - name: ARIADNE_SCHEDULE_IMAGE_SWEEPER
value: "30 4 * * 0" value: "0 */4 * * *"
- name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH - name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH
value: "0 * * * *" value: "*/15 * * * *"
- name: ARIADNE_SCHEDULE_VAULT_OIDC - name: ARIADNE_SCHEDULE_VAULT_OIDC
value: "0 * * * *" value: "*/15 * * * *"
- name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME - name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME
value: "*/5 * * * *" value: "*/5 * * * *"
- name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE - name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE
@ -319,9 +320,9 @@ spec:
- name: ARIADNE_SCHEDULE_COMMS_SEED_ROOM - name: ARIADNE_SCHEDULE_COMMS_SEED_ROOM
value: "*/10 * * * *" value: "*/10 * * * *"
- name: ARIADNE_SCHEDULE_CLUSTER_STATE - name: ARIADNE_SCHEDULE_CLUSTER_STATE
value: "*/15 * * * *" value: "*/10 * * * *"
- name: ARIADNE_CLUSTER_STATE_KEEP - name: ARIADNE_CLUSTER_STATE_KEEP
value: "168" value: "720"
- name: WELCOME_EMAIL_ENABLED - name: WELCOME_EMAIL_ENABLED
value: "true" value: "true"
- name: K8S_API_TIMEOUT_SEC - name: K8S_API_TIMEOUT_SEC
@ -330,12 +331,20 @@ spec:
value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428 value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428
- name: ARIADNE_CLUSTER_STATE_VM_TIMEOUT_SEC - name: ARIADNE_CLUSTER_STATE_VM_TIMEOUT_SEC
value: "5" value: "5"
- name: ARIADNE_ALERTMANAGER_URL
value: http://alertmanager.monitoring.svc.cluster.local
- name: OPENSEARCH_URL - name: OPENSEARCH_URL
value: http://opensearch-master.logging.svc.cluster.local:9200 value: http://opensearch-master.logging.svc.cluster.local:9200
- name: OPENSEARCH_LIMIT_BYTES - name: OPENSEARCH_LIMIT_BYTES
value: "1099511627776" value: "1099511627776"
- name: OPENSEARCH_INDEX_PATTERNS - name: OPENSEARCH_INDEX_PATTERNS
value: kube-*,journald-*,trace-analytics-* value: kube-*,journald-*,trace-analytics-*
- name: METIS_BASE_URL
value: http://metis.maintenance.svc.cluster.local
- name: METIS_TIMEOUT_SEC
value: "15"
- name: ARIADNE_SCHEDULE_METIS_SENTINEL_WATCH
value: "*/30 * * * *"
- name: METRICS_PATH - name: METRICS_PATH
value: "/metrics" value: "/metrics"
resources: resources:

View File

@ -29,6 +29,29 @@ rules:
- get - get
- list - list
- watch - watch
- apiGroups: ["apps"]
resources:
- deployments
- statefulsets
- daemonsets
verbs:
- get
- list
- watch
- apiGroups: ["longhorn.io"]
resources:
- volumes
verbs:
- get
- list
- watch
- apiGroups: [""]
resources:
- events
verbs:
- get
- list
- watch
- apiGroups: [""] - apiGroups: [""]
resources: resources:
- pods/exec - pods/exec
@ -56,3 +79,17 @@ roleRef:
apiGroup: rbac.authorization.k8s.io apiGroup: rbac.authorization.k8s.io
kind: ClusterRole kind: ClusterRole
name: ariadne-job-spawner name: ariadne-job-spawner
---
# Bind ariadne to the built-in system:auth-delegator ClusterRole.
# NOTE(review): auth-delegator conventionally enables TokenReview /
# SubjectAccessReview delegation for services that authenticate callers —
# confirm which ariadne endpoint (e.g. the state API) requires this.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: ariadne-auth-delegator
subjects:
  - kind: ServiceAccount
    name: ariadne
    namespace: maintenance
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: system:auth-delegator

View File

@ -21,3 +21,72 @@ spec:
policy: policy:
semver: semver:
range: ">=0.1.0-0" range: ">=0.1.0-0"
---
# Flux image automation: scan the Harbor registry for new metis tags.
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
  name: metis
  namespace: maintenance
spec:
  image: registry.bstein.dev/bstein/metis
  interval: 1m0s
  secretRef:
    name: harbor-regcred
---
# Select the newest semver tag, including 0.x pre-releases (e.g. 0.1.0-12).
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
  name: metis
  namespace: maintenance
spec:
  imageRepositoryRef:
    name: metis
  policy:
    semver:
      range: ">=0.1.0-0"
---
# Same pattern for the per-node sentinel image.
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
  name: metis-sentinel
  namespace: maintenance
spec:
  image: registry.bstein.dev/bstein/metis-sentinel
  interval: 1m0s
  secretRef:
    name: harbor-regcred
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
  name: metis-sentinel
  namespace: maintenance
spec:
  imageRepositoryRef:
    name: metis-sentinel
  policy:
    semver:
      range: ">=0.1.0-0"
---
# Same pattern for soteria.
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImageRepository
metadata:
  name: soteria
  namespace: maintenance
spec:
  image: registry.bstein.dev/bstein/soteria
  interval: 1m0s
  secretRef:
    name: harbor-regcred
---
apiVersion: image.toolkit.fluxcd.io/v1beta2
kind: ImagePolicy
metadata:
  name: soteria
  namespace: maintenance
spec:
  imageRepositoryRef:
    name: soteria
  policy:
    semver:
      range: ">=0.1.0-0"

View File

@ -5,28 +5,50 @@ resources:
- namespace.yaml - namespace.yaml
- image.yaml - image.yaml
- secretproviderclass.yaml - secretproviderclass.yaml
- soteria-configmap.yaml
- metis-configmap.yaml
- metis-data-pvc.yaml
- vault-serviceaccount.yaml - vault-serviceaccount.yaml
- vault-sync-deployment.yaml - vault-sync-deployment.yaml
- ariadne-serviceaccount.yaml - ariadne-serviceaccount.yaml
- ariadne-rbac.yaml - ariadne-rbac.yaml
- disable-k3s-traefik-serviceaccount.yaml - disable-k3s-traefik-serviceaccount.yaml
- k3s-traefik-cleanup-rbac.yaml - k3s-traefik-cleanup-rbac.yaml
- metis-serviceaccount.yaml
- metis-rbac.yaml
- metis-token-sync-serviceaccount.yaml
- metis-token-sync-rbac.yaml
- node-nofile-serviceaccount.yaml - node-nofile-serviceaccount.yaml
- pod-cleaner-rbac.yaml - pod-cleaner-rbac.yaml
- soteria-serviceaccount.yaml
- soteria-rbac.yaml
- ariadne-deployment.yaml - ariadne-deployment.yaml
- metis-deployment.yaml
- oneoffs/ariadne-migrate-job.yaml - oneoffs/ariadne-migrate-job.yaml
- ariadne-service.yaml - ariadne-service.yaml
- soteria-deployment.yaml
- disable-k3s-traefik-daemonset.yaml - disable-k3s-traefik-daemonset.yaml
- oneoffs/k3s-traefik-cleanup-job.yaml - oneoffs/k3s-traefik-cleanup-job.yaml
- node-nofile-daemonset.yaml - node-nofile-daemonset.yaml
- metis-sentinel-daemonset.yaml
- metis-k3s-token-sync-cronjob.yaml
- k3s-agent-restart-daemonset.yaml - k3s-agent-restart-daemonset.yaml
- pod-cleaner-cronjob.yaml - pod-cleaner-cronjob.yaml
- node-image-sweeper-serviceaccount.yaml - node-image-sweeper-serviceaccount.yaml
- node-image-sweeper-daemonset.yaml - node-image-sweeper-daemonset.yaml
- image-sweeper-cronjob.yaml - image-sweeper-cronjob.yaml
- metis-service.yaml
- metis-ingress.yaml
- soteria-service.yaml
images: images:
- name: registry.bstein.dev/bstein/ariadne - name: registry.bstein.dev/bstein/ariadne
newTag: 0.1.0-59 # {"$imagepolicy": "maintenance:ariadne:tag"} newTag: 0.1.0-22 # {"$imagepolicy": "maintenance:ariadne:tag"}
- name: registry.bstein.dev/bstein/metis
newTag: 0.1.0-0 # {"$imagepolicy": "maintenance:metis:tag"}
- name: registry.bstein.dev/bstein/metis-sentinel
newTag: 0.1.0-0 # {"$imagepolicy": "maintenance:metis-sentinel:tag"}
- name: registry.bstein.dev/bstein/soteria
newTag: 0.1.0-11 # {"$imagepolicy": "maintenance:soteria:tag"}
configMapGenerator: configMapGenerator:
- name: disable-k3s-traefik-script - name: disable-k3s-traefik-script
namespace: maintenance namespace: maintenance

View File

@ -0,0 +1,20 @@
# services/maintenance/metis-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: metis
namespace: maintenance
data:
METIS_BIND_ADDR: :8080
METIS_INVENTORY_PATH: /app/inventory.titan-rpi4.yaml
METIS_DATA_DIR: /var/lib/metis
METIS_DEFAULT_FLASH_HOST: titan-22
METIS_FLASH_HOSTS: titan-22
METIS_LOCAL_HOST: titan-22
METIS_ALLOWED_GROUPS: admin,maintainer
METIS_MAX_DEVICE_BYTES: "300000000000"
METIS_SENTINEL_PUSH_URL: http://metis.maintenance.svc.cluster.local/internal/sentinel/snapshot
METIS_SENTINEL_INTERVAL_SEC: "1800"
METIS_SENTINEL_NSENTER: "1"
METIS_IMAGE_RPI4_ARMBIAN_LONGHORN: https://armbian.chi.auroradev.org/dl/rpi4b/archive/Armbian_26.2.1_Rpi4b_noble_current_6.18.9_minimal.img.xz
METIS_IMAGE_RPI4_ARMBIAN_LONGHORN_SHA256: sha256:c450687adf4cc6a59725c43aefd58baf42ec71bdd379227d403cdde281768e46

View File

@ -0,0 +1,13 @@
# services/maintenance/metis-data-pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: metis-data
namespace: maintenance
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 40Gi
storageClassName: local-path

View File

@ -0,0 +1,47 @@
# services/maintenance/metis-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: metis
namespace: maintenance
spec:
replicas: 1
revisionHistoryLimit: 3
selector:
matchLabels:
app: metis
template:
metadata:
labels:
app: metis
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
spec:
serviceAccountName: metis
nodeSelector:
kubernetes.io/hostname: titan-22
kubernetes.io/arch: amd64
node-role.kubernetes.io/worker: "true"
containers:
- name: metis
image: registry.bstein.dev/bstein/metis:latest
imagePullPolicy: Always
envFrom:
- configMapRef:
name: metis
ports:
- name: http
containerPort: 8080
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]

View File

@ -0,0 +1,27 @@
# services/maintenance/metis-ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: metis
namespace: maintenance
annotations:
kubernetes.io/ingress.class: traefik
cert-manager.io/cluster-issuer: letsencrypt
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.tls: "true"
traefik.ingress.kubernetes.io/router.middlewares: sso-oauth2-proxy-forward-auth@kubernetescrd
spec:
tls:
- hosts: ["metis.bstein.dev"]
secretName: metis-tls
rules:
- host: metis.bstein.dev
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: metis
port:
number: 80

View File

@ -0,0 +1,51 @@
# services/maintenance/metis-k3s-token-sync-cronjob.yaml
apiVersion: batch/v1
kind: CronJob
metadata:
name: metis-k3s-token-sync
namespace: maintenance
spec:
schedule: "11 */6 * * *"
concurrencyPolicy: Forbid
successfulJobsHistoryLimit: 1
failedJobsHistoryLimit: 2
jobTemplate:
spec:
template:
spec:
serviceAccountName: metis-token-sync
restartPolicy: OnFailure
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/control-plane: "true"
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
- key: node-role.kubernetes.io/master
operator: Exists
effect: NoSchedule
containers:
- name: sync
image: registry.bstein.dev/bstein/kubectl:1.35.0
imagePullPolicy: IfNotPresent
command:
- /bin/sh
- -c
args:
- |
set -euo pipefail
token="$(tr -d '\n' < /host/var/lib/rancher/k3s/server/node-token)"
kubectl -n maintenance create secret generic metis-runtime \
--from-literal=k3s_token="${token}" \
--dry-run=client -o yaml | kubectl apply -f -
securityContext:
runAsUser: 0
volumeMounts:
- name: k3s-server
mountPath: /host/var/lib/rancher/k3s/server
readOnly: true
volumes:
- name: k3s-server
hostPath:
path: /var/lib/rancher/k3s/server

View File

@ -0,0 +1,27 @@
# services/maintenance/metis-rbac.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: metis-node-manager
rules:
- apiGroups: [""]
resources:
- nodes
verbs:
- get
- list
- watch
- delete
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: metis-node-manager
subjects:
- kind: ServiceAccount
name: metis
namespace: maintenance
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: metis-node-manager

View File

@ -0,0 +1,133 @@
# services/maintenance/metis-sentinel-daemonset.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: metis-sentinel
namespace: maintenance
spec:
selector:
matchLabels:
app: metis-sentinel
updateStrategy:
type: RollingUpdate
template:
metadata:
labels:
app: metis-sentinel
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
spec:
serviceAccountName: metis
nodeSelector:
kubernetes.io/os: linux
node-role.kubernetes.io/worker: "true"
containers:
- name: metis-sentinel
image: registry.bstein.dev/bstein/metis-sentinel:latest
imagePullPolicy: Always
command:
- /bin/sh
- -c
args:
- |
set -eu
out_dir="${METIS_SENTINEL_OUT:-/var/run/metis-sentinel}"
interval="${METIS_SENTINEL_INTERVAL_SEC:-120}"
mkdir -p "${out_dir}"
while true; do
ts="$(date -u +%Y%m%dT%H%M%SZ)"
node="${METIS_SENTINEL_NODE:-unknown}"
tmp="${out_dir}/${node}-${ts}.json.tmp"
out="${out_dir}/${node}-${ts}.json"
if metis-sentinel > "${tmp}"; then
mv "${tmp}" "${out}"
else
rm -f "${tmp}" || true
fi
sleep "${interval}"
done
envFrom:
- configMapRef:
name: metis
env:
- name: METIS_SENTINEL_NODE
valueFrom:
fieldRef:
fieldPath: spec.nodeName
ports:
- name: http
containerPort: 8080
volumeMounts:
- name: sentinel-output
mountPath: /var/run/metis-sentinel
resources:
requests:
cpu: 25m
memory: 64Mi
limits:
cpu: 250m
memory: 256Mi
securityContext:
allowPrivilegeEscalation: false
runAsUser: 0
capabilities:
drop: ["ALL"]
- name: sentinel-pusher
image: curlimages/curl:8.12.1
imagePullPolicy: IfNotPresent
command:
- /bin/sh
- -c
args:
- |
set -eu
out_dir="${METIS_SENTINEL_OUT:-/var/run/metis-sentinel}"
push_url="${METIS_SENTINEL_PUSH_URL:-}"
interval="${METIS_SENTINEL_PUSH_INTERVAL_SEC:-120}"
timeout="${METIS_SENTINEL_PUSH_TIMEOUT_SEC:-10}"
mkdir -p "${out_dir}"
while true; do
for snapshot in "${out_dir}"/*.json; do
[ -f "${snapshot}" ] || continue
if [ -z "${push_url}" ]; then
break
fi
if curl -fsS --connect-timeout "${timeout}" --max-time "${timeout}" \
-X POST \
-H "Content-Type: application/json" \
-H "X-Metis-Node: ${METIS_SENTINEL_NODE:-unknown}" \
--data-binary "@${snapshot}" \
"${push_url}"; then
rm -f "${snapshot}"
fi
done
sleep "${interval}"
done
envFrom:
- configMapRef:
name: metis
env:
- name: METIS_SENTINEL_NODE
valueFrom:
fieldRef:
fieldPath: spec.nodeName
volumeMounts:
- name: sentinel-output
mountPath: /var/run/metis-sentinel
resources:
requests:
cpu: 10m
memory: 32Mi
limits:
cpu: 100m
memory: 128Mi
securityContext:
allowPrivilegeEscalation: false
runAsUser: 0
capabilities:
drop: ["ALL"]
volumes:
- name: sentinel-output
emptyDir: {}

View File

@ -0,0 +1,18 @@
# services/maintenance/metis-service.yaml
apiVersion: v1
kind: Service
metadata:
name: metis
namespace: maintenance
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "80"
prometheus.io/path: "/metrics"
spec:
type: ClusterIP
selector:
app: metis
ports:
- name: http
port: 80
targetPort: http

View File

@ -0,0 +1,6 @@
# services/maintenance/metis-serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: metis
namespace: maintenance

View File

@ -0,0 +1,30 @@
# services/maintenance/metis-token-sync-rbac.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: metis-token-sync
namespace: maintenance
rules:
- apiGroups: [""]
resources:
- secrets
verbs:
- get
- list
- create
- update
- patch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: metis-token-sync
namespace: maintenance
subjects:
- kind: ServiceAccount
name: metis-token-sync
namespace: maintenance
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: metis-token-sync

View File

@ -0,0 +1,6 @@
# services/maintenance/metis-token-sync-serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: metis-token-sync
namespace: maintenance

View File

@ -10,6 +10,8 @@ spec:
app: node-image-sweeper app: node-image-sweeper
updateStrategy: updateStrategy:
type: RollingUpdate type: RollingUpdate
rollingUpdate:
maxUnavailable: 100%
template: template:
metadata: metadata:
labels: labels:
@ -29,6 +31,21 @@ spec:
- name: node-image-sweeper - name: node-image-sweeper
image: python:3.12.9-alpine3.20 image: python:3.12.9-alpine3.20
command: ["/bin/sh", "/scripts/node_image_sweeper.sh"] command: ["/bin/sh", "/scripts/node_image_sweeper.sh"]
env:
- name: SWEEP_INTERVAL_SEC
value: "21600"
- name: HIGH_USAGE_PERCENT
value: "70"
- name: EMERGENCY_USAGE_PERCENT
value: "80"
- name: BASE_THRESHOLD_DAYS
value: "14"
- name: HIGH_USAGE_THRESHOLD_DAYS
value: "3"
- name: LOG_RETENTION_DAYS
value: "7"
- name: JOURNAL_MAX_SIZE
value: "200M"
securityContext: securityContext:
privileged: true privileged: true
runAsUser: 0 runAsUser: 0

View File

@ -2,26 +2,39 @@
set -eu set -eu
ONE_SHOT=${ONE_SHOT:-false} ONE_SHOT=${ONE_SHOT:-false}
THRESHOLD_DAYS=14 SWEEP_INTERVAL_SEC=${SWEEP_INTERVAL_SEC:-21600}
BASE_THRESHOLD_DAYS=${BASE_THRESHOLD_DAYS:-14}
HIGH_USAGE_THRESHOLD_DAYS=${HIGH_USAGE_THRESHOLD_DAYS:-3}
HIGH_USAGE_PERCENT=${HIGH_USAGE_PERCENT:-70}
EMERGENCY_USAGE_PERCENT=${EMERGENCY_USAGE_PERCENT:-85}
LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-7}
JOURNAL_MAX_SIZE=${JOURNAL_MAX_SIZE:-200M}
SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage="" sweep_once() {
if [ -n "${usage}" ] && [ "${usage}" -ge 70 ]; then usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
THRESHOLD_DAYS=3 threshold_days="${BASE_THRESHOLD_DAYS}"
fi if [ -n "${usage}" ] && [ "${usage}" -ge "${HIGH_USAGE_PERCENT}" ]; then
threshold_days="${HIGH_USAGE_THRESHOLD_DAYS}"
fi
cutoff=$(python3 - <<'PY' cutoff=$(THRESHOLD_DAYS="${threshold_days}" python3 - <<'PY'
import time, os import os
print(int(time.time()) - int(os.environ.get("THRESHOLD_DAYS", "14")) * 86400) import time
days = int(os.environ.get("THRESHOLD_DAYS", "14"))
print(int(time.time()) - days * 86400)
PY PY
) )
RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ') RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}') IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause" prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
import json
prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY' import os
import json, os, sys, time import sys
import time
try: try:
data = json.load(sys.stdin) data = json.load(sys.stdin)
@ -74,19 +87,33 @@ for p in prune:
PY PY
) )
if [ -n "${prune_list}" ]; then if [ -n "${prune_list}" ]; then
printf "%s" "${prune_list}" | while read -r image_id; do printf "%s" "${prune_list}" | while read -r image_id; do
if [ -n "${image_id}" ]; then if [ -n "${image_id}" ]; then
chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true
fi fi
done done
fi fi
find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true
if [ -n "${usage}" ] && [ "${usage}" -ge "${EMERGENCY_USAGE_PERCENT}" ]; then
# Emergency pass for rootfs pressure on SD-backed nodes.
chroot /host /bin/sh -c "journalctl --vacuum-size='${JOURNAL_MAX_SIZE}' >/dev/null 2>&1 || true"
find /host/var/log -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
find /host/var/log/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
chroot /host /bin/sh -c "if command -v apt-get >/dev/null 2>&1; then apt-get clean >/dev/null 2>&1 || true; fi"
fi
}
sweep_once
if [ "${ONE_SHOT}" = "true" ]; then if [ "${ONE_SHOT}" = "true" ]; then
exit 0 exit 0
fi fi
sleep infinity while true; do
sleep "${SWEEP_INTERVAL_SEC}"
sweep_once
done

View File

@ -0,0 +1,10 @@
# services/maintenance/soteria-configmap.yaml
# Runtime configuration for soteria, injected via envFrom in its Deployment.
# Points the backup driver at the in-cluster Longhorn backend service.
apiVersion: v1
kind: ConfigMap
metadata:
  name: soteria
  namespace: maintenance
data:
  SOTERIA_BACKUP_DRIVER: "longhorn"
  SOTERIA_LONGHORN_URL: "http://longhorn-backend.longhorn-system.svc:9500"
  SOTERIA_LONGHORN_BACKUP_MODE: "incremental"

View File

@ -0,0 +1,73 @@
# services/maintenance/soteria-deployment.yaml
# Single-replica soteria deployment. Scheduling is pinned to arm64 worker
# nodes, preferring rpi5 over rpi4 hardware. Config comes from the "soteria"
# ConfigMap; the container runs non-root with all capabilities dropped.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: soteria
  namespace: maintenance
spec:
  replicas: 1
  revisionHistoryLimit: 3
  selector:
    matchLabels:
      app: soteria
  template:
    metadata:
      labels:
        app: soteria
    spec:
      serviceAccountName: soteria
      nodeSelector:
        kubernetes.io/arch: arm64
        node-role.kubernetes.io/worker: "true"
      affinity:
        nodeAffinity:
          # Soft preference only: falls back to any matching worker if no
          # rpi5/rpi4-labelled node is schedulable.
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 90
              preference:
                matchExpressions:
                  - key: hardware
                    operator: In
                    values: ["rpi5"]
            - weight: 50
              preference:
                matchExpressions:
                  - key: hardware
                    operator: In
                    values: ["rpi4"]
      containers:
        - name: soteria
          # :latest with Always pull means rollouts pick up new pushes;
          # pull auth comes from the ServiceAccount's imagePullSecrets.
          image: registry.bstein.dev/bstein/soteria:latest
          imagePullPolicy: Always
          ports:
            - name: http
              containerPort: 8080
          envFrom:
            - configMapRef:
                name: soteria
          livenessProbe:
            httpGet:
              path: /healthz
              port: http
            initialDelaySeconds: 5
            periodSeconds: 10
            timeoutSeconds: 2
          readinessProbe:
            httpGet:
              path: /readyz
              port: http
            initialDelaySeconds: 2
            periodSeconds: 5
            timeoutSeconds: 2
          resources:
            requests:
              cpu: 50m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 256Mi
          securityContext:
            allowPrivilegeEscalation: false
            runAsNonRoot: true
            runAsUser: 65532
            capabilities:
              drop: ["ALL"]

View File

@ -0,0 +1,22 @@
# services/maintenance/soteria-rbac.yaml
# Cluster-wide read-only access to PVCs/PVs for soteria (backup inventory);
# no write verbs are granted.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: soteria
rules:
  - apiGroups: [""]
    resources: ["persistentvolumeclaims", "persistentvolumes"]
    verbs: ["get", "list"]
---
# Binds the read-only ClusterRole to the soteria ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: soteria
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: soteria
subjects:
  - kind: ServiceAccount
    name: soteria
    namespace: maintenance

View File

@ -0,0 +1,14 @@
# services/maintenance/soteria-service.yaml
# ClusterIP Service for soteria; port 80 forwards to the container port
# named "http" (8080 in the soteria Deployment).
apiVersion: v1
kind: Service
metadata:
  name: soteria
  namespace: maintenance
spec:
  type: ClusterIP
  selector:
    app: soteria
  ports:
    - name: http
      port: 80
      targetPort: http

View File

@ -0,0 +1,8 @@
# services/maintenance/soteria-serviceaccount.yaml
# Identity for the soteria workload. imagePullSecrets supplies the Harbor
# registry credential used when pulling registry.bstein.dev images.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: soteria
  namespace: maintenance
imagePullSecrets:
  - name: harbor-regcred

View File

@ -20,7 +20,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) 
kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) 
(kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }
@ -89,7 +89,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))", "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) 
or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }

View File

@ -1125,7 +1125,7 @@
{ {
"id": 17, "id": 17,
"type": "stat", "type": "stat",
"title": "Ariadne CI Coverage (%)", "title": "Platform CI Coverage (%)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1138,7 +1138,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}", "expr": "ariadne_ci_coverage_percent{repo=~\"ariadne|metis\"}",
"refId": "A", "refId": "A",
"legendFormat": "{{branch}}", "legendFormat": "{{branch}}",
"instant": true "instant": true
@ -1183,12 +1183,13 @@
"values": false "values": false
}, },
"textMode": "value" "textMode": "value"
} },
"description": "Internal source panel for Atlas Overview automation test rollups."
}, },
{ {
"id": 18, "id": 18,
"type": "table", "type": "table",
"title": "Ariadne CI Tests (latest)", "title": "Platform CI Tests (latest)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1201,7 +1202,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "ariadne_ci_tests_total{repo=\"ariadne\"}", "expr": "ariadne_ci_tests_total{repo=~\"ariadne|metis\"}",
"refId": "A", "refId": "A",
"instant": true "instant": true
} }
@ -1233,7 +1234,8 @@
"order": "desc" "order": "desc"
} }
} }
] ],
"description": "Atlas Overview test panels depend on these internal repo-tagged CI series."
} }
], ],
"time": { "time": {

View File

@ -1677,7 +1677,7 @@
{ {
"id": 42, "id": 42,
"type": "timeseries", "type": "timeseries",
"title": "Ariadne Test Success Rate", "title": "Platform Test Success Rate",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1690,7 +1690,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)", "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)",
"refId": "A" "refId": "A"
} }
], ],
@ -1709,12 +1709,13 @@
"tooltip": { "tooltip": {
"mode": "multi" "mode": "multi"
} }
} },
"description": "Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. Add new test series there first so they roll up here."
}, },
{ {
"id": 43, "id": 43,
"type": "bargauge", "type": "bargauge",
"title": "Tests with Failures (24h)", "title": "Platform Tests with Failures (24h)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1727,7 +1728,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))", "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h])))",
"refId": "A", "refId": "A",
"legendFormat": "{{result}}", "legendFormat": "{{result}}",
"instant": true "instant": true
@ -1814,7 +1815,8 @@
"order": "desc" "order": "desc"
} }
} }
] ],
"description": "This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query."
}, },
{ {
"id": 11, "id": 11,
@ -1901,7 +1903,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) 
kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) 
(kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }

View File

@ -22,7 +22,24 @@ data:
- orgId: 1 - orgId: 1
receiver: email-admins receiver: email-admins
group_by: group_by:
- grafana_folder
- alertname - alertname
group_wait: 1m
group_interval: 30m
repeat_interval: 12h
routes:
- receiver: email-admins
object_matchers:
- [severity, "=", "critical"]
group_wait: 30s
group_interval: 5m
repeat_interval: 2h
- receiver: email-admins
object_matchers:
- [severity, "=", "warning"]
group_wait: 5m
group_interval: 2h
repeat_interval: 24h
rules.yaml: | rules.yaml: |
apiVersion: 1 apiVersion: 1
groups: groups:
@ -32,7 +49,7 @@ data:
interval: 1m interval: 1m
rules: rules:
- uid: disk-pressure-root - uid: disk-pressure-root
title: "Node rootfs high (>80%)" title: "Node rootfs high (>85%)"
condition: C condition: C
for: "10m" for: "10m"
data: data:
@ -66,7 +83,7 @@ data:
type: threshold type: threshold
conditions: conditions:
- evaluator: - evaluator:
params: [80] params: [85]
type: gt type: gt
operator: operator:
type: and type: and
@ -76,7 +93,7 @@ data:
noDataState: NoData noDataState: NoData
execErrState: Error execErrState: Error
annotations: annotations:
summary: "{{ $labels.node }} rootfs >80% for 10m" summary: "{{ $labels.node }} rootfs >85% for 10m"
labels: labels:
severity: warning severity: warning
- uid: disk-growth-1h - uid: disk-growth-1h
@ -145,7 +162,7 @@ data:
model: model:
intervalMs: 60000 intervalMs: 60000
maxDataPoints: 43200 maxDataPoints: 43200
expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\") expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
legendFormat: '{{instance}}' legendFormat: '{{instance}}'
datasource: datasource:
type: prometheus type: prometheus
@ -286,8 +303,8 @@ data:
summary: "node-image-sweeper not fully ready" summary: "node-image-sweeper not fully ready"
labels: labels:
severity: warning severity: warning
- uid: maint-cron-stale - uid: maint-ariadne-image-sweeper-stale
title: "Maintenance CronJobs stale (>3h since success)" title: "Ariadne image sweeper stale (schedule >8d)"
condition: C condition: C
for: "5m" for: "5m"
data: data:
@ -297,10 +314,10 @@ data:
to: 0 to: 0
datasourceUid: atlas-vm datasourceUid: atlas-vm
model: model:
expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) and on(cronjob) (kube_cronjob_spec_suspend{namespace="maintenance",cronjob="image-sweeper"} == 0) expr: time() - ariadne_schedule_last_success_timestamp_seconds{task="schedule.image_sweeper"}
intervalMs: 60000 intervalMs: 60000
maxDataPoints: 43200 maxDataPoints: 43200
legendFormat: '{{cronjob}}' legendFormat: '{{task}}'
datasource: datasource:
type: prometheus type: prometheus
uid: atlas-vm uid: atlas-vm
@ -321,17 +338,166 @@ data:
type: threshold type: threshold
conditions: conditions:
- evaluator: - evaluator:
params: [10800] params: [691200]
type: gt type: gt
operator: operator:
type: and type: and
reducer: reducer:
type: last type: last
type: query type: query
noDataState: NoData noDataState: OK
execErrState: Error execErrState: Error
annotations: annotations:
summary: "Maintenance cronjob stale >3h since last success" summary: "Ariadne image sweeper stale >8d since last success"
labels:
severity: warning
- uid: maint-cron-stale
title: "Maintenance CronJobs stale (legacy disabled)"
condition: C
for: "5m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
expr: vector(0)
intervalMs: 60000
maxDataPoints: 43200
legendFormat: legacy
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [1]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: OK
annotations:
summary: "Legacy cronjob alert disabled"
labels:
severity: info
- orgId: 1
name: ariadne
folder: Alerts
interval: 1m
rules:
- uid: ariadne-schedule-error
title: "Ariadne schedule task failed"
condition: C
for: "10m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
expr: max by (task) (ariadne_schedule_last_status{task=~"schedule\\..+"})
intervalMs: 60000
maxDataPoints: 43200
legendFormat: '{{task}}'
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [1]
type: lt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Error
annotations:
summary: "Ariadne schedule failed ({{ $labels.task }})"
labels:
severity: warning
- uid: ariadne-scheduler-stalled
title: "Ariadne scheduler behind (>15m)"
condition: C
for: "10m"
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: atlas-vm
model:
expr: time() - ariadne_schedule_next_run_timestamp_seconds{task=~"schedule\\..+"}
intervalMs: 60000
maxDataPoints: 43200
legendFormat: '{{task}}'
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [900]
type: gt
operator:
type: and
reducer:
type: last
type: query
noDataState: OK
execErrState: Error
annotations:
summary: "Ariadne scheduler behind for {{ $labels.task }}"
labels: labels:
severity: warning severity: warning
- orgId: 1 - orgId: 1
@ -352,7 +518,7 @@ data:
model: model:
intervalMs: 60000 intervalMs: 60000
maxDataPoints: 43200 maxDataPoints: 43200
expr: POSTMARK_OUTBOUND_BOUNCE_RATE{window="1d"} expr: max(postmark_outbound_bounce_rate{window="1d"}) or on() vector(0)
legendFormat: bounce 1d legendFormat: bounce 1d
datasource: datasource:
type: prometheus type: prometheus
@ -381,7 +547,7 @@ data:
reducer: reducer:
type: last type: last
type: query type: query
noDataState: NoData noDataState: OK
execErrState: Error execErrState: Error
annotations: annotations:
summary: "Postmark 1d bounce rate >5%" summary: "Postmark 1d bounce rate >5%"
@ -400,7 +566,7 @@ data:
model: model:
intervalMs: 60000 intervalMs: 60000
maxDataPoints: 43200 maxDataPoints: 43200
expr: POSTMARK_API_UP expr: max(postmark_api_up) or on() vector(0)
legendFormat: api up legendFormat: api up
datasource: datasource:
type: prometheus type: prometheus
@ -429,7 +595,7 @@ data:
reducer: reducer:
type: last type: last
type: query type: query
noDataState: NoData noDataState: OK
execErrState: Error execErrState: Error
annotations: annotations:
summary: "Postmark exporter reports API down" summary: "Postmark exporter reports API down"

View File

@ -29,7 +29,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) 
kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) 
(kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }
@ -98,7 +98,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))", "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) 
or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }

View File

@ -1134,7 +1134,7 @@ data:
{ {
"id": 17, "id": 17,
"type": "stat", "type": "stat",
"title": "Ariadne CI Coverage (%)", "title": "Platform CI Coverage (%)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1147,7 +1147,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}", "expr": "ariadne_ci_coverage_percent{repo=~\"ariadne|metis\"}",
"refId": "A", "refId": "A",
"legendFormat": "{{branch}}", "legendFormat": "{{branch}}",
"instant": true "instant": true
@ -1192,12 +1192,13 @@ data:
"values": false "values": false
}, },
"textMode": "value" "textMode": "value"
} },
"description": "Internal source panel for Atlas Overview automation test rollups."
}, },
{ {
"id": 18, "id": 18,
"type": "table", "type": "table",
"title": "Ariadne CI Tests (latest)", "title": "Platform CI Tests (latest)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1210,7 +1211,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "ariadne_ci_tests_total{repo=\"ariadne\"}", "expr": "ariadne_ci_tests_total{repo=~\"ariadne|metis\"}",
"refId": "A", "refId": "A",
"instant": true "instant": true
} }
@ -1242,7 +1243,8 @@ data:
"order": "desc" "order": "desc"
} }
} }
] ],
"description": "Atlas Overview test panels depend on these internal repo-tagged CI series."
} }
], ],
"time": { "time": {

View File

@ -1686,7 +1686,7 @@ data:
{ {
"id": 42, "id": 42,
"type": "timeseries", "type": "timeseries",
"title": "Ariadne Test Success Rate", "title": "Platform Test Success Rate",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1699,7 +1699,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)", "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)",
"refId": "A" "refId": "A"
} }
], ],
@ -1718,12 +1718,13 @@ data:
"tooltip": { "tooltip": {
"mode": "multi" "mode": "multi"
} }
} },
"description": "Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. Add new test series there first so they roll up here."
}, },
{ {
"id": 43, "id": 43,
"type": "bargauge", "type": "bargauge",
"title": "Tests with Failures (24h)", "title": "Platform Tests with Failures (24h)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1736,7 +1737,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))", "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h])))",
"refId": "A", "refId": "A",
"legendFormat": "{{result}}", "legendFormat": "{{result}}",
"instant": true "instant": true
@ -1823,7 +1824,8 @@ data:
"order": "desc" "order": "desc"
} }
} }
] ],
"description": "This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query."
}, },
{ {
"id": 11, "id": 11,
@ -1910,7 +1912,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by 
(namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))", "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) 
kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) 
(kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }

View File

@ -286,7 +286,7 @@ spec:
podAnnotations: podAnnotations:
vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "monitoring" vault.hashicorp.com/role: "monitoring"
monitoring.bstein.dev/restart-rev: "1" monitoring.bstein.dev/restart-rev: "6"
vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin" vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
vault.hashicorp.com/agent-inject-template-grafana-env.sh: | vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
{{ with secret "kv/data/atlas/monitoring/grafana-admin" }} {{ with secret "kv/data/atlas/monitoring/grafana-admin" }}

View File

@ -43,6 +43,12 @@ spec:
value: /var/run/secrets/vault-token-reviewer/token value: /var/run/secrets/vault-token-reviewer/token
- name: VAULT_K8S_ROLE_TTL - name: VAULT_K8S_ROLE_TTL
value: 1h value: 1h
- name: VAULT_K8S_BOUND_AUDIENCES
value: "https://kubernetes.default.svc,https://kubernetes.default.svc.cluster.local,k3s"
- name: VAULT_K8S_ISSUER
value: https://kubernetes.default.svc.cluster.local
- name: VAULT_K8S_DISABLE_ISS_VALIDATION
value: "false"
volumeMounts: volumeMounts:
- name: k8s-auth-config-script - name: k8s-auth-config-script
mountPath: /scripts mountPath: /scripts

View File

@ -53,6 +53,8 @@ ensure_token
k8s_host="https://${KUBERNETES_SERVICE_HOST}:443" k8s_host="https://${KUBERNETES_SERVICE_HOST}:443"
k8s_ca="$(cat /var/run/secrets/kubernetes.io/serviceaccount/ca.crt)" k8s_ca="$(cat /var/run/secrets/kubernetes.io/serviceaccount/ca.crt)"
k8s_token="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" k8s_token="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
k8s_issuer="${VAULT_K8S_ISSUER:-}"
disable_iss_validation="${VAULT_K8S_DISABLE_ISS_VALIDATION:-true}"
role_ttl="${VAULT_K8S_ROLE_TTL:-1h}" role_ttl="${VAULT_K8S_ROLE_TTL:-1h}"
token_reviewer_jwt="${VAULT_K8S_TOKEN_REVIEWER_JWT:-}" token_reviewer_jwt="${VAULT_K8S_TOKEN_REVIEWER_JWT:-}"
@ -68,11 +70,36 @@ if ! vault_cmd auth list -format=json | grep -q '"kubernetes/"'; then
vault_cmd auth enable kubernetes vault_cmd auth enable kubernetes
fi fi
ensure_default_policy_login() {
default_policy="$(vault_cmd policy read default)"
if printf '%s' "${default_policy}" | grep -q 'auth/kubernetes/login'; then
return
fi
log "updating default policy to allow kubernetes login"
default_policy="${default_policy}
path \"auth/kubernetes/login\" {
capabilities = [\"create\", \"update\"]
}
"
printf '%s\n' "${default_policy}" | vault_cmd policy write default -
}
log "configuring kubernetes auth" log "configuring kubernetes auth"
vault_cmd write auth/kubernetes/config \ if [ -n "${k8s_issuer}" ]; then
vault_cmd write auth/kubernetes/config \
token_reviewer_jwt="${token_reviewer_jwt}" \
kubernetes_host="${k8s_host}" \
kubernetes_ca_cert="${k8s_ca}" \
issuer="${k8s_issuer}" \
disable_iss_validation="${disable_iss_validation}"
else
vault_cmd write auth/kubernetes/config \
token_reviewer_jwt="${token_reviewer_jwt}" \ token_reviewer_jwt="${token_reviewer_jwt}" \
kubernetes_host="${k8s_host}" \ kubernetes_host="${k8s_host}" \
kubernetes_ca_cert="${k8s_ca}" kubernetes_ca_cert="${k8s_ca}"
fi
ensure_default_policy_login
write_raw_policy() { write_raw_policy() {
name="$1" name="$1"
@ -87,6 +114,7 @@ write_policy_and_role() {
service_accounts="$3" service_accounts="$3"
read_paths="$4" read_paths="$4"
write_paths="$5" write_paths="$5"
audiences="${VAULT_K8S_BOUND_AUDIENCES:-}"
policy_body="" policy_body=""
for path in ${read_paths}; do for path in ${read_paths}; do
@ -109,11 +137,42 @@ path \"kv/metadata/atlas/${path}\" {
} }
" "
done done
if [ "${role}" = "maintenance" ]; then
policy_body="${policy_body}
path \"sys/auth\" {
capabilities = [\"read\"]
}
path \"sys/auth/*\" {
capabilities = [\"create\", \"update\", \"read\", \"sudo\"]
}
path \"auth/kubernetes/*\" {
capabilities = [\"create\", \"update\", \"read\"]
}
path \"auth/oidc/*\" {
capabilities = [\"create\", \"update\", \"read\"]
}
path \"sys/policies/acl\" {
capabilities = [\"list\"]
}
path \"sys/policies/acl/*\" {
capabilities = [\"create\", \"update\", \"read\"]
}
"
fi
log "writing policy ${role}" log "writing policy ${role}"
printf '%s\n' "${policy_body}" | vault_cmd policy write "${role}" - printf '%s\n' "${policy_body}" | vault_cmd policy write "${role}" -
log "writing role ${role}" log "writing role ${role}"
if [ -n "${audiences}" ]; then
vault_cmd write "auth/kubernetes/role/${role}" \
bound_service_account_audiences="${audiences}" \
bound_service_account_names="${service_accounts}" \
bound_service_account_namespaces="${namespace}" \
policies="${role}" \
ttl="${role_ttl}"
return
fi
vault_cmd write "auth/kubernetes/role/${role}" \ vault_cmd write "auth/kubernetes/role/${role}" \
bound_service_account_names="${service_accounts}" \ bound_service_account_names="${service_accounts}" \
bound_service_account_namespaces="${namespace}" \ bound_service_account_namespaces="${namespace}" \
@ -218,6 +277,8 @@ write_policy_and_role "nextcloud" "nextcloud" "nextcloud-vault" \
"nextcloud/* shared/keycloak-admin shared/postmark-relay" "" "nextcloud/* shared/keycloak-admin shared/postmark-relay" ""
write_policy_and_role "comms" "comms" "comms-vault,atlasbot" \ write_policy_and_role "comms" "comms" "comms-vault,atlasbot" \
"comms/* shared/chat-ai-keys-runtime shared/harbor-pull" "" "comms/* shared/chat-ai-keys-runtime shared/harbor-pull" ""
write_policy_and_role "ai" "ai" "atlasbot" \
"comms/* shared/chat-ai-keys-runtime shared/harbor-pull" ""
write_policy_and_role "jenkins" "jenkins" "jenkins,jenkins-vault-sync" \ write_policy_and_role "jenkins" "jenkins" "jenkins,jenkins-vault-sync" \
"jenkins/* shared/harbor-pull" "" "jenkins/* shared/harbor-pull" ""
write_policy_and_role "monitoring" "monitoring" "monitoring-vault-sync" \ write_policy_and_role "monitoring" "monitoring" "monitoring-vault-sync" \
@ -231,7 +292,7 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \
write_policy_and_role "health" "health" "health-vault-sync" \ write_policy_and_role "health" "health" "health-vault-sync" \
"health/*" "" "health/*" ""
write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \ write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \
"maintenance/ariadne-db portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin vault/vault-oidc-config shared/harbor-pull" "" "maintenance/ariadne-db maintenance/soteria-restic portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin vault/vault-oidc-config shared/harbor-pull" ""
write_policy_and_role "finance" "finance" "finance-vault" \ write_policy_and_role "finance" "finance" "finance-vault" \
"finance/* shared/postmark-relay" "" "finance/* shared/postmark-relay" ""
write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \ write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \