maintenance: harden metis recovery and fix harbor rollout

maintenance/jenkins: align Metis ingress, sentinel push, and CI job
maintenance: add Metis service and sentinel manifests
2026-03-31 14:51:49 -03:00 · 2026-03-31 14:21:53 -03:00 · 2026-03-31 14:07:17 -03:00 · 2026-03-31 13:54:04 -03:00 · 2026-03-30 18:41:21 -03:00 · 2026-03-30 18:40:59 -03:00
94 changed files with 8133 additions and 383 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,6 +2,7 @@
 !README.md
 !knowledge/**/*.md
 !services/comms/knowledge/**/*.md
+!services/atlasbot/knowledge/**/*.md
 __pycache__/
 *.py[cod]
 .pytest_cache
--- a/clusters/atlas/flux-system/applications/atlasbot/image-automation.yaml
+++ b/clusters/atlas/flux-system/applications/atlasbot/image-automation.yaml
@ -0,0 +1,26 @@
+# clusters/atlas/flux-system/applications/atlasbot/image-automation.yaml
+apiVersion: image.toolkit.fluxcd.io/v1
+kind: ImageUpdateAutomation
+metadata:
+  name: atlasbot
+  namespace: ai
+spec:
+  interval: 1m0s
+  sourceRef:
+    kind: GitRepository
+    name: flux-system
+    namespace: flux-system
+  git:
+    checkout:
+      ref:
+        branch: feature/atlasbot
+    commit:
+      author:
+        email: ops@bstein.dev
+        name: flux-bot
+      messageTemplate: "chore(atlasbot): automated image update"
+    push:
+      branch: feature/atlasbot
+  update:
+    strategy: Setters
+    path: services/atlasbot
--- a/clusters/atlas/flux-system/applications/atlasbot/kustomization.yaml
+++ b/clusters/atlas/flux-system/applications/atlasbot/kustomization.yaml
@ -0,0 +1,17 @@
+# clusters/atlas/flux-system/applications/atlasbot/kustomization.yaml
+apiVersion: kustomize.toolkit.fluxcd.io/v1
+kind: Kustomization
+metadata:
+  name: atlasbot
+  namespace: flux-system
+spec:
+  interval: 10m
+  prune: true
+  sourceRef:
+    kind: GitRepository
+    name: flux-system
+  path: ./services/atlasbot
+  targetNamespace: ai
+  timeout: 2m
+  dependsOn:
+    - name: ai-llm
--- a/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml
+++ b/clusters/atlas/flux-system/applications/bstein-dev-home/image-automation.yaml
@ -13,14 +13,14 @@ spec:
  git:
    checkout:
      ref:
-        branch: feature/ariadne
+        branch: feature/atlasbot
    commit:
      author:
        email: ops@bstein.dev
        name: flux-bot
      messageTemplate: "chore(bstein-dev-home): automated image update"
    push:
-      branch: feature/ariadne
+      branch: feature/atlasbot
  update:
    strategy: Setters
    path: services/bstein-dev-home
--- a/clusters/atlas/flux-system/applications/comms/image-automation.yaml
+++ b/clusters/atlas/flux-system/applications/comms/image-automation.yaml
@ -0,0 +1,26 @@
+# clusters/atlas/flux-system/applications/comms/image-automation.yaml
+apiVersion: image.toolkit.fluxcd.io/v1
+kind: ImageUpdateAutomation
+metadata:
+  name: comms
+  namespace: comms
+spec:
+  interval: 1m0s
+  sourceRef:
+    kind: GitRepository
+    name: flux-system
+    namespace: flux-system
+  git:
+    checkout:
+      ref:
+        branch: feature/atlasbot
+    commit:
+      author:
+        email: ops@bstein.dev
+        name: flux-bot
+      messageTemplate: "chore(comms): automated image update"
+    push:
+      branch: feature/atlasbot
+  update:
+    strategy: Setters
+    path: services/comms
--- a/clusters/atlas/flux-system/applications/kustomization.yaml
+++ b/clusters/atlas/flux-system/applications/kustomization.yaml
@ -6,6 +6,9 @@ resources:
  - vault/kustomization.yaml
  - vaultwarden/kustomization.yaml
  - comms/kustomization.yaml
+  - comms/image-automation.yaml
+  - atlasbot/kustomization.yaml
+  - atlasbot/image-automation.yaml
  - crypto/kustomization.yaml
  - monerod/kustomization.yaml
  - pegasus/kustomization.yaml
--- a/clusters/atlas/flux-system/gotk-sync.yaml
+++ b/clusters/atlas/flux-system/gotk-sync.yaml
@ -9,7 +9,7 @@ metadata:
 spec:
  interval: 1m0s
  ref:
-    branch: feature/ariadne
+    branch: feature/atlasbot
  secretRef:
    name: flux-system-gitea
  url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git
--- a/clusters/atlas/flux-system/platform/kustomization.yaml
+++ b/clusters/atlas/flux-system/platform/kustomization.yaml
@ -16,5 +16,6 @@ resources:
  - longhorn/kustomization.yaml
  - longhorn-ui/kustomization.yaml
  - postgres/kustomization.yaml
+  - nats/kustomization.yaml
  - ../platform/vault-csi/kustomization.yaml
  - ../platform/vault-injector/kustomization.yaml
--- a/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml
+++ b/clusters/atlas/flux-system/platform/maintenance/image-automation.yaml
@ -13,14 +13,14 @@ spec:
  git:
    checkout:
      ref:
-        branch: feature/ariadne
+        branch: feature/atlasbot
    commit:
      author:
        email: ops@bstein.dev
        name: flux-bot
      messageTemplate: "chore(maintenance): automated image update"
    push:
-      branch: feature/ariadne
+      branch: feature/atlasbot
  update:
    strategy: Setters
    path: services/maintenance
--- a/clusters/atlas/flux-system/platform/nats/kustomization.yaml
+++ b/clusters/atlas/flux-system/platform/nats/kustomization.yaml
@ -0,0 +1,21 @@
+# clusters/atlas/flux-system/platform/nats/kustomization.yaml
+apiVersion: kustomize.toolkit.fluxcd.io/v1
+kind: Kustomization
+metadata:
+  name: nats
+  namespace: flux-system
+spec:
+  interval: 10m
+  path: ./infrastructure/nats
+  prune: true
+  force: true
+  sourceRef:
+    kind: GitRepository
+    name: flux-system
+  targetNamespace: nats
+  healthChecks:
+    - apiVersion: apps/v1
+      kind: StatefulSet
+      name: nats
+      namespace: nats
+  wait: true
--- a/dockerfiles/Dockerfile.synapse-admin-ensure
+++ b/dockerfiles/Dockerfile.synapse-admin-ensure
@ -0,0 +1,3 @@
+FROM python:3.11-slim
+
+RUN pip install --no-cache-dir psycopg2-binary bcrypt
--- a/infrastructure/core/kustomization.yaml
+++ b/infrastructure/core/kustomization.yaml
@ -6,6 +6,7 @@ resources:
  - ../modules/profiles/atlas-ha
  - coredns-custom.yaml
  - coredns-deployment.yaml
+  - longhorn-node-taints.yaml
  - ntp-sync-daemonset.yaml
  - ../sources/cert-manager/letsencrypt.yaml
  - ../sources/cert-manager/letsencrypt-prod.yaml
--- a/infrastructure/core/longhorn-node-taints.yaml
+++ b/infrastructure/core/longhorn-node-taints.yaml
@ -0,0 +1,40 @@
+# infrastructure/core/longhorn-node-taints.yaml
+apiVersion: v1
+kind: Node
+metadata:
+  name: titan-13
+spec:
+  taints:
+    - key: longhorn
+      value: "true"
+      effect: PreferNoSchedule
+---
+apiVersion: v1
+kind: Node
+metadata:
+  name: titan-15
+spec:
+  taints:
+    - key: longhorn
+      value: "true"
+      effect: PreferNoSchedule
+---
+apiVersion: v1
+kind: Node
+metadata:
+  name: titan-17
+spec:
+  taints:
+    - key: longhorn
+      value: "true"
+      effect: PreferNoSchedule
+---
+apiVersion: v1
+kind: Node
+metadata:
+  name: titan-19
+spec:
+  taints:
+    - key: longhorn
+      value: "true"
+      effect: PreferNoSchedule
--- a/infrastructure/longhorn/core/backup-target.yaml
+++ b/infrastructure/longhorn/core/backup-target.yaml
@ -0,0 +1,10 @@
+# infrastructure/longhorn/core/backup-target.yaml
+apiVersion: longhorn.io/v1beta2
+kind: BackupTarget
+metadata:
+  name: default
+  namespace: longhorn-system
+spec:
+  backupTargetURL: "s3://atlas-soteria@us-west-004/"
+  credentialSecret: longhorn-backup-b2
+  pollInterval: 5m0s
--- a/infrastructure/longhorn/core/helmrelease.yaml
+++ b/infrastructure/longhorn/core/helmrelease.yaml
@ -6,6 +6,39 @@ metadata:
  namespace: longhorn-system
 spec:
  interval: 30m
+  postRenderers:
+    - kustomize:
+        patches:
+          - target:
+              kind: Service
+              name: longhorn-conversion-webhook
+              namespace: longhorn-system
+            patch: |
+              - op: add
+                path: /spec/publishNotReadyAddresses
+                value: true
+          - target:
+              kind: Service
+              name: longhorn-admission-webhook
+              namespace: longhorn-system
+            patch: |
+              - op: add
+                path: /spec/publishNotReadyAddresses
+                value: true
+          - target:
+              kind: DaemonSet
+              name: longhorn-manager
+              namespace: longhorn-system
+            patch: |
+              - op: replace
+                path: /spec/template/spec/containers/0/readinessProbe/httpGet/path
+                value: /v1/healthz
+              - op: replace
+                path: /spec/template/spec/containers/0/readinessProbe/httpGet/port
+                value: 9500
+              - op: replace
+                path: /spec/template/spec/containers/0/readinessProbe/httpGet/scheme
+                value: HTTP
  chart:
    spec:
      chart: longhorn
@ -34,7 +67,7 @@ spec:
      createSecret: false
      registrySecret: longhorn-registry
    image:
-      pullPolicy: Always
+      pullPolicy: IfNotPresent
      longhorn:
        engine:
          repository: registry.bstein.dev/infra/longhorn-engine
@ -77,4 +110,4 @@ spec:
          repository: registry.bstein.dev/infra/longhorn-livenessprobe
          tag: v2.16.0
    defaultSettings:
-      systemManagedPodsImagePullPolicy: Always
+      systemManagedPodsImagePullPolicy: IfNotPresent
--- a/infrastructure/longhorn/core/kustomization.yaml
+++ b/infrastructure/longhorn/core/kustomization.yaml
@ -6,6 +6,7 @@ resources:
  - vault-serviceaccount.yaml
  - secretproviderclass.yaml
  - vault-sync-deployment.yaml
+  - backup-target.yaml
  - helmrelease.yaml
  - longhorn-settings-ensure-job.yaml

--- a/infrastructure/longhorn/core/secretproviderclass.yaml
+++ b/infrastructure/longhorn/core/secretproviderclass.yaml
@ -13,9 +13,27 @@ spec:
      - objectName: "harbor-pull__dockerconfigjson"
        secretPath: "kv/data/atlas/shared/harbor-pull"
        secretKey: "dockerconfigjson"
+      - objectName: "longhorn_backup__AWS_ACCESS_KEY_ID"
+        secretPath: "kv/data/atlas/longhorn/backup-b2"
+        secretKey: "AWS_ACCESS_KEY_ID"
+      - objectName: "longhorn_backup__AWS_SECRET_ACCESS_KEY"
+        secretPath: "kv/data/atlas/longhorn/backup-b2"
+        secretKey: "AWS_SECRET_ACCESS_KEY"
+      - objectName: "longhorn_backup__AWS_ENDPOINTS"
+        secretPath: "kv/data/atlas/longhorn/backup-b2"
+        secretKey: "AWS_ENDPOINTS"
  secretObjects:
    - secretName: longhorn-registry
      type: kubernetes.io/dockerconfigjson
      data:
        - objectName: harbor-pull__dockerconfigjson
          key: .dockerconfigjson
+    - secretName: longhorn-backup-b2
+      type: Opaque
+      data:
+        - objectName: longhorn_backup__AWS_ACCESS_KEY_ID
+          key: AWS_ACCESS_KEY_ID
+        - objectName: longhorn_backup__AWS_SECRET_ACCESS_KEY
+          key: AWS_SECRET_ACCESS_KEY
+        - objectName: longhorn_backup__AWS_ENDPOINTS
+          key: AWS_ENDPOINTS
--- a/infrastructure/nats/configmap.yaml
+++ b/infrastructure/nats/configmap.yaml
@ -0,0 +1,20 @
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: nats-config
+  namespace: nats
+  labels:
+    app: nats
+    component: config
+  annotations:
+    description: "NATS JetStream configuration"
+data:
+  nats.conf: |
+    # nats-server is started with only this file (-c), so the HTTP monitoring
+    # listener published as port 8222 has to be enabled explicitly here.
+    http_port: 8222
+    jetstream {
+      store_dir: /data
+      max_mem_store: 128MB
+      max_file_store: 1GB
+    }
--- a/infrastructure/nats/kustomization.yaml
+++ b/infrastructure/nats/kustomization.yaml
@ -0,0 +1,7 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - namespace.yaml
+  - configmap.yaml
+  - service.yaml
+  - statefulset.yaml
--- a/infrastructure/nats/namespace.yaml
+++ b/infrastructure/nats/namespace.yaml
@ -0,0 +1,4 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: nats
--- a/infrastructure/nats/service.yaml
+++ b/infrastructure/nats/service.yaml
@ -0,0 +1,17 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: nats
+  namespace: nats
+  labels:
+    app: nats
+spec:
+  selector:
+    app: nats
+  ports:
+    - name: client
+      port: 4222
+      targetPort: 4222
+    - name: monitoring
+      port: 8222
+      targetPort: 8222
--- a/infrastructure/nats/statefulset.yaml
+++ b/infrastructure/nats/statefulset.yaml
@ -0,0 +1,54 @@
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: nats
+  namespace: nats
+  labels:
+    app: nats
+spec:
+  serviceName: nats
+  replicas: 1
+  selector:
+    matchLabels:
+      app: nats
+  template:
+    metadata:
+      labels:
+        app: nats
+    spec:
+      containers:
+        - name: nats
+          image: nats:2.10.18
+          args:
+            - "-c"
+            - "/etc/nats/nats.conf"
+          ports:
+            - name: client
+              containerPort: 4222
+            - name: monitoring
+              containerPort: 8222
+          volumeMounts:
+            - name: config
+              mountPath: /etc/nats
+            - name: data
+              mountPath: /data
+          resources:
+            requests:
+              cpu: 100m
+              memory: 256Mi
+            limits:
+              cpu: 500m
+              memory: 512Mi
+      volumes:
+        - name: config
+          configMap:
+            name: nats-config
+  volumeClaimTemplates:
+    - metadata:
+        name: data
+      spec:
+        accessModes:
+          - ReadWriteOnce
+        resources:
+          requests:
+            storage: 2Gi
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@ -47,6 +47,7 @@ PERCENT_THRESHOLDS = {
 }

 NAMESPACE_CPU_WINDOW = "1m"
+GPU_RESOURCE_REGEX = r"nvidia[.]com/gpu.*|nvidia_com_gpu.*"

 # ---------------------------------------------------------------------------
 # Cluster metadata
@ -235,13 +236,16 @@ def gpu_util_by_hostname():


 def gpu_node_labels():
-    return 'kube_node_labels{label_accelerator=~".+"} or kube_node_labels{label_jetson="true"}'
+    return (
+        f'(max by (node) (kube_node_status_allocatable{{resource=~"{GPU_RESOURCE_REGEX}"}} > bool 0))'
+        ' or max by (node) (kube_node_labels{label_jetson="true"})'  # node-only labels so `or` yields one series per node
+    )


 def gpu_requests_by_namespace_node(scope_var):
    return (
        "sum by (namespace,node) ("
-        f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} '
+        f'kube_pod_container_resource_requests{{resource=~"{GPU_RESOURCE_REGEX}",{scope_var}}} '
        "* on(namespace,pod) group_left(node) kube_pod_info "
        f"* on(node) group_left() ({gpu_node_labels()})"
        ")"
@ -253,7 +257,7 @@ def gpu_usage_by_namespace(scope_var):
    total_by_node = f"sum by (node) ({requests_by_ns})"
    return (
        "sum by (namespace) ("
-        f"({requests_by_ns}) / clamp_min({total_by_node}, 1) "
+        f"({requests_by_ns}) / on(node) group_left() clamp_min({total_by_node}, 1) "
        f"* on(node) group_left() ({gpu_util_by_node()})"
        ")"
    )
@ -419,16 +423,17 @@ ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = (
    "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600"
 )
 ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
-ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}'
-ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}'
-ARIADNE_TEST_SUCCESS_RATE = (
+TEST_REPO_SELECTOR = 'repo=~"ariadne|metis"'
+TEST_CI_COVERAGE = f'ariadne_ci_coverage_percent{{{TEST_REPO_SELECTOR}}}'
+TEST_CI_TESTS = f'ariadne_ci_tests_total{{{TEST_REPO_SELECTOR}}}'
+TEST_SUCCESS_RATE = (
    "100 * "
-    'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[30d])) '
+    f'sum(max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result="passed"}}[30d])) '
    "/ clamp_min("
-    'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[30d])), 1)'
+    f'sum(max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"passed|failed|error"}}[30d])), 1)'
 )
-ARIADNE_TEST_FAILURES_24H = (
-    'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))'
+TEST_FAILURES_24H = (
+    f'sum by (result) (max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"failed|error"}}[24h]))'
 )
 POSTGRES_CONN_USED = (
    'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") '
@ -1290,23 +1295,25 @@ def build_overview():
            },
        }
    )
-    panels.append(
-        timeseries_panel(
+    test_success = timeseries_panel(
        42,
-            "Ariadne Test Success Rate",
-            ARIADNE_TEST_SUCCESS_RATE,
+        "Platform Test Success Rate",
+        TEST_SUCCESS_RATE,
        {"h": 6, "w": 6, "x": 12, "y": 14},
        unit="percent",
        max_value=100,
        legend=None,
        legend_display="list",
    )
+    test_success["description"] = (
+        "Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. "
+        "Add new test series there first so they roll up here."
    )
-    panels.append(
-        bargauge_panel(
+    panels.append(test_success)
+    test_failures = bargauge_panel(
        43,
-            "Tests with Failures (24h)",
-            ARIADNE_TEST_FAILURES_24H,
+        "Platform Tests with Failures (24h)",
+        TEST_FAILURES_24H,
        {"h": 6, "w": 6, "x": 18, "y": 14},
        unit="none",
        instant=True,
@ -1331,7 +1338,10 @@ def build_overview():
            ],
        },
    )
+    test_failures["description"] = (
+        "This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query."
    )
+    panels.append(test_failures)

    cpu_scope = "$namespace_scope_cpu"
    gpu_scope = "$namespace_scope_gpu"
@ -2649,29 +2659,31 @@ def build_jobs_dashboard():
            legend="{{status}}",
        )
    )
-    panels.append(
-        stat_panel(
+    coverage_panel = stat_panel(
        17,
-            "Ariadne CI Coverage (%)",
-            ARIADNE_CI_COVERAGE,
+        "Platform CI Coverage (%)",
+        TEST_CI_COVERAGE,
        {"h": 6, "w": 4, "x": 8, "y": 11},
        unit="percent",
        decimals=1,
        instant=True,
        legend="{{branch}}",
    )
-    )
-    panels.append(
-        table_panel(
+    coverage_panel["description"] = "Internal source panel for Atlas Overview automation test rollups."
+    panels.append(coverage_panel)
+    tests_panel = table_panel(
        18,
-            "Ariadne CI Tests (latest)",
-            ARIADNE_CI_TESTS,
+        "Platform CI Tests (latest)",
+        TEST_CI_TESTS,
        {"h": 6, "w": 12, "x": 12, "y": 11},
        unit="none",
        transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}],
        instant=True,
    )
+    tests_panel["description"] = (
+        "Atlas Overview test panels depend on these internal repo-tagged CI series."
    )
+    panels.append(tests_panel)

    return {
        "uid": "atlas-jobs",
--- a/scripts/knowledge_render_atlas.py
+++ b/scripts/knowledge_render_atlas.py
@ -539,9 +539,9 @@ def main() -> int:
        help="Write generated files (otherwise just print a summary).",
    )
    ap.add_argument(
-        "--sync-comms",
+        "--sync-atlasbot",
        action="store_true",
-        help="Mirror rendered knowledge into services/comms/knowledge for atlasbot.",
+        help="Mirror rendered knowledge into services/atlasbot/knowledge for atlasbot.",
    )
    args = ap.parse_args()

@ -632,10 +632,10 @@ def main() -> int:
    print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}")
    print(f"Wrote {metrics_json_path.relative_to(REPO_ROOT)}")

-    if args.sync_comms:
-        comms_dir = REPO_ROOT / "services" / "comms" / "knowledge"
-        _sync_tree(out_dir, comms_dir)
-        print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {comms_dir.relative_to(REPO_ROOT)}")
+    if args.sync_atlasbot:
+        atlasbot_dir = REPO_ROOT / "services" / "atlasbot" / "knowledge"
+        _sync_tree(out_dir, atlasbot_dir)
+        print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {atlasbot_dir.relative_to(REPO_ROOT)}")
    return 0


--- a/services/atlasbot/atlasbot-deployment.yaml
+++ b/services/atlasbot/atlasbot-deployment.yaml
@ -3,7 +3,7 @@ apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: atlasbot
-  namespace: comms
+  namespace: ai
  labels:
    app: atlasbot
 spec:
@ -18,7 +18,7 @@ spec:
      annotations:
        checksum/atlasbot-configmap: manual-atlasbot-101
        vault.hashicorp.com/agent-inject: "true"
-        vault.hashicorp.com/role: "comms"
+        vault.hashicorp.com/role: "ai"
        vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
        vault.hashicorp.com/agent-inject-template-turn-secret: |
          {{- with secret "kv/data/atlas/comms/turn-shared-secret" -}}{{ .Data.data.TURN_STATIC_AUTH_SECRET }}{{- end -}}
@ -28,6 +28,15 @@ spec:
        vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
        vault.hashicorp.com/agent-inject-template-bot-pass: |
          {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}}
+        vault.hashicorp.com/agent-inject-secret-bot-quick-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
+        vault.hashicorp.com/agent-inject-template-bot-quick-pass: |
+          {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-quick-password" }}{{- end -}}
+        vault.hashicorp.com/agent-inject-secret-bot-smart-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
+        vault.hashicorp.com/agent-inject-template-bot-smart-pass: |
+          {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-smart-password" }}{{- end -}}
+        vault.hashicorp.com/agent-inject-secret-bot-genius-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
+        vault.hashicorp.com/agent-inject-template-bot-genius-pass: |
+          {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-genius-password" }}{{- end -}}
        vault.hashicorp.com/agent-inject-secret-seeder-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
        vault.hashicorp.com/agent-inject-template-seeder-pass: |
          {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "seeder-password" }}{{- end -}}
@ -58,17 +67,17 @@ spec:
        hardware: rpi5
      containers:
        - name: atlasbot
-          image: python:3.11-slim
+          image: registry.bstein.dev/bstein/atlasbot:0.1.0-55
          command: ["/bin/sh","-c"]
          args:
            - |
-              . /vault/scripts/comms_vault_env.sh
-              exec python /app/bot.py
+              . /vault/scripts/atlasbot_vault_env.sh
+              exec python -m atlasbot.main
          env:
            - name: MATRIX_BASE
-              value: http://othrys-synapse-matrix-synapse:8008
+              value: http://othrys-synapse-matrix-synapse.comms.svc.cluster.local:8008
            - name: AUTH_BASE
-              value: http://matrix-authentication-service:8080
+              value: http://matrix-authentication-service.comms.svc.cluster.local:8080
            - name: KB_DIR
              value: /kb
            - name: VM_URL
@ -76,27 +85,69 @@ spec:
            - name: ARIADNE_STATE_URL
              value: http://ariadne.maintenance.svc.cluster.local/api/internal/cluster/state
            - name: BOT_USER
-              value: atlasbot
+              value: atlas-smart
+            - name: BOT_USER_QUICK
+              value: atlas-quick
+            - name: BOT_USER_SMART
+              value: atlas-smart
+            - name: BOT_USER_GENIUS
+              value: atlas-genius
            - name: BOT_MENTIONS
-              value: atlasbot,aatlasbot,atlas_quick,atlas_smart
+              value: atlas-quick,atlas-smart,atlas-genius
            - name: OLLAMA_URL
              value: http://ollama.ai.svc.cluster.local:11434
            - name: OLLAMA_MODEL
-              value: qwen2.5:14b-instruct
-            - name: ATLASBOT_MODEL_FAST
              value: qwen2.5:14b-instruct-q4_0
-            - name: ATLASBOT_MODEL_DEEP
-              value: qwen2.5:14b-instruct
+            - name: ATLASBOT_MODEL_FAST
+              value: qwen2.5-coder:7b-instruct-q4_0
+            - name: ATLASBOT_MODEL_SMART
+              value: qwen2.5:14b-instruct-q4_0
+            - name: ATLASBOT_MODEL_GENIUS
+              value: qwen2.5:14b-instruct-q4_0
            - name: OLLAMA_FALLBACK_MODEL
              value: qwen2.5:14b-instruct-q4_0
            - name: OLLAMA_TIMEOUT_SEC
              value: "600"
+            - name: OLLAMA_RETRIES
+              value: "0"
            - name: ATLASBOT_THINKING_INTERVAL_SEC
-              value: "120"
+              value: "30"
+            - name: ATLASBOT_QUICK_TIME_BUDGET_SEC
+              value: "15"
+            - name: ATLASBOT_SMART_TIME_BUDGET_SEC
+              value: "45"
+            - name: ATLASBOT_GENIUS_TIME_BUDGET_SEC
+              value: "180"
            - name: ATLASBOT_SNAPSHOT_TTL_SEC
              value: "30"
            - name: ATLASBOT_HTTP_PORT
              value: "8090"
+            - name: ATLASBOT_STATE_DB
+              value: /data/atlasbot_state.db
+            - name: ATLASBOT_QUEUE_ENABLED
+              value: "false"
+            - name: ATLASBOT_DEBUG_PIPELINE
+              value: "true"
+            - name: ATLASBOT_NATS_URL
+              value: nats://nats.nats.svc.cluster.local:4222
+            - name: ATLASBOT_NATS_STREAM
+              value: atlasbot
+            - name: ATLASBOT_NATS_SUBJECT
+              value: atlasbot.requests
+            - name: ATLASBOT_FAST_MAX_ANGLES
+              value: "2"
+            - name: ATLASBOT_SMART_MAX_ANGLES
+              value: "5"
+            - name: ATLASBOT_FAST_MAX_CANDIDATES
+              value: "2"
+            - name: ATLASBOT_SMART_MAX_CANDIDATES
+              value: "6"
+            - name: ATLASBOT_FAST_LLM_CALLS_MAX
+              value: "8"
+            - name: ATLASBOT_SMART_LLM_CALLS_MAX
+              value: "24"
+            - name: ATLASBOT_GENIUS_LLM_CALLS_MAX
+              value: "72"
          ports:
            - name: http
              containerPort: 8090
@ -108,19 +159,15 @@ spec:
              cpu: 500m
              memory: 512Mi
          volumeMounts:
-            - name: code
-              mountPath: /app/bot.py
-              subPath: bot.py
            - name: kb
              mountPath: /kb
              readOnly: true
            - name: vault-scripts
              mountPath: /vault/scripts
              readOnly: true
+            - name: atlasbot-state
+              mountPath: /data
      volumes:
-        - name: code
-          configMap:
-            name: atlasbot
        - name: kb
          configMap:
            name: atlas-kb
@ -139,5 +186,7 @@ spec:
                path: diagrams/atlas-http.mmd
        - name: vault-scripts
          configMap:
-            name: comms-vault-env
+            name: atlasbot-vault-env
            defaultMode: 0555
+        - name: atlasbot-state
+          emptyDir: {}
--- a/services/atlasbot/atlasbot-rbac.yaml
+++ b/services/atlasbot/atlasbot-rbac.yaml
@ -3,7 +3,9 @@ apiVersion: v1
 kind: ServiceAccount
 metadata:
  name: atlasbot
-  namespace: comms
+  namespace: ai
+imagePullSecrets:
+  - name: harbor-regcred
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
@ -43,5 +45,4 @@ roleRef:
 subjects:
  - kind: ServiceAccount
    name: atlasbot
-    namespace: comms
-
+    namespace: ai
--- a/services/atlasbot/atlasbot-service.yaml
+++ b/services/atlasbot/atlasbot-service.yaml
@ -2,7 +2,7 @@ apiVersion: v1
 kind: Service
 metadata:
  name: atlasbot
-  namespace: comms
+  namespace: ai
  labels:
    app: atlasbot
 spec:
--- a/services/atlasbot/image-automation.yaml
+++ b/services/atlasbot/image-automation.yaml
@ -0,0 +1,26 @@
+# services/atlasbot/image-automation.yaml
+apiVersion: image.toolkit.fluxcd.io/v1
+kind: ImageUpdateAutomation
+metadata:
+  name: atlasbot
+  namespace: ai
+spec:
+  interval: 1m0s
+  sourceRef:
+    kind: GitRepository
+    name: flux-system
+    namespace: flux-system
+  git:
+    checkout:
+      ref:
+        branch: feature/atlasbot
+    commit:
+      author:
+        name: flux-bot
+        email: ops@bstein.dev
+      messageTemplate: "chore(atlasbot): automated image update"
+    push:
+      branch: feature/atlasbot
+  update:
+    path: services/atlasbot
+    strategy: Setters
--- a/services/atlasbot/image.yaml
+++ b/services/atlasbot/image.yaml
@ -0,0 +1,23 @@
+# services/atlasbot/image.yaml
+apiVersion: image.toolkit.fluxcd.io/v1beta2
+kind: ImageRepository
+metadata:
+  name: atlasbot
+  namespace: ai
+spec:
+  image: registry.bstein.dev/bstein/atlasbot
+  interval: 1m0s
+  secretRef:
+    name: harbor-regcred
+---
+apiVersion: image.toolkit.fluxcd.io/v1beta2
+kind: ImagePolicy
+metadata:
+  name: atlasbot
+  namespace: ai
+spec:
+  imageRepositoryRef:
+    name: atlasbot
+  policy:
+    semver:
+      range: ">=0.1.0-0"
--- a/services/atlasbot/knowledge/INDEX.md
+++ b/services/atlasbot/knowledge/INDEX.md
@ -0,0 +1,22 @@
+Atlas Knowledge Base (KB)
+
+This folder is the source-of-truth “memory” for Atlas/Titan assistants (and for humans). It is designed to be:
+- Accurate (grounded in GitOps + read-only cluster tools)
+- Maintainable (small docs + deterministic generators)
+- Safe (no secrets; refer to Secret/Vault paths by name only)
+
+Layout
+- `knowledge/runbooks/`: human-written docs (short, chunkable Markdown).
+- `knowledge/catalog/`: generated machine-readable facts (YAML/JSON).
+- `knowledge/diagrams/`: generated Mermaid diagrams (`.mmd`) derived from the catalog.
+
+Regeneration
+- Update manifests/docs, then regenerate generated artifacts:
+  - `python scripts/knowledge_render_atlas.py --write`
+
+Authoring rules
+- Never include secret values. Prefer `secretRef` names or Vault paths like `kv/atlas/...`.
+- Prefer stable identifiers: Kubernetes `namespace/name`, DNS hostnames, Flux kustomization paths.
+- Keep each runbook small; one topic per file; use headings.
+- When in doubt, link to the exact file path in this repo that configures the behavior.
+
--- a/services/atlasbot/knowledge/catalog/atlas-summary.json
+++ b/services/atlasbot/knowledge/catalog/atlas-summary.json
@ -0,0 +1,8 @@
+{
+  "counts": {
+    "helmrelease_host_hints": 19,
+    "http_endpoints": 45,
+    "services": 47,
+    "workloads": 74
+  }
+}
--- a/services/atlasbot/knowledge/catalog/atlas.json
+++ b/services/atlasbot/knowledge/catalog/atlas.json
--- a/services/atlasbot/knowledge/catalog/metrics.json
+++ b/services/atlasbot/knowledge/catalog/metrics.json
--- a/services/atlasbot/knowledge/catalog/runbooks.json
+++ b/services/atlasbot/knowledge/catalog/runbooks.json
--- a/services/atlasbot/knowledge/diagrams/atlas-http.mmd
+++ b/services/atlasbot/knowledge/diagrams/atlas-http.mmd
@ -0,0 +1,234 @@
+flowchart LR
+  host_auth_bstein_dev["auth.bstein.dev"]
+  svc_sso_oauth2_proxy["sso/oauth2-proxy (Service)"]
+  host_auth_bstein_dev --> svc_sso_oauth2_proxy
+  wl_sso_oauth2_proxy["sso/oauth2-proxy (Deployment)"]
+  svc_sso_oauth2_proxy --> wl_sso_oauth2_proxy
+  host_bstein_dev["bstein.dev"]
+  svc_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Service)"]
+  host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_frontend
+  wl_bstein_dev_home_bstein_dev_home_frontend["bstein-dev-home/bstein-dev-home-frontend (Deployment)"]
+  svc_bstein_dev_home_bstein_dev_home_frontend --> wl_bstein_dev_home_bstein_dev_home_frontend
+  svc_comms_matrix_wellknown["comms/matrix-wellknown (Service)"]
+  host_bstein_dev --> svc_comms_matrix_wellknown
+  wl_comms_matrix_wellknown["comms/matrix-wellknown (Deployment)"]
+  svc_comms_matrix_wellknown --> wl_comms_matrix_wellknown
+  svc_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Service)"]
+  host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend
+  wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"]
+  svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend
+  host_budget_bstein_dev["budget.bstein.dev"]
+  svc_finance_actual_budget["finance/actual-budget (Service)"]
+  host_budget_bstein_dev --> svc_finance_actual_budget
+  wl_finance_actual_budget["finance/actual-budget (Deployment)"]
+  svc_finance_actual_budget --> wl_finance_actual_budget
+  host_call_live_bstein_dev["call.live.bstein.dev"]
+  svc_comms_element_call["comms/element-call (Service)"]
+  host_call_live_bstein_dev --> svc_comms_element_call
+  wl_comms_element_call["comms/element-call (Deployment)"]
+  svc_comms_element_call --> wl_comms_element_call
+  host_chat_ai_bstein_dev["chat.ai.bstein.dev"]
+  svc_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Service)"]
+  host_chat_ai_bstein_dev --> svc_bstein_dev_home_chat_ai_gateway
+  wl_bstein_dev_home_chat_ai_gateway["bstein-dev-home/chat-ai-gateway (Deployment)"]
+  svc_bstein_dev_home_chat_ai_gateway --> wl_bstein_dev_home_chat_ai_gateway
+  host_ci_bstein_dev["ci.bstein.dev"]
+  svc_jenkins_jenkins["jenkins/jenkins (Service)"]
+  host_ci_bstein_dev --> svc_jenkins_jenkins
+  wl_jenkins_jenkins["jenkins/jenkins (Deployment)"]
+  svc_jenkins_jenkins --> wl_jenkins_jenkins
+  host_cloud_bstein_dev["cloud.bstein.dev"]
+  svc_nextcloud_nextcloud["nextcloud/nextcloud (Service)"]
+  host_cloud_bstein_dev --> svc_nextcloud_nextcloud
+  wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"]
+  svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud
+  host_health_bstein_dev["health.bstein.dev"]
+  svc_health_wger["health/wger (Service)"]
+  host_health_bstein_dev --> svc_health_wger
+  wl_health_wger["health/wger (Deployment)"]
+  svc_health_wger --> wl_health_wger
+  host_kit_live_bstein_dev["kit.live.bstein.dev"]
+  svc_comms_livekit_token_service["comms/livekit-token-service (Service)"]
+  host_kit_live_bstein_dev --> svc_comms_livekit_token_service
+  wl_comms_livekit_token_service["comms/livekit-token-service (Deployment)"]
+  svc_comms_livekit_token_service --> wl_comms_livekit_token_service
+  svc_comms_livekit["comms/livekit (Service)"]
+  host_kit_live_bstein_dev --> svc_comms_livekit
+  wl_comms_livekit["comms/livekit (Deployment)"]
+  svc_comms_livekit --> wl_comms_livekit
+  host_live_bstein_dev["live.bstein.dev"]
+  host_live_bstein_dev --> svc_comms_matrix_wellknown
+  svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"]
+  host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
+  svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
+  host_live_bstein_dev --> svc_comms_matrix_guest_register
+  wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
+  svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
+  svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
+  host_live_bstein_dev --> svc_comms_matrix_authentication_service
+  wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
+  svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
+  host_logs_bstein_dev["logs.bstein.dev"]
+  svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"]
+  host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs
+  wl_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Deployment)"]
+  svc_logging_oauth2_proxy_logs --> wl_logging_oauth2_proxy_logs
+  host_longhorn_bstein_dev["longhorn.bstein.dev"]
+  svc_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Service)"]
+  host_longhorn_bstein_dev --> svc_longhorn_system_oauth2_proxy_longhorn
+  wl_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Deployment)"]
+  svc_longhorn_system_oauth2_proxy_longhorn --> wl_longhorn_system_oauth2_proxy_longhorn
+  host_mail_bstein_dev["mail.bstein.dev"]
+  svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"]
+  host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front
+  host_matrix_live_bstein_dev["matrix.live.bstein.dev"]
+  host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service
+  host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown
+  host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
+  host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register
+  host_monero_bstein_dev["monero.bstein.dev"]
+  svc_crypto_monerod["crypto/monerod (Service)"]
+  host_monero_bstein_dev --> svc_crypto_monerod
+  wl_crypto_monerod["crypto/monerod (Deployment)"]
+  svc_crypto_monerod --> wl_crypto_monerod
+  host_money_bstein_dev["money.bstein.dev"]
+  svc_finance_firefly["finance/firefly (Service)"]
+  host_money_bstein_dev --> svc_finance_firefly
+  wl_finance_firefly["finance/firefly (Deployment)"]
+  svc_finance_firefly --> wl_finance_firefly
+  host_notes_bstein_dev["notes.bstein.dev"]
+  svc_outline_outline["outline/outline (Service)"]
+  host_notes_bstein_dev --> svc_outline_outline
+  wl_outline_outline["outline/outline (Deployment)"]
+  svc_outline_outline --> wl_outline_outline
+  host_office_bstein_dev["office.bstein.dev"]
+  svc_nextcloud_collabora["nextcloud/collabora (Service)"]
+  host_office_bstein_dev --> svc_nextcloud_collabora
+  wl_nextcloud_collabora["nextcloud/collabora (Deployment)"]
+  svc_nextcloud_collabora --> wl_nextcloud_collabora
+  host_pegasus_bstein_dev["pegasus.bstein.dev"]
+  svc_jellyfin_pegasus["jellyfin/pegasus (Service)"]
+  host_pegasus_bstein_dev --> svc_jellyfin_pegasus
+  wl_jellyfin_pegasus["jellyfin/pegasus (Deployment)"]
+  svc_jellyfin_pegasus --> wl_jellyfin_pegasus
+  host_scm_bstein_dev["scm.bstein.dev"]
+  svc_gitea_gitea["gitea/gitea (Service)"]
+  host_scm_bstein_dev --> svc_gitea_gitea
+  wl_gitea_gitea["gitea/gitea (Deployment)"]
+  svc_gitea_gitea --> wl_gitea_gitea
+  host_secret_bstein_dev["secret.bstein.dev"]
+  svc_vault_vault["vault/vault (Service)"]
+  host_secret_bstein_dev --> svc_vault_vault
+  wl_vault_vault["vault/vault (StatefulSet)"]
+  svc_vault_vault --> wl_vault_vault
+  host_sso_bstein_dev["sso.bstein.dev"]
+  svc_sso_keycloak["sso/keycloak (Service)"]
+  host_sso_bstein_dev --> svc_sso_keycloak
+  wl_sso_keycloak["sso/keycloak (Deployment)"]
+  svc_sso_keycloak --> wl_sso_keycloak
+  host_stream_bstein_dev["stream.bstein.dev"]
+  svc_jellyfin_jellyfin["jellyfin/jellyfin (Service)"]
+  host_stream_bstein_dev --> svc_jellyfin_jellyfin
+  wl_jellyfin_jellyfin["jellyfin/jellyfin (Deployment)"]
+  svc_jellyfin_jellyfin --> wl_jellyfin_jellyfin
+  host_tasks_bstein_dev["tasks.bstein.dev"]
+  svc_planka_planka["planka/planka (Service)"]
+  host_tasks_bstein_dev --> svc_planka_planka
+  wl_planka_planka["planka/planka (Deployment)"]
+  svc_planka_planka --> wl_planka_planka
+  host_vault_bstein_dev["vault.bstein.dev"]
+  svc_vaultwarden_vaultwarden_service["vaultwarden/vaultwarden-service (Service)"]
+  host_vault_bstein_dev --> svc_vaultwarden_vaultwarden_service
+  wl_vaultwarden_vaultwarden["vaultwarden/vaultwarden (Deployment)"]
+  svc_vaultwarden_vaultwarden_service --> wl_vaultwarden_vaultwarden
+
+  subgraph bstein_dev_home[bstein-dev-home]
+    svc_bstein_dev_home_bstein_dev_home_frontend
+    wl_bstein_dev_home_bstein_dev_home_frontend
+    svc_bstein_dev_home_bstein_dev_home_backend
+    wl_bstein_dev_home_bstein_dev_home_backend
+    svc_bstein_dev_home_chat_ai_gateway
+    wl_bstein_dev_home_chat_ai_gateway
+  end
+  subgraph comms[comms]
+    svc_comms_matrix_wellknown
+    wl_comms_matrix_wellknown
+    svc_comms_element_call
+    wl_comms_element_call
+    svc_comms_livekit_token_service
+    wl_comms_livekit_token_service
+    svc_comms_livekit
+    wl_comms_livekit
+    svc_comms_othrys_synapse_matrix_synapse
+    svc_comms_matrix_guest_register
+    wl_comms_matrix_guest_register
+    svc_comms_matrix_authentication_service
+    wl_comms_matrix_authentication_service
+  end
+  subgraph crypto[crypto]
+    svc_crypto_monerod
+    wl_crypto_monerod
+  end
+  subgraph finance[finance]
+    svc_finance_actual_budget
+    wl_finance_actual_budget
+    svc_finance_firefly
+    wl_finance_firefly
+  end
+  subgraph gitea[gitea]
+    svc_gitea_gitea
+    wl_gitea_gitea
+  end
+  subgraph health[health]
+    svc_health_wger
+    wl_health_wger
+  end
+  subgraph jellyfin[jellyfin]
+    svc_jellyfin_pegasus
+    wl_jellyfin_pegasus
+    svc_jellyfin_jellyfin
+    wl_jellyfin_jellyfin
+  end
+  subgraph jenkins[jenkins]
+    svc_jenkins_jenkins
+    wl_jenkins_jenkins
+  end
+  subgraph logging[logging]
+    svc_logging_oauth2_proxy_logs
+    wl_logging_oauth2_proxy_logs
+  end
+  subgraph longhorn_system[longhorn-system]
+    svc_longhorn_system_oauth2_proxy_longhorn
+    wl_longhorn_system_oauth2_proxy_longhorn
+  end
+  subgraph mailu_mailserver[mailu-mailserver]
+    svc_mailu_mailserver_mailu_front
+  end
+  subgraph nextcloud[nextcloud]
+    svc_nextcloud_nextcloud
+    wl_nextcloud_nextcloud
+    svc_nextcloud_collabora
+    wl_nextcloud_collabora
+  end
+  subgraph outline[outline]
+    svc_outline_outline
+    wl_outline_outline
+  end
+  subgraph planka[planka]
+    svc_planka_planka
+    wl_planka_planka
+  end
+  subgraph sso[sso]
+    svc_sso_oauth2_proxy
+    wl_sso_oauth2_proxy
+    svc_sso_keycloak
+    wl_sso_keycloak
+  end
+  subgraph vault[vault]
+    svc_vault_vault
+    wl_vault_vault
+  end
+  subgraph vaultwarden[vaultwarden]
+    svc_vaultwarden_vaultwarden_service
+    wl_vaultwarden_vaultwarden
+  end
--- a/services/atlasbot/kustomization.yaml
+++ b/services/atlasbot/kustomization.yaml
@ -0,0 +1,29 @@
+# services/atlasbot/kustomization.yaml
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+namespace: ai  # namespace kustomize sets on the resources built from this directory
+resources:
+  - atlasbot-deployment.yaml
+  - atlasbot-service.yaml
+  - atlasbot-rbac.yaml
+  - secretproviderclass.yaml
+  - vault-sync-deployment.yaml
+  - image.yaml
+  - image-automation.yaml
+images:
+  - name: registry.bstein.dev/bstein/atlasbot  # the tag below carries a Flux image-policy setter marker; keep that trailing comment intact
+    newTag: 0.1.2-106 # {"$imagepolicy": "ai:atlasbot:tag"}
+configMapGenerator:
+  - name: atlasbot-vault-env  # ships the env-export script as a ConfigMap
+    files:
+      - atlasbot_vault_env.sh=scripts/atlasbot_vault_env.sh
+    options:
+      disableNameSuffixHash: true  # keep the plain name, without kustomize's content-hash suffix
+  - name: atlas-kb  # knowledge-base files; no disableNameSuffixHash, so the generated name gets a hash suffix
+    files:
+      - INDEX.md=knowledge/INDEX.md
+      - atlas.json=knowledge/catalog/atlas.json
+      - atlas-summary.json=knowledge/catalog/atlas-summary.json
+      - metrics.json=knowledge/catalog/metrics.json
+      - runbooks.json=knowledge/catalog/runbooks.json
+      - atlas-http.mmd=knowledge/diagrams/atlas-http.mmd
--- a/services/atlasbot/scripts/atlasbot_vault_env.sh
+++ b/services/atlasbot/scripts/atlasbot_vault_env.sh
@ -0,0 +1,44 @@
+#!/usr/bin/env sh
+set -eu
+# Exports the secrets stored as files under vault_dir as environment variables.
+vault_dir="/vault/secrets"
+# read_secret NAME: print ${vault_dir}/NAME with every CR and LF character removed.
+read_secret() {
+  tr -d '\r\n' < "${vault_dir}/$1"
+}
+# read_optional NAME: same as read_secret, but print nothing when NAME is not an existing regular file.
+read_optional() {
+  if [ -f "${vault_dir}/$1" ]; then
+    tr -d '\r\n' < "${vault_dir}/$1"
+  else
+    printf ''
+  fi
+}
+# NOTE(review): `export VAR="$(cmd)"` discards cmd's exit status, so a missing read_secret file likely yields an empty value instead of aborting under `set -e` — confirm this is intended.
+export TURN_STATIC_AUTH_SECRET="$(read_secret turn-secret)"
+export TURN_PASSWORD="${TURN_STATIC_AUTH_SECRET}"
+
+export LIVEKIT_API_SECRET="$(read_secret livekit-primary)"
+export LIVEKIT_SECRET="${LIVEKIT_API_SECRET}"
+# Tiered bot passwords: SMART falls back to BOT_PASS and GENIUS falls back to SMART; QUICK has no fallback.
+export BOT_PASS="$(read_secret bot-pass)"
+export BOT_PASS_QUICK="$(read_optional bot-quick-pass)"
+export BOT_PASS_SMART="$(read_optional bot-smart-pass)"
+export BOT_PASS_GENIUS="$(read_optional bot-genius-pass)"
+if [ -z "${BOT_PASS_SMART}" ]; then
+  export BOT_PASS_SMART="${BOT_PASS}"
+fi
+if [ -z "${BOT_PASS_GENIUS}" ]; then
+  export BOT_PASS_GENIUS="${BOT_PASS_SMART}"
+fi
+export SEEDER_PASS="$(read_secret seeder-pass)"
+
+export CHAT_API_KEY="$(read_secret chat-matrix)"
+export CHAT_API_HOMEPAGE="$(read_secret chat-homepage)"
+# MAS_ADMIN_CLIENT_SECRET_FILE is exported as a file path, not as the file's contents.
+export MAS_ADMIN_CLIENT_SECRET_FILE="${vault_dir}/mas-admin-secret"
+export PGPASSWORD="$(read_secret synapse-db-pass)"
+
+export MAS_DB_PASSWORD="$(read_secret mas-db-pass)"
+export MATRIX_SHARED_SECRET="$(read_secret mas-matrix-shared)"
+export KEYCLOAK_CLIENT_SECRET="$(read_secret mas-kc-secret)"
--- a/services/atlasbot/secretproviderclass.yaml
+++ b/services/atlasbot/secretproviderclass.yaml
@ -0,0 +1,21 @@
+# services/atlasbot/secretproviderclass.yaml
+apiVersion: secrets-store.csi.x-k8s.io/v1
+kind: SecretProviderClass
+metadata:
+  name: atlasbot-vault
+  namespace: ai
+spec:
+  provider: vault
+  parameters:
+    vaultAddress: "http://vault.vault.svc.cluster.local:8200"
+    roleName: "ai"  # Vault role the provider logs in with
+    objects: |
+      - objectName: "harbor-pull__dockerconfigjson"
+        secretPath: "kv/data/atlas/shared/harbor-pull"
+        secretKey: "dockerconfigjson"
+  secretObjects:  # mirrors the fetched object into a Kubernetes Secret
+    - secretName: harbor-regcred
+      type: kubernetes.io/dockerconfigjson
+      data:
+        - objectName: harbor-pull__dockerconfigjson  # refers to the objectName declared in parameters.objects
+          key: .dockerconfigjson
--- a/services/atlasbot/vault-sync-deployment.yaml
+++ b/services/atlasbot/vault-sync-deployment.yaml
@ -0,0 +1,34 @@
+# services/atlasbot/vault-sync-deployment.yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: atlasbot-vault-sync
+  namespace: ai
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: atlasbot-vault-sync
+  template:
+    metadata:
+      labels:
+        app: atlasbot-vault-sync
+    spec:
+      serviceAccountName: atlasbot
+      containers:
+        - name: sync
+          image: alpine:3.20
+          command: ["/bin/sh", "-c"]
+          args:
+            - "sleep infinity"  # idles forever; presumably the pod exists only to keep the CSI volume mounted — confirm
+          volumeMounts:
+            - name: vault-secrets
+              mountPath: /vault/secrets
+              readOnly: true
+      volumes:
+        - name: vault-secrets
+          csi:
+            driver: secrets-store.csi.k8s.io
+            readOnly: true
+            volumeAttributes:
+              secretProviderClass: atlasbot-vault  # SecretProviderClass of this name in the pod's namespace
--- a/services/bstein-dev-home/backend-deployment.yaml
+++ b/services/bstein-dev-home/backend-deployment.yaml
@ -68,7 +68,13 @@ spec:
            - name: AI_CHAT_TIMEOUT_SEC
              value: "480"
            - name: AI_ATLASBOT_ENDPOINT
-              value: http://atlasbot.comms.svc.cluster.local:8090/v1/answer
+              value: http://atlasbot.ai.svc.cluster.local:8090/v1/answer
+            - name: AI_ATLASBOT_MODEL_FAST
+              value: qwen2.5-coder:7b-instruct-q4_0
+            - name: AI_ATLASBOT_MODEL_SMART
+              value: qwen2.5:14b-instruct
+            - name: AI_ATLASBOT_MODEL_GENIUS
+              value: qwen2.5:14b-instruct
            - name: AI_ATLASBOT_TIMEOUT_SEC
              value: "30"
            - name: AI_NODE_NAME
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@ -20,9 +20,9 @@ resources:
  - ingress.yaml
 images:
  - name: registry.bstein.dev/bstein/bstein-dev-home-frontend
-    newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
+    newTag: 0.1.1-120 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
  - name: registry.bstein.dev/bstein/bstein-dev-home-backend
-    newTag: 0.1.1-162 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
+    newTag: 0.1.1-123 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
 configMapGenerator:
  - name: chat-ai-gateway
    namespace: bstein-dev-home
--- a/services/comms/kustomization.yaml
+++ b/services/comms/kustomization.yaml
@ -13,10 +13,7 @@ resources:
  - element-call-deployment.yaml
  - guest-register-deployment.yaml
  - guest-register-service.yaml
-  - atlasbot-deployment.yaml
-  - atlasbot-service.yaml
  - wellknown.yaml
-  - atlasbot-rbac.yaml
  - mas-secrets-ensure-rbac.yaml
  - comms-secrets-ensure-rbac.yaml
  - mas-db-ensure-rbac.yaml
@ -43,7 +40,6 @@ resources:
  - livekit-ingress.yaml
  - livekit-middlewares.yaml
  - matrix-ingress.yaml
-
 configMapGenerator:
  - name: comms-vault-env
    files:
@ -60,21 +56,8 @@ configMapGenerator:
      - server.py=scripts/guest-register/server.py
    options:
      disableNameSuffixHash: true
-  - name: atlasbot
-    files:
-      - bot.py=scripts/atlasbot/bot.py
-    options:
-      disableNameSuffixHash: true
  - name: othrys-element-host-config
    files:
      - 20-host-config.sh=scripts/element-host-config.sh
    options:
      disableNameSuffixHash: true
-  - name: atlas-kb
-    files:
-      - INDEX.md=knowledge/INDEX.md
-      - atlas.json=knowledge/catalog/atlas.json
-      - atlas-summary.json=knowledge/catalog/atlas-summary.json
-      - metrics.json=knowledge/catalog/metrics.json
-      - runbooks.json=knowledge/catalog/runbooks.json
-      - atlas-http.mmd=knowledge/diagrams/atlas-http.mmd
--- a/services/comms/matrix-ingress.yaml
+++ b/services/comms/matrix-ingress.yaml
@ -7,6 +7,7 @@ metadata:
    kubernetes.io/ingress.class: traefik
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    traefik.ingress.kubernetes.io/router.tls: "true"
+    traefik.ingress.kubernetes.io/router.priority: "120"
    cert-manager.io/cluster-issuer: letsencrypt
 spec:
  ingressClassName: traefik
@ -43,6 +44,13 @@ spec:
                name: matrix-authentication-service
                port:
                  number: 8080
+          - path: /_matrix/client/r0/login
+            pathType: Prefix
+            backend:
+              service:
+                name: matrix-authentication-service
+                port:
+                  number: 8080
          - path: /_matrix/client/v3/logout
            pathType: Exact
            backend:
@ -57,6 +65,41 @@ spec:
                name: matrix-authentication-service
                port:
                  number: 8080
+          - path: /account
+            pathType: Prefix
+            backend:
+              service:
+                name: matrix-authentication-service
+                port:
+                  number: 8080
+          - path: /authorize
+            pathType: Prefix
+            backend:
+              service:
+                name: matrix-authentication-service
+                port:
+                  number: 8080
+          - path: /oauth2
+            pathType: Prefix
+            backend:
+              service:
+                name: matrix-authentication-service
+                port:
+                  number: 8080
+          - path: /.well-known/openid-configuration
+            pathType: Exact
+            backend:
+              service:
+                name: matrix-authentication-service
+                port:
+                  number: 8080
+          - path: /.well-known/oauth-authorization-server
+            pathType: Exact
+            backend:
+              service:
+                name: matrix-authentication-service
+                port:
+                  number: 8080
          - path: /_matrix
            pathType: Prefix
            backend:
@ -102,6 +145,13 @@ spec:
                name: matrix-authentication-service
                port:
                  number: 8080
+          - path: /_matrix/client/r0/login
+            pathType: Prefix
+            backend:
+              service:
+                name: matrix-authentication-service
+                port:
+                  number: 8080
          - path: /_matrix/client/v3/logout
            pathType: Exact
            backend:
--- a/services/comms/oneoffs/comms-secrets-ensure-job.yaml
+++ b/services/comms/oneoffs/comms-secrets-ensure-job.yaml
@ -1,12 +1,12 @@
 # services/comms/oneoffs/comms-secrets-ensure-job.yaml
-# One-off job for comms/comms-secrets-ensure-7.
-# Purpose: comms secrets ensure 7 (see container args/env in this file).
+# One-off job for comms/comms-secrets-ensure-8.
+# Purpose: comms secrets ensure 8 (see container args/env in this file).
 # Run by setting spec.suspend to false, reconcile, then set it back to true.
 # Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: comms-secrets-ensure-7
+  name: comms-secrets-ensure-8
  namespace: comms
 spec:
  suspend: true
@ -87,6 +87,9 @@ spec:
              ensure_key "comms/synapse-redis" "redis-password" >/dev/null
              ensure_key "comms/synapse-macaroon" "macaroon_secret_key" >/dev/null
              ensure_key "comms/atlasbot-credentials-runtime" "bot-password" >/dev/null
+              ensure_key "comms/atlasbot-credentials-runtime" "bot-quick-password" >/dev/null
+              ensure_key "comms/atlasbot-credentials-runtime" "bot-smart-password" >/dev/null
+              ensure_key "comms/atlasbot-credentials-runtime" "bot-genius-password" >/dev/null
              ensure_key "comms/atlasbot-credentials-runtime" "seeder-password" >/dev/null

              SYN_PASS="$(ensure_key "comms/synapse-db" "POSTGRES_PASSWORD")"
--- a/services/comms/oneoffs/mas-local-users-ensure-job.yaml
+++ b/services/comms/oneoffs/mas-local-users-ensure-job.yaml
@ -1,12 +1,12 @@
 # services/comms/oneoffs/mas-local-users-ensure-job.yaml
-# One-off job for comms/mas-local-users-ensure-18.
+# One-off job for comms/mas-local-users-ensure-19.
-# Purpose: mas local users ensure 18 (see container args/env in this file).
+# Purpose: mas local users ensure 19 (see container args/env in this file).
 # Run by setting spec.suspend to false, reconcile, then set it back to true.
 # Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: mas-local-users-ensure-18
+  name: mas-local-users-ensure-19
  namespace: comms
 spec:
  suspend: true
@ -27,6 +27,12 @@ spec:
        vault.hashicorp.com/agent-inject-secret-bot-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
        vault.hashicorp.com/agent-inject-template-bot-pass: |
          {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-password" }}{{- end -}}
+        vault.hashicorp.com/agent-inject-secret-bot-quick-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
+        vault.hashicorp.com/agent-inject-template-bot-quick-pass: |
+          {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-quick-password" }}{{- end -}}
+        vault.hashicorp.com/agent-inject-secret-bot-smart-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
+        vault.hashicorp.com/agent-inject-template-bot-smart-pass: |
+          {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "bot-smart-password" }}{{- end -}}
        vault.hashicorp.com/agent-inject-secret-seeder-pass: "kv/data/atlas/comms/atlasbot-credentials-runtime"
        vault.hashicorp.com/agent-inject-template-seeder-pass: |
          {{- with secret "kv/data/atlas/comms/atlasbot-credentials-runtime" -}}{{ index .Data.data "seeder-password" }}{{- end -}}
@ -92,7 +98,13 @@ spec:
            - name: SEEDER_USER
              value: othrys-seeder
            - name: BOT_USER
-              value: atlasbot
+              value: atlas-smart
+            - name: BOT_USER_QUICK
+              value: atlas-quick
+            - name: BOT_USER_SMART
+              value: atlas-smart
+            - name: BOT_USER_GENIUS
+              value: atlas-genius
          command:
            - /bin/sh
            - -c
@ -225,11 +237,27 @@ spec:
                      },
                      timeout=30,
                  )
+                  if r.status_code == 429:
+                      return False
                  if r.status_code != 200:
                      raise RuntimeError(f"login failed for {username}: {r.status_code} {r.text}")
+                  return True

              wait_for_service(MAS_ADMIN_API_BASE)
              token = admin_token()
+              bot_quick = os.environ.get("BOT_USER_QUICK", "")
+              bot_smart = os.environ.get("BOT_USER_SMART", "")
+              bot_genius = os.environ.get("BOT_USER_GENIUS", "")
+              bot_quick_pass = os.environ.get("BOT_PASS_QUICK", "")
+              bot_smart_pass = os.environ.get("BOT_PASS_SMART", "")
+              bot_genius_pass = os.environ.get("BOT_PASS_GENIUS", "") or bot_smart_pass
+
              ensure_user(token, os.environ["SEEDER_USER"], os.environ["SEEDER_PASS"])
              ensure_user(token, os.environ["BOT_USER"], os.environ["BOT_PASS"])
+              if bot_quick and bot_quick_pass:
+                  ensure_user(token, bot_quick, bot_quick_pass)
+              if bot_smart and bot_smart_pass:
+                  ensure_user(token, bot_smart, bot_smart_pass)
+              if bot_genius and bot_genius_pass:
+                  ensure_user(token, bot_genius, bot_genius_pass)
              PY
--- a/services/comms/oneoffs/synapse-admin-ensure-job.yaml
+++ b/services/comms/oneoffs/synapse-admin-ensure-job.yaml
@ -1,15 +1,15 @@
 # services/comms/oneoffs/synapse-admin-ensure-job.yaml
-# One-off job for comms/synapse-admin-ensure-3.
-# Purpose: synapse admin ensure 3 (see container args/env in this file).
+# One-off job for comms/synapse-admin-ensure-15.
+# Purpose: synapse admin ensure 15 (see container args/env in this file).
 # Run by setting spec.suspend to false, reconcile, then set it back to true.
 # Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: synapse-admin-ensure-3
+  name: synapse-admin-ensure-15
  namespace: comms
 spec:
-  suspend: true
+  suspend: false
  backoffLimit: 0
  ttlSecondsAfterFinished: 3600
  template:
@ -32,7 +32,8 @@ spec:
                    values: ["arm64"]
      containers:
        - name: ensure
-          image: python:3.11-slim
+          image: python:3.12-slim
+          imagePullPolicy: Always
          env:
            - name: VAULT_ADDR
              value: http://vault.vault.svc.cluster.local:8200
@ -45,22 +46,20 @@ spec:
            - -c
            - |
              set -euo pipefail
-              pip install --no-cache-dir psycopg2-binary bcrypt
+              python -m pip install --no-cache-dir psycopg2-binary
              python - <<'PY'
              import json
              import os
-              import secrets
-              import string
-              import time
              import urllib.error
+              import urllib.parse
              import urllib.request

-              import bcrypt
              import psycopg2

              VAULT_ADDR = os.environ.get("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200").rstrip("/")
              VAULT_ROLE = os.environ.get("VAULT_ROLE", "comms-secrets")
              SA_TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token"
+              SYNAPSE_ADMIN_URL = os.environ.get("SYNAPSE_ADMIN_URL", "").rstrip("/")
              PGHOST = "postgres-service.postgres.svc.cluster.local"
              PGPORT = 5432
              PGDATABASE = "synapse"
@ -113,48 +112,15 @@ spec:
                  with urllib.request.urlopen(req, timeout=30) as resp:
                      resp.read()

-              def random_password(length: int = 32) -> str:
-                  alphabet = string.ascii_letters + string.digits
-                  return "".join(secrets.choice(alphabet) for _ in range(length))
-
              def ensure_admin_creds(token: str) -> dict:
                  data = vault_get(token, "comms/synapse-admin")
-                  username = (data.get("username") or "").strip() or "synapse-admin"
-                  password = (data.get("password") or "").strip()
-                  if not password:
-                      password = random_password()
+                  username = "othrys-seeder"
+                  if data.get("username") != username:
                      data["username"] = username
-                  data["password"] = password
+                      data.pop("access_token", None)
                  vault_put(token, "comms/synapse-admin", data)
                  return data

-              def ensure_user(cur, cols, user_id, password, admin):
-                  now_ms = int(time.time() * 1000)
-                  values = {
-                      "name": user_id,
-                      "password_hash": bcrypt.hashpw(password.encode(), bcrypt.gensalt()).decode(),
-                      "creation_ts": now_ms,
-                  }
-
-                  def add_flag(name, flag):
-                      if name not in cols:
-                          return
-                      if cols[name]["type"] in ("smallint", "integer"):
-                          values[name] = int(flag)
-                      else:
-                          values[name] = bool(flag)
-
-                  add_flag("admin", admin)
-                  add_flag("deactivated", False)
-                  add_flag("shadow_banned", False)
-                  add_flag("is_guest", False)
-
-                  columns = list(values.keys())
-                  placeholders = ", ".join(["%s"] * len(columns))
-                  updates = ", ".join([f"{col}=EXCLUDED.{col}" for col in columns if col != "name"])
-                  query = f"INSERT INTO users ({', '.join(columns)}) VALUES ({placeholders}) ON CONFLICT (name) DO UPDATE SET {updates};"
-                  cur.execute(query, [values[c] for c in columns])
-
              def get_cols(cur):
                  cur.execute(
                      """
@ -172,30 +138,40 @@ spec:
                      }
                  return cols

-              def ensure_access_token(cur, user_id, token_value):
-                  cur.execute("SELECT COALESCE(MAX(id), 0) + 1 FROM access_tokens")
-                  token_id = cur.fetchone()[0]
-                  cur.execute(
-                      """
-                      INSERT INTO access_tokens (id, user_id, token, device_id, valid_until_ms)
-                      VALUES (%s, %s, %s, %s, NULL)
-                      ON CONFLICT (token) DO NOTHING
-                      """,
-                      (token_id, user_id, token_value, "ariadne-admin"),
-                  )
+              def admin_token_valid(token: str, user_id: str) -> bool:
+                  if not token or not SYNAPSE_ADMIN_URL:
+                      return False
+                  encoded = urllib.parse.quote(user_id, safe="")
+                  url = f"{SYNAPSE_ADMIN_URL}/_synapse/admin/v2/users/{encoded}"
+                  req = urllib.request.Request(url, headers={"Authorization": f"Bearer {token}"})
+                  try:
+                      with urllib.request.urlopen(req, timeout=30) as resp:
+                          resp.read()
+                      return True
+                  except urllib.error.HTTPError as exc:
+                      if exc.code == 404:
+                          return True
+                      if exc.code in (401, 403):
+                          return False
+                      raise

              vault_token = vault_login()
              admin_data = ensure_admin_creds(vault_token)
-              if admin_data.get("access_token"):
-                  log("synapse admin token already present")
+              user_id = f"@{admin_data['username']}:live.bstein.dev"
+              existing_token = admin_data.get("access_token")
+              if existing_token and admin_token_valid(existing_token, user_id):
+                  log("synapse admin token already present and valid")
                  raise SystemExit(0)
+              if existing_token:
+                  log("synapse admin token invalid; rotating")
+                  admin_data.pop("access_token", None)
+                  vault_put(vault_token, "comms/synapse-admin", admin_data)

              synapse_db = vault_get(vault_token, "comms/synapse-db")
              pg_password = synapse_db.get("POSTGRES_PASSWORD")
              if not pg_password:
                  raise RuntimeError("synapse db password missing")

-              user_id = f"@{admin_data['username']}:live.bstein.dev"
              conn = psycopg2.connect(
                  host=PGHOST,
                  port=PGPORT,
@ -203,17 +179,34 @@ spec:
                  user=PGUSER,
                  password=pg_password,
              )
-              token_value = secrets.token_urlsafe(32)
              try:
                  with conn:
                      with conn.cursor() as cur:
                          cols = get_cols(cur)
-                          ensure_user(cur, cols, user_id, admin_data["password"], True)
-                          ensure_access_token(cur, user_id, token_value)
+                          if "admin" not in cols:
+                              raise RuntimeError("users.admin column missing")
+                          cur.execute(
+                              "UPDATE users SET admin = TRUE WHERE name = %s",
+                              (user_id,),
+                          )
+                          cur.execute(
+                              """
+                              SELECT token FROM access_tokens
+                              WHERE user_id = %s AND valid_until_ms IS NULL
+                              ORDER BY id DESC LIMIT 1
+                              """,
+                              (user_id,),
+                          )
+                          row = cur.fetchone()
+                          if not row:
+                              raise RuntimeError(f"no access token found for {user_id}")
+                          token_value = row[0]
              finally:
                  conn.close()

              admin_data["access_token"] = token_value
              vault_put(vault_token, "comms/synapse-admin", admin_data)
+              if not admin_token_valid(token_value, user_id):
+                  raise RuntimeError("synapse admin token validation failed")
              log("synapse admin token stored")
              PY
--- a/services/comms/oneoffs/synapse-user-seed-job.yaml
+++ b/services/comms/oneoffs/synapse-user-seed-job.yaml
@ -82,8 +82,6 @@ spec:
              value: synapse
            - name: SEEDER_USER
              value: othrys-seeder
-            - name: BOT_USER
-              value: atlasbot
          command:
            - /bin/sh
            - -c
@ -141,10 +139,8 @@ spec:
                  cur.execute(query, [values[c] for c in columns])

              seeder_user = os.environ["SEEDER_USER"]
-              bot_user = os.environ["BOT_USER"]
              server = "live.bstein.dev"
              seeder_id = f"@{seeder_user}:{server}"
-              bot_id = f"@{bot_user}:{server}"

              conn = psycopg2.connect(
                  host=os.environ["PGHOST"],
@ -158,7 +154,6 @@ spec:
                      with conn.cursor() as cur:
                          cols = get_cols(cur)
                          upsert_user(cur, cols, seeder_id, os.environ["SEEDER_PASS"], True)
-                          upsert_user(cur, cols, bot_id, os.environ["BOT_PASS"], False)
              finally:
                  conn.close()
              PY
--- a/services/comms/reset-othrys-room-job.yaml
+++ b/services/comms/reset-othrys-room-job.yaml
@ -76,7 +76,7 @@ spec:
                - name: SEEDER_USER
                  value: othrys-seeder
                - name: BOT_USER
-                  value: atlasbot
+                  value: atlas-smart
              command:
                - /bin/sh
                - -c
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@ -11,14 +11,21 @@ from urllib import error, parse, request

 BASE = os.environ.get("MATRIX_BASE", "http://othrys-synapse-matrix-synapse:8008")
 AUTH_BASE = os.environ.get("AUTH_BASE", "http://matrix-authentication-service:8080")
-USER = os.environ["BOT_USER"]
-PASSWORD = os.environ["BOT_PASS"]
+BOT_USER = os.environ["BOT_USER"]
+BOT_PASS = os.environ["BOT_PASS"]
+BOT_USER_QUICK = os.environ.get("BOT_USER_QUICK", "").strip()
+BOT_PASS_QUICK = os.environ.get("BOT_PASS_QUICK", "").strip()
+BOT_USER_SMART = os.environ.get("BOT_USER_SMART", "").strip()
+BOT_PASS_SMART = os.environ.get("BOT_PASS_SMART", "").strip()
+BOT_USER_GENIUS = os.environ.get("BOT_USER_GENIUS", "").strip()
+BOT_PASS_GENIUS = os.environ.get("BOT_PASS_GENIUS", "").strip()
 ROOM_ALIAS = "#othrys:live.bstein.dev"

 OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/")
 MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5:14b-instruct")
 MODEL_FAST = os.environ.get("ATLASBOT_MODEL_FAST", "")
-MODEL_DEEP = os.environ.get("ATLASBOT_MODEL_DEEP", "")
+MODEL_SMART = os.environ.get("ATLASBOT_MODEL_SMART", os.environ.get("ATLASBOT_MODEL_DEEP", "")).strip()
+MODEL_GENIUS = os.environ.get("ATLASBOT_MODEL_GENIUS", MODEL_SMART).strip()
 FALLBACK_MODEL = os.environ.get("OLLAMA_FALLBACK_MODEL", "")
 API_KEY = os.environ.get("CHAT_API_KEY", "")
 OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480"))
@ -31,7 +38,7 @@ VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitor
 ARIADNE_STATE_URL = os.environ.get("ARIADNE_STATE_URL", "")
 ARIADNE_STATE_TOKEN = os.environ.get("ARIADNE_STATE_TOKEN", "")

-BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{USER},atlas")
+BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{BOT_USER},atlas")
 SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev")

 MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500"))
@ -39,6 +46,9 @@ MAX_TOOL_CHARS = int(os.environ.get("ATLASBOT_MAX_TOOL_CHARS", "2500"))
 MAX_FACTS_CHARS = int(os.environ.get("ATLASBOT_MAX_FACTS_CHARS", "8000"))
 MAX_CONTEXT_CHARS = int(os.environ.get("ATLASBOT_MAX_CONTEXT_CHARS", "12000"))
 THINKING_INTERVAL_SEC = int(os.environ.get("ATLASBOT_THINKING_INTERVAL_SEC", "120"))
+QUICK_TIME_BUDGET_SEC = float(os.environ.get("ATLASBOT_QUICK_TIME_BUDGET_SEC", "15"))
+SMART_TIME_BUDGET_SEC = float(os.environ.get("ATLASBOT_SMART_TIME_BUDGET_SEC", "45"))
+GENIUS_TIME_BUDGET_SEC = float(os.environ.get("ATLASBOT_GENIUS_TIME_BUDGET_SEC", "180"))
 OLLAMA_RETRIES = int(os.environ.get("ATLASBOT_OLLAMA_RETRIES", "2"))
 OLLAMA_SERIALIZE = os.environ.get("ATLASBOT_OLLAMA_SERIALIZE", "true").lower() != "false"

@ -380,27 +390,104 @@ def _strip_bot_mention(text: str) -> str:
    return cleaned or text.strip()


-def _detect_mode_from_body(body: str, *, default: str = "deep") -> str:
+def _detect_mode_from_body(body: str, *, default: str = "smart") -> str:
    lower = normalize_query(body or "")
    if "atlas_quick" in lower or "atlas-quick" in lower:
        return "fast"
    if "atlas_smart" in lower or "atlas-smart" in lower:
-        return "deep"
+        return "smart"
+    if "atlas_genius" in lower or "atlas-genius" in lower:
+        return "genius"
    if lower.startswith("quick ") or lower.startswith("fast "):
        return "fast"
-    if lower.startswith("smart ") or lower.startswith("deep "):
-        return "deep"
+    if lower.startswith("smart "):
+        return "smart"
+    if lower.startswith("genius ") or lower.startswith("deep "):
+        return "genius"
    return default


+def _detect_mode(
+    content: dict[str, Any],
+    body: str,
+    *,
+    default: str = "smart",
+    account_user: str = "",
+) -> str:
+    """Pick the reply mode ("fast", "smart" or "genius") for one incoming message.
+
+    Precedence, highest first:
+      1. an ``m.mentions.user_ids`` entry naming one of the configured bot
+         accounts (quick, then smart, then genius, then BOT_USER as "smart");
+      2. the account whose sync loop received the event (``account_user``);
+      3. the result of ``_detect_mode_from_body(body, default=default)``.
+
+    Args:
+        content: The event content mapping; only ``m.mentions`` is read.
+        body: Message text handed to ``_detect_mode_from_body``.
+        default: Mode used when nothing else selects one.
+        account_user: User name/ID of the account handling this event.
+
+    Returns:
+        One of the mode strings listed above (or whatever
+        ``_detect_mode_from_body`` returns for case 3).
+    """
+    mode = _detect_mode_from_body(body, default=default)
+    tiers = (
+        (BOT_USER_QUICK, "fast"),
+        (BOT_USER_SMART, "smart"),
+        (BOT_USER_GENIUS, "genius"),
+    )
+    # Event content is sender-controlled: "m.mentions" may be missing, null or
+    # any other JSON type. Only dereference it when it really is a mapping,
+    # otherwise a malformed event would raise AttributeError here.
+    mentions = content.get("m.mentions") if isinstance(content, dict) else None
+    user_ids = mentions.get("user_ids") if isinstance(mentions, dict) else None
+    if isinstance(user_ids, list):
+        mentioned = {normalize_user_id(uid).lower() for uid in user_ids if isinstance(uid, str)}
+        for bot_user, bot_mode in tiers + ((BOT_USER, "smart"),):
+            if bot_user and normalize_user_id(bot_user).lower() in mentioned:
+                return bot_mode
+    if account_user:
+        # Compare case-insensitively, matching the mention check above.
+        own_id = normalize_user_id(account_user).lower()
+        for bot_user, bot_mode in tiers:
+            if bot_user and normalize_user_id(bot_user).lower() == own_id:
+                return bot_mode
+    return mode
+
+
 def _model_for_mode(mode: str) -> str:
    if mode == "fast" and MODEL_FAST:
        return MODEL_FAST
-    if mode == "deep" and MODEL_DEEP:
-        return MODEL_DEEP
+    if mode == "smart" and MODEL_SMART:
+        return MODEL_SMART
+    if mode == "genius" and MODEL_GENIUS:
+        return MODEL_GENIUS
+    if mode == "deep" and MODEL_SMART:
+        return MODEL_SMART
    return MODEL


+def _normalize_mode(mode: str) -> str:
+    """Map a user-supplied mode name onto "fast", "smart" or "genius".
+
+    Matching ignores surrounding whitespace and case. "quick" is an alias for
+    "fast" and "deep" an alias for "genius"; empty or unrecognized input
+    yields "smart".
+    """
+    aliases = {
+        "quick": "fast",
+        "fast": "fast",
+        "smart": "smart",
+        "genius": "genius",
+        "deep": "genius",
+    }
+    key = (mode or "").strip().lower()
+    return aliases.get(key, "smart")
+
+
+def _mode_time_budget_sec(mode: str) -> float:
+    """Return the configured time budget in seconds for ``mode``, at least 1.0.
+
+    The mode is normalized first; anything that is not "fast" or "genius"
+    uses the smart budget.
+    """
+    per_mode = {
+        "fast": QUICK_TIME_BUDGET_SEC,
+        "genius": GENIUS_TIME_BUDGET_SEC,
+    }
+    configured = per_mode.get(_normalize_mode(mode), SMART_TIME_BUDGET_SEC)
+    return max(1.0, configured)
+
+
+def _mode_ollama_timeout_sec(mode: str) -> float:
+    """Return the per-request Ollama timeout in seconds for ``mode``.
+
+    The timeout is the mode budget minus a per-mode margin, capped by
+    OLLAMA_TIMEOUT_SEC and raised to a per-mode floor. The result is finally
+    clamped to the mode budget so that a low configured budget cannot be
+    exceeded by the floor alone (previously a 5 s quick budget produced a 6 s
+    timeout). With the default budgets the values are unchanged.
+    """
+    normalized = _normalize_mode(mode)
+    budget = _mode_time_budget_sec(normalized)
+    # (floor, margin) per mode; unknown modes use the smart values.
+    # NOTE(review): the margin presumably leaves headroom for work around the
+    # LLM call - confirm the intent with the author.
+    floor, margin = {
+        "fast": (6.0, 2.0),
+        "genius": (20.0, 10.0),
+    }.get(normalized, (12.0, 5.0))
+    timeout = max(floor, min(budget - margin, OLLAMA_TIMEOUT_SEC))
+    return min(timeout, budget)
+
+
+def _mode_heartbeat_sec(mode: str) -> int:
+    """Return the interval in seconds between progress messages for ``mode``.
+
+    It is one third of the mode's time budget (truncated to an int, at least
+    5), capped by THINKING_INTERVAL_SEC, and never below 5.
+    """
+    third_of_budget = int(max(5.0, _mode_time_budget_sec(mode) / 3.0))
+    capped = min(THINKING_INTERVAL_SEC, third_of_budget)
+    return max(5, capped)
+
+
 # Matrix HTTP helper.
 def req(method: str, path: str, token: str | None = None, body=None, timeout=60, base: str | None = None):
    url = (base or BASE) + path
@ -416,12 +503,12 @@ def req(method: str, path: str, token: str | None = None, body=None, timeout=60,
        raw = resp.read()
        return json.loads(raw.decode()) if raw else {}

-def login() -> str:
-    login_user = normalize_user_id(USER)
+def login(user: str, password: str) -> str:
+    login_user = normalize_user_id(user)
    payload = {
        "type": "m.login.password",
        "identifier": {"type": "m.id.user", "user": login_user},
-        "password": PASSWORD,
+        "password": password,
    }
    res = req("POST", "/_matrix/client/v3/login", body=payload, base=AUTH_BASE)
    return res["access_token"]
@ -2628,6 +2715,11 @@ def _append_history_context(context: str, history_lines: list[str]) -> str:
    return combined


+def _merge_context_blocks(*blocks: str) -> str:
+    """Join the non-blank string blocks, each stripped, with one blank line between them.
+
+    Non-string arguments and strings that are empty after stripping are skipped.
+    """
+    kept: list[str] = []
+    for candidate in blocks:
+        if not isinstance(candidate, str):
+            continue
+        trimmed = candidate.strip()
+        if trimmed:
+            kept.append(trimmed)
+    return "\n\n".join(kept)
+
+
 class ThoughtState:
    def __init__(self, total_steps: int = 0):
        self._lock = threading.Lock()
@ -2985,6 +3077,7 @@ def _ollama_call_safe(
    fallback: str,
    system_override: str | None = None,
    model: str | None = None,
+    timeout: float | None = None,
 ) -> str:
    try:
        return _ollama_call(
@ -2994,6 +3087,7 @@ def _ollama_call_safe(
            use_history=False,
            system_override=system_override,
            model=model,
+            timeout=timeout,
        )
    except Exception:
        return fallback
@ -3813,9 +3907,12 @@ def _open_ended_multi(


 def _open_ended_total_steps(mode: str) -> int:
-    if mode == "fast":
+    normalized = _normalize_mode(mode)
+    if normalized == "fast":
        return 2
-    return 9
+    if normalized == "smart":
+        return 3
+    return 4


 def _fast_fact_lines(
@ -4136,6 +4233,7 @@ def _open_ended_fast_single(
    prompt: str,
    *,
    context: str,
+    fallback_context: str | None = None,
    history_lines: list[str] | None = None,
    state: ThoughtState | None = None,
    model: str,
@ -4143,24 +4241,26 @@ def _open_ended_fast_single(
    if state:
        state.update("drafting", step=1, note="summarizing")
    working_context = _append_history_context(context, history_lines or []) if history_lines else context
-    reply = _ollama_call(
+    reply = _ollama_call_safe(
        ("atlasbot_fast", "atlasbot_fast"),
        prompt,
        context=working_context,
-        use_history=False,
+        fallback="",
        system_override=_open_ended_system(),
        model=model,
+        timeout=_mode_ollama_timeout_sec("fast"),
    )
    if not _has_body_lines(reply):
-        reply = _ollama_call(
+        reply = _ollama_call_safe(
            ("atlasbot_fast", "atlasbot_fast"),
            prompt + " Provide one clear sentence before the score lines.",
            context=working_context,
-            use_history=False,
+            fallback="",
            system_override=_open_ended_system(),
            model=model,
+            timeout=_mode_ollama_timeout_sec("fast"),
        )
-    fallback = _fallback_fact_answer(prompt, context)
+    fallback = _fallback_fact_answer(prompt, fallback_context or context)
    if fallback and (_is_quantitative_prompt(prompt) or not _has_body_lines(reply)):
        reply = fallback
    if not _has_body_lines(reply):
@ -4177,6 +4277,7 @@ def _open_ended_fast(
    fact_lines: list[str],
    fact_meta: dict[str, dict[str, Any]],
    history_lines: list[str],
+    extra_context: str = "",
    state: ThoughtState | None = None,
 ) -> str:
    model = _model_for_mode("fast")
@ -4197,6 +4298,7 @@ def _open_ended_fast(
    selected_pack = _fact_pack_text(selected_lines, selected_meta)
    if _needs_full_fact_pack(prompt) or not selected_lines:
        selected_pack = fact_pack
+    model_context = _merge_context_blocks(selected_pack, extra_context)
    if not subjective and _needs_full_fact_pack(prompt):
        fallback = _fallback_fact_answer(prompt, fact_pack)
        if fallback:
@ -4205,7 +4307,8 @@ def _open_ended_fast(
        state.total_steps = _open_ended_total_steps("fast")
    return _open_ended_fast_single(
        prompt,
-        context=selected_pack,
+        context=model_context,
+        fallback_context=selected_pack,
        history_lines=history_lines,
        state=state,
        model=model,
@ -4219,16 +4322,55 @@ def _open_ended_deep(
    fact_lines: list[str],
    fact_meta: dict[str, dict[str, Any]],
    history_lines: list[str],
+    mode: str,
+    extra_context: str = "",
    state: ThoughtState | None = None,
 ) -> str:
-    return _open_ended_multi(
-        prompt,
-        fact_pack=fact_pack,
-        fact_lines=fact_lines,
-        fact_meta=fact_meta,
-        history_lines=history_lines,
-        state=state,
+    normalized = _normalize_mode(mode)
+    model = _model_for_mode(normalized)
+    subjective = _is_subjective_query(prompt)
+    primary_tags = _primary_tags_for_prompt(prompt)
+    focus_tags = _preferred_tags_for_prompt(prompt)
+    if not focus_tags and subjective:
+        focus_tags = set(_ALLOWED_INSIGHT_TAGS)
+    avoid_tags = _history_focus_tags(history_lines) if (subjective or _is_followup_query(prompt)) else set()
+    limit = 12 if normalized == "smart" else 18
+    selected_lines = _fast_fact_lines(
+        fact_lines,
+        fact_meta,
+        focus_tags=focus_tags,
+        avoid_tags=avoid_tags,
+        primary_tags=primary_tags,
+        limit=limit,
    )
+    selected_meta = _fact_pack_meta(selected_lines)
+    selected_pack = _fact_pack_text(selected_lines, selected_meta)
+    if _needs_full_fact_pack(prompt) or not selected_lines or normalized == "genius":
+        selected_pack = fact_pack
+    fallback = _fallback_fact_answer(prompt, selected_pack)
+    model_context = _merge_context_blocks(selected_pack, extra_context)
+    if not subjective and fallback:
+        if state:
+            state.update("done", step=_open_ended_total_steps(normalized))
+        return _ensure_scores(fallback)
+    if state:
+        state.update("drafting", step=1, note="synthesizing")
+    reply = _ollama_call_safe(
+        ("atlasbot_deep", "atlasbot_deep"),
+        prompt,
+        context=_append_history_context(model_context, history_lines),
+        fallback="",
+        system_override=_open_ended_system(),
+        model=model,
+        timeout=_mode_ollama_timeout_sec(normalized),
+    )
+    if fallback and (_is_quantitative_prompt(prompt) or not _has_body_lines(reply)):
+        reply = fallback
+    if not _has_body_lines(reply):
+        reply = "I don't have enough data in the current snapshot to answer that."
+    if state:
+        state.update("done", step=_open_ended_total_steps(normalized))
+    return _ensure_scores(reply)


 def open_ended_answer(
@ -4240,6 +4382,7 @@ def open_ended_answer(
    history_lines: list[str],
    mode: str,
    allow_tools: bool,
+    context: str = "",
    state: ThoughtState | None = None,
 ) -> str:
    lines = _fact_pack_lines(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads)
@ -4256,13 +4399,15 @@ def open_ended_answer(
        return _ensure_scores("I don't have enough data to answer that.")
    fact_meta = _fact_pack_meta(lines)
    fact_pack = _fact_pack_text(lines, fact_meta)
-    if mode == "fast":
+    normalized = _normalize_mode(mode)
+    if normalized == "fast":
        return _open_ended_fast(
            prompt,
            fact_pack=fact_pack,
            fact_lines=lines,
            fact_meta=fact_meta,
            history_lines=history_lines,
+            extra_context=context,
            state=state,
        )
    return _open_ended_deep(
@ -4271,6 +4416,8 @@ def open_ended_answer(
        fact_lines=lines,
        fact_meta=fact_meta,
        history_lines=history_lines,
+        extra_context=context,
+        mode=normalized,
        state=state,
    )

@ -4292,6 +4439,7 @@ def _non_cluster_reply(prompt: str, *, history_lines: list[str], mode: str) -> s
        use_history=False,
        system_override=system,
        model=model,
+        timeout=_mode_ollama_timeout_sec(mode),
    )
    reply = re.sub(r"\bconfidence\s*:\s*(high|medium|low)\b\.?\s*", "", reply, flags=re.IGNORECASE).strip()
    return _ensure_scores(reply)
@ -4343,13 +4491,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
            self._write_json(400, {"error": "missing_prompt"})
            return
        cleaned = _strip_bot_mention(prompt)
-        mode = str(payload.get("mode") or "deep").lower()
-        if mode in ("quick", "fast"):
-            mode = "fast"
-        elif mode in ("smart", "deep"):
-            mode = "deep"
-        else:
-            mode = "deep"
+        mode = _normalize_mode(str(payload.get("mode") or "smart"))
        snapshot = _snapshot_state()
        inventory = _snapshot_inventory(snapshot) or node_inventory_live()
        workloads = _snapshot_workloads(snapshot)
@ -4386,6 +4528,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
                history_lines=history_lines,
                mode=mode,
                allow_tools=True,
+                context=context,
                state=None,
            )
        else:
@ -4640,6 +4783,7 @@ def _ollama_call(
    use_history: bool = True,
    system_override: str | None = None,
    model: str | None = None,
+    timeout: float | None = None,
 ) -> str:
    system = system_override or (
        "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. "
@ -4673,6 +4817,7 @@ def _ollama_call(
    messages.append({"role": "user", "content": prompt})

    model_name = model or MODEL
+    request_timeout = timeout if timeout is not None else OLLAMA_TIMEOUT_SEC
    payload = {"model": model_name, "messages": messages, "stream": False}
    headers = {"Content-Type": "application/json"}
    if API_KEY:
@ -4683,13 +4828,13 @@ def _ollama_call(
        lock.acquire()
    try:
        try:
-            with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp:
+            with request.urlopen(r, timeout=request_timeout) as resp:
                data = json.loads(resp.read().decode())
        except error.HTTPError as exc:
            if exc.code == 404 and FALLBACK_MODEL and FALLBACK_MODEL != payload["model"]:
                payload["model"] = FALLBACK_MODEL
                r = request.Request(endpoint, data=json.dumps(payload).encode(), headers=headers)
-                with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp:
+                with request.urlopen(r, timeout=request_timeout) as resp:
                    data = json.loads(resp.read().decode())
            else:
                raise
@ -4714,6 +4859,7 @@ def ollama_reply(
    fallback: str = "",
    use_history: bool = True,
    model: str | None = None,
+    timeout: float | None = None,
 ) -> str:
    last_error = None
    for attempt in range(max(1, OLLAMA_RETRIES + 1)):
@ -4724,6 +4870,7 @@ def ollama_reply(
                context=context,
                use_history=use_history,
                model=model,
+                timeout=timeout,
            )
        except Exception as exc:  # noqa: BLE001
            last_error = exc
@ -4744,11 +4891,13 @@ def ollama_reply_with_thinking(
    fallback: str,
    use_history: bool = True,
    model: str | None = None,
+    timeout: float | None = None,
 ) -> str:
    result: dict[str, str] = {"reply": ""}
    done = threading.Event()

    def worker():
+        try:
            result["reply"] = ollama_reply(
                hist_key,
                prompt,
@ -4756,7 +4905,9 @@ def ollama_reply_with_thinking(
                fallback=fallback,
                use_history=use_history,
                model=model,
+                timeout=timeout,
            )
+        finally:
            done.set()

    thread = threading.Thread(target=worker, daemon=True)
@ -4789,6 +4940,7 @@ def open_ended_with_thinking(
    history_lines: list[str],
    mode: str,
    allow_tools: bool,
+    context: str = "",
 ) -> str:
    result: dict[str, str] = {"reply": ""}
    done = threading.Event()
@ -4796,6 +4948,7 @@ def open_ended_with_thinking(
    state = ThoughtState(total_steps=total_steps)

    def worker():
+        try:
            result["reply"] = open_ended_answer(
                prompt,
                inventory=inventory,
@ -4804,15 +4957,17 @@ def open_ended_with_thinking(
                history_lines=history_lines,
                mode=mode,
                allow_tools=allow_tools,
+                context=context,
                state=state,
            )
+        finally:
            done.set()

    thread = threading.Thread(target=worker, daemon=True)
    thread.start()
    if not done.wait(2.0):
        send_msg(token, room, "Thinking…")
-        heartbeat = max(10, THINKING_INTERVAL_SEC)
+        heartbeat = _mode_heartbeat_sec(mode)
        next_heartbeat = time.monotonic() + heartbeat
        while not done.wait(max(0, next_heartbeat - time.monotonic())):
            send_msg(token, room, state.status_line())
@ -4820,7 +4975,7 @@ def open_ended_with_thinking(
    thread.join(timeout=1)
    return result["reply"] or "Model backend is busy. Try again in a moment."

-def sync_loop(token: str, room_id: str):
+def sync_loop(token: str, room_id: str, *, account_user: str, default_mode: str):
    since = None
    try:
        res = req("GET", "/_matrix/client/v3/sync?timeout=0", token, timeout=10)
@ -4861,7 +5016,7 @@ def sync_loop(token: str, room_id: str):
                if not body:
                    continue
                sender = ev.get("sender", "")
-                if sender == f"@{USER}:live.bstein.dev":
+                if account_user and sender == normalize_user_id(account_user):
                    continue

                mentioned = is_mentioned(content, body)
@ -4874,7 +5029,12 @@ def sync_loop(token: str, room_id: str):

                cleaned_body = _strip_bot_mention(body)
                lower_body = cleaned_body.lower()
-                mode = _detect_mode_from_body(body, default="deep" if is_dm else "deep")
+                mode = _detect_mode(
+                    content,
+                    body,
+                    default=_normalize_mode(default_mode),
+                    account_user=account_user,
+                )

                # Only do live cluster introspection in DMs.
                allow_tools = is_dm
@ -4938,39 +5098,81 @@ def sync_loop(token: str, room_id: str):
                        snapshot=snapshot,
                        workloads=workloads,
                        history_lines=history[hist_key],
-                        mode=mode if mode in ("fast", "deep") else "deep",
+                        mode=_normalize_mode(mode),
                        allow_tools=allow_tools,
+                        context=context,
                    )
                else:
                    reply = _non_cluster_reply(
                        cleaned_body,
                        history_lines=history[hist_key],
-                        mode=mode if mode in ("fast", "deep") else "deep",
+                        mode=_normalize_mode(mode),
                    )
                send_msg(token, rid, reply)
                history[hist_key].append(f"Atlas: {reply}")
                history[hist_key] = history[hist_key][-80:]

-def login_with_retry():
+def login_with_retry(user: str, password: str):
    last_err = None
    for attempt in range(10):
        try:
-            return login()
+            return login(user, password)
        except Exception as exc:  # noqa: BLE001
            last_err = exc
            time.sleep(min(30, 2 ** attempt))
    raise last_err

+def _bot_accounts() -> list[dict[str, str]]:
+    """Build the list of bot accounts to log in, one entry per distinct user.
+
+    Candidates are considered in this order: the smart account (falling back
+    to BOT_USER / BOT_PASS for whichever part is unset), the quick account,
+    the genius account, and finally BOT_USER itself in "smart" mode. A
+    candidate is dropped when its user or password is empty, or when its
+    normalized, lowercased user ID was already taken by an earlier candidate.
+
+    Returns:
+        Dicts with the keys "user", "password" and "mode".
+    """
+    candidates = (
+        (BOT_USER_SMART or BOT_USER, BOT_PASS_SMART or BOT_PASS, "smart"),
+        (BOT_USER_QUICK, BOT_PASS_QUICK, "fast"),
+        (BOT_USER_GENIUS, BOT_PASS_GENIUS, "genius"),
+        (BOT_USER, BOT_PASS, "smart"),
+    )
+    claimed: set[str] = set()
+    result: list[dict[str, str]] = []
+    for user, password, mode in candidates:
+        if not (user and password):
+            continue
+        uid = normalize_user_id(user).lower()
+        if uid in claimed:
+            # The first candidate for a given account decides its mode.
+            continue
+        claimed.add(uid)
+        result.append({"user": user, "password": password, "mode": mode})
+    return result
+
 def main():
    load_kb()
    _start_http_server()
-    token = login_with_retry()
+    accounts = _bot_accounts()
+    threads: list[threading.Thread] = []
+    for acc in accounts:
+        token = login_with_retry(acc["user"], acc["password"])
        try:
            room_id = resolve_alias(token, ROOM_ALIAS)
            join_room(token, room_id)
        except Exception:
            room_id = None
-    sync_loop(token, room_id)
+        thread = threading.Thread(
+            target=sync_loop,
+            args=(token, room_id),
+            kwargs={
+                "account_user": acc["user"],
+                "default_mode": acc["mode"],
+            },
+            daemon=True,
+        )
+        thread.start()
+        threads.append(thread)
+    for thread in threads:
+        thread.join()

 if __name__ == "__main__":
    main()
--- a/services/comms/scripts/comms_vault_env.sh
+++ b/services/comms/scripts/comms_vault_env.sh
@ -7,6 +7,14 @@ read_secret() {
  tr -d '\r\n' < "${vault_dir}/$1"
 }

+read_optional() {
+  # Print the named file under ${vault_dir} with CR/LF removed; print nothing
+  # (and succeed) when the file does not exist.
+  [ -f "${vault_dir}/$1" ] || return 0
+  tr -d '\r\n' < "${vault_dir}/$1"
+}
+
 export TURN_STATIC_AUTH_SECRET="$(read_secret turn-secret)"
 export TURN_PASSWORD="${TURN_STATIC_AUTH_SECRET}"

@ -14,6 +22,15 @@ export LIVEKIT_API_SECRET="$(read_secret livekit-primary)"
 export LIVEKIT_SECRET="${LIVEKIT_API_SECRET}"

 export BOT_PASS="$(read_secret bot-pass)"
+export BOT_PASS_QUICK="$(read_optional bot-quick-pass)"
+export BOT_PASS_SMART="$(read_optional bot-smart-pass)"
+export BOT_PASS_GENIUS="$(read_optional bot-genius-pass)"
+if [ -z "${BOT_PASS_SMART}" ]; then
+  export BOT_PASS_SMART="${BOT_PASS}"
+fi
+if [ -z "${BOT_PASS_GENIUS}" ]; then
+  export BOT_PASS_GENIUS="${BOT_PASS_SMART}"
+fi
 export SEEDER_PASS="$(read_secret seeder-pass)"

 export CHAT_API_KEY="$(read_secret chat-matrix)"
--- a/services/comms/scripts/tests/test_atlasbot_modes.py
+++ b/services/comms/scripts/tests/test_atlasbot_modes.py
@ -0,0 +1,164 @@
+from __future__ import annotations
+
+import importlib.util
+import os
+from pathlib import Path
+from unittest import TestCase, mock
+
+
+BOT_PATH = Path(__file__).resolve().parents[1] / "atlasbot" / "bot.py"
+
+
+def load_bot_module():
+    # Import bot.py from BOT_PATH as a fresh module named "atlasbot_bot".
+    # The module reads its configuration from os.environ while it is executed,
+    # so a fixed set of variables is patched in for the duration of the import
+    # and removed again when the patch context exits.
+    fixture_env = {
+        "BOT_USER": "atlas-smart",
+        "BOT_PASS": "smart-pass",
+        "BOT_USER_QUICK": "atlas-quick",
+        "BOT_PASS_QUICK": "quick-pass",
+        "BOT_USER_SMART": "atlas-smart",
+        "BOT_PASS_SMART": "smart-pass",
+        "BOT_USER_GENIUS": "atlas-genius",
+        "BOT_PASS_GENIUS": "genius-pass",
+        "OLLAMA_URL": "http://ollama.invalid",
+        "OLLAMA_MODEL": "base-model",
+        "ATLASBOT_MODEL_FAST": "fast-model",
+        "ATLASBOT_MODEL_SMART": "smart-model",
+        "ATLASBOT_MODEL_GENIUS": "genius-model",
+        "ATLASBOT_QUICK_TIME_BUDGET_SEC": "15",
+        "ATLASBOT_SMART_TIME_BUDGET_SEC": "45",
+        "ATLASBOT_GENIUS_TIME_BUDGET_SEC": "180",
+        "KB_DIR": "",
+        "VM_URL": "http://vm.invalid",
+        "ARIADNE_STATE_URL": "",
+        "ARIADNE_STATE_TOKEN": "",
+    }
+    with mock.patch.dict(os.environ, fixture_env, clear=False):
+        bot_spec = importlib.util.spec_from_file_location("atlasbot_bot", BOT_PATH)
+        bot_module = importlib.util.module_from_spec(bot_spec)
+        assert bot_spec.loader is not None
+        bot_spec.loader.exec_module(bot_module)
+    return bot_module
+
+
+class AtlasbotModeTests(TestCase):
+    # Behavioural tests for the quick/smart/genius answer modes of bot.py.
+    # Every test works on a freshly imported module and patches
+    # _fact_pack_lines / _ollama_call, so no network access is needed.
+
+    def setUp(self):
+        # Re-import per test so module-level configuration comes from the
+        # environment that load_bot_module patches in.
+        self.bot = load_bot_module()
+
+    def test_bot_accounts_include_genius_mode(self):
+        # All three configured accounts must be returned with their own mode.
+        accounts = self.bot._bot_accounts()
+        by_user = {account["user"]: account["mode"] for account in accounts}
+
+        self.assertEqual(by_user["atlas-quick"], "fast")
+        self.assertEqual(by_user["atlas-smart"], "smart")
+        self.assertEqual(by_user["atlas-genius"], "genius")
+
+    def test_objective_cluster_question_uses_fact_pack_without_llm(self):
+        # A factual question must be answered from the fact pack alone.
+        fact_lines = [
+            "hottest_cpu: longhorn-system (6.69)",
+            "hottest_ram: longhorn-system (36.05 GB)",
+        ]
+
+        with (
+            mock.patch.object(self.bot, "_fact_pack_lines", return_value=fact_lines),
+            mock.patch.object(
+                self.bot, "_ollama_call", side_effect=AssertionError("LLM should not be called")
+            ) as llm_call,
+        ):
+            reply = self.bot.open_ended_answer(
+                "what is the hottest cpu node in titan lab currently?",
+                inventory=[],
+                snapshot=None,
+                workloads=[],
+                history_lines=[],
+                mode="smart",
+                allow_tools=True,
+            )
+
+        # _ollama_call_safe catches Exception and returns its fallback, so the
+        # AssertionError side effect alone would be swallowed; check the mock
+        # explicitly to really prove that the LLM was never invoked.
+        llm_call.assert_not_called()
+        self.assertIn("longhorn-system", reply)
+        self.assertIn("Confidence:", reply)
+
+    def test_subjective_genius_answer_uses_genius_model(self):
+        # A subjective question in genius mode must reach the LLM with the
+        # genius model, a timeout within the genius budget, and the extra
+        # context passed by the caller.
+        fact_lines = [
+            "hottest_cpu: longhorn-system (6.69)",
+            "worker_nodes: titan-01, titan-02, titan-03",
+        ]
+        captured: dict[str, object] = {}
+
+        def fake_ollama_call(hist_key, prompt, *, context, use_history=True, system_override=None, model=None, timeout=None):
+            captured["model"] = model
+            captured["timeout"] = timeout
+            captured["context"] = context
+            return "The worker spread stands out because Titan keeps meaningful capacity on the same cluster. Confidence: high"
+
+        with (
+            mock.patch.object(self.bot, "_fact_pack_lines", return_value=fact_lines),
+            mock.patch.object(self.bot, "_ollama_call", side_effect=fake_ollama_call),
+        ):
+            reply = self.bot.open_ended_answer(
+                "what stands out about titan lab?",
+                inventory=[],
+                snapshot=None,
+                workloads=[],
+                history_lines=[],
+                mode="genius",
+                allow_tools=True,
+                context='Cluster snapshot (JSON): {"injected":true}',
+            )
+
+        self.assertIn("The worker spread stands out", reply)
+        self.assertEqual(captured["model"], "genius-model")
+        self.assertLessEqual(float(captured["timeout"]), 180.0)
+        self.assertIn('Cluster snapshot (JSON): {"injected":true}', str(captured["context"]))
+
+    def test_mode_timeouts_stay_within_budgets(self):
+        # Each mode must call its own model with a timeout that does not
+        # exceed the budget configured in load_bot_module (15 / 45 / 180 s).
+        fact_lines = [
+            "hottest_cpu: longhorn-system (6.69)",
+            "worker_nodes: titan-01, titan-02, titan-03",
+        ]
+        seen: list[tuple[str, float]] = []
+
+        def fake_ollama_call(hist_key, prompt, *, context, use_history=True, system_override=None, model=None, timeout=None):
+            seen.append((str(model), float(timeout or 0)))
+            return "Atlas has a clear standout because the worker spread is healthy. Confidence: high"
+
+        with (
+            mock.patch.object(self.bot, "_fact_pack_lines", return_value=fact_lines),
+            mock.patch.object(self.bot, "_ollama_call", side_effect=fake_ollama_call),
+        ):
+            for mode in ("fast", "smart", "genius"):
+                reply = self.bot.open_ended_answer(
+                    "what stands out about titan lab?",
+                    inventory=[],
+                    snapshot=None,
+                    workloads=[],
+                    history_lines=[],
+                    mode=mode,
+                    allow_tools=True,
+                )
+                self.assertIn("Confidence:", reply)
+
+        self.assertEqual([model for model, _ in seen], ["fast-model", "smart-model", "genius-model"])
+        self.assertLessEqual(seen[0][1], 15.0)
+        self.assertLessEqual(seen[1][1], 45.0)
+        self.assertLessEqual(seen[2][1], 180.0)
+
+    def test_llm_timeout_still_returns_a_conclusion(self):
+        # When the LLM call raises, the bot must still produce a scored reply
+        # that talks about the subject of the question.
+        fact_lines = [
+            "worker_nodes: titan-01, titan-02, titan-03",
+            "hottest_cpu: longhorn-system (6.69)",
+        ]
+
+        with (
+            mock.patch.object(self.bot, "_fact_pack_lines", return_value=fact_lines),
+            mock.patch.object(self.bot, "_ollama_call", side_effect=TimeoutError("simulated timeout")),
+        ):
+            reply = self.bot.open_ended_answer(
+                "what stands out about the worker nodes?",
+                inventory=[],
+                snapshot=None,
+                workloads=[],
+                history_lines=[],
+                mode="genius",
+                allow_tools=True,
+            )
+
+        self.assertIn("worker nodes", reply.lower())
+        self.assertIn("Confidence:", reply)
--- a/services/comms/seed-othrys-room.yaml
+++ b/services/comms/seed-othrys-room.yaml
@ -66,7 +66,7 @@ spec:
                - name: SEEDER_USER
                  value: othrys-seeder
                - name: BOT_USER
-                  value: atlasbot
+                  value: atlas-smart
              command:
                - /bin/sh
                - -c
--- a/services/crypto/monerod/deployment.yaml
+++ b/services/crypto/monerod/deployment.yaml
@ -29,12 +29,18 @@ spec:
                operator: In
                values: ["rpi4","rpi5"]
          preferredDuringSchedulingIgnoredDuringExecution:
-            - weight: 50
+            - weight: 80
              preference:
                matchExpressions:
                - key: hardware
                  operator: In
-                  values: ["rpi4"]
+                  values: ["rpi5"]
+            - weight: 60
+              preference:
+                matchExpressions:
+                - key: kubernetes.io/hostname
+                  operator: NotIn
+                  values: ["titan-12","titan-13","titan-15","titan-17","titan-19"]
      containers:
      - name: monerod
        image: registry.bstein.dev/crypto/monerod:0.18.4.1
--- a/services/crypto/xmr-miner/xmrig-daemonset.yaml
+++ b/services/crypto/xmr-miner/xmrig-daemonset.yaml
@ -23,7 +23,7 @@ spec:
            - matchExpressions:
              - key: hardware
                operator: In
-                values: ["rpi4","rpi5"]
+                values: ["rpi5"]
      containers:
        - name: xmrig
          image: ghcr.io/tari-project/xmrig@sha256:80defbfd0b640d604c91cb5101d3642db7928e1e68ee3c6b011289b3565a39d9
--- a/services/gitea/deployment.yaml
+++ b/services/gitea/deployment.yaml
@ -123,13 +123,22 @@ spec:
              - key: hardware
                operator: In
                values: ["rpi4","rpi5"]
+              - key: longhorn
+                operator: NotIn
+                values: ["true"]
          preferredDuringSchedulingIgnoredDuringExecution:
+            - weight: 100
+              preference:
+                matchExpressions:
+                - key: kubernetes.io/hostname
+                  operator: NotIn
+                  values: ["titan-13","titan-15","titan-17","titan-19"]
            - weight: 50
              preference:
                matchExpressions:
                - key: hardware
                  operator: In
-                  values: ["rpi4"]
+                  values: ["rpi5"]
      containers:
        - name: gitea
          image: gitea/gitea:1.23
--- a/services/harbor/helmrelease.yaml
+++ b/services/harbor/helmrelease.yaml
@ -245,6 +245,17 @@ spec:
        image:
          repository: registry.bstein.dev/infra/harbor-registry
          tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-registry:tag"}
+        # Registry event webhook. The endpoint is named harbor-core and is sent
+        # with a Harbor-Secret header, so it must target the harbor-core service
+        # (port 80, as used by the ingress patch in this file) rather than the
+        # registry's own service.
+        extraEnvVars:
+          - name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_NAME
+            value: harbor-core
+          - name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_URL
+            value: http://harbor-core:80/service/notifications
+          - name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_TIMEOUT
+            value: 5s
+          - name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_THRESHOLD
+            value: "5"
+          - name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_BACKOFF
+            value: 1s
      controller:
        image:
          repository: registry.bstein.dev/infra/harbor-registryctl
@ -263,6 +274,10 @@ spec:
          export REGISTRY_HTTP_SECRET="{{ .Data.data.REGISTRY_HTTP_SECRET }}"
          export REGISTRY_REDIS_PASSWORD="{{ .Data.data.REGISTRY_REDIS_PASSWORD }}"
          {{ end }}
+          {{ with secret "kv/data/atlas/harbor/harbor-jobservice" }}
+          export JOBSERVICE_SECRET="{{ .Data.data.JOBSERVICE_SECRET }}"
+          export REGISTRY_NOTIFICATIONS_ENDPOINTS_0_HEADERS_Authorization="Harbor-Secret ${JOBSERVICE_SECRET}"
+          {{ end }}
        vault.hashicorp.com/agent-inject-secret-harbor-registryctl-env.sh: "kv/data/atlas/harbor/harbor-registry"
        vault.hashicorp.com/agent-inject-template-harbor-registryctl-env.sh: |
          {{ with secret "kv/data/atlas/harbor/harbor-core" }}
@ -397,10 +412,10 @@ spec:
            patch: |-
              - op: replace
                path: /spec/rules/0/http/paths/2/backend/service/name
-                value: harbor-registry
+                value: harbor-core
              - op: replace
                path: /spec/rules/0/http/paths/2/backend/service/port/number
-                value: 5000
+                value: 80
          - target:
              kind: Deployment
              name: harbor-jobservice
@ -422,8 +437,7 @@ spec:
                          - $patch: replace
                          - name: VAULT_ENV_FILE
                            value: /vault/secrets/harbor-jobservice-env.sh
-                        envFrom:
-                          - $patch: replace
+                        # Plain block sequence: `envFrom: []` followed by `- ...`
+                        # items does not parse as YAML.
+                        envFrom:
                           - configMapRef:
                               name: harbor-jobservice-env
                        volumeMounts:
@ -464,6 +478,16 @@ spec:
                            value: /vault/secrets/harbor-registry-env.sh
                          - name: VAULT_COPY_FILES
                            value: /vault/secrets/harbor-registry-htpasswd:/etc/registry/passwd
+                          # Registry event webhook: deliver to the harbor-core
+                          # service (port 80), which the endpoint name and the
+                          # Harbor-Secret header are meant for, not to the
+                          # registry's own service.
+                          - name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_NAME
+                            value: harbor-core
+                          - name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_URL
+                            value: http://harbor-core:80/service/notifications
+                          - name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_TIMEOUT
+                            value: 5s
+                          - name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_THRESHOLD
+                            value: "5"
+                          - name: REGISTRY_NOTIFICATIONS_ENDPOINTS_0_BACKOFF
+                            value: 1s
                        envFrom:
                          - $patch: replace
                        volumeMounts:
--- a/services/jenkins/configmap-jcasc.yaml
+++ b/services/jenkins/configmap-jcasc.yaml
@ -67,7 +67,7 @@ data:
                      url('https://scm.bstein.dev/bstein/harbor-arm-build.git')
                      credentials('gitea-pat')
                    }
-                    branches('*/master')
+                    branches('*/main')
                  }
                }
              }
@ -108,7 +108,7 @@ data:
                      url('https://scm.bstein.dev/bstein/ci-demo.git')
                      credentials('gitea-pat')
                    }
-                    branches('*/master')
+                    branches('*/main')
                  }
                }
                scriptPath('Jenkinsfile')
@ -167,6 +167,110 @@ data:
              }
            }
          }
+          // Pipeline for bstein/metis: polls SCM and runs the repo's Jenkinsfile.
+          // Job names must be unique within the seed script, so 'metis' is declared
+          // exactly once. Two identical declarations differing only in poll spec
+          // (H/2 vs H/5) were collapsed; the later H/5 spec is kept.
+          // NOTE(review): confirm H/5 is the intended poll interval.
+          pipelineJob('metis') {
+            properties {
+              pipelineTriggers {
+                triggers {
+                  scmTrigger {
+                    scmpoll_spec('H/5 * * * *')
+                    ignorePostCommitHooks(false)
+                  }
+                }
+              }
+            }
+            definition {
+              cpsScm {
+                scm {
+                  git {
+                    remote {
+                      url('https://scm.bstein.dev/bstein/metis.git')
+                      credentials('gitea-pat')
+                    }
+                    branches('*/master')
+                  }
+                }
+                scriptPath('Jenkinsfile')
+              }
+            }
+          }
+          // Pipeline for bstein/atlasbot: polls the main branch and runs the
+          // Jenkinsfile at the repository root.
+          pipelineJob('atlasbot') {
+            properties {
+              pipelineTriggers {
+                triggers {
+                  scmTrigger {
+                    // Poll schedule in Jenkins cron syntax.
+                    scmpoll_spec('H/2 * * * *')
+                    ignorePostCommitHooks(false)
+                  }
+                }
+              }
+            }
+            definition {
+              cpsScm {
+                scm {
+                  git {
+                    remote {
+                      url('https://scm.bstein.dev/bstein/atlasbot.git')
+                      // Credential id shared by the other Gitea-backed jobs in this file.
+                      credentials('gitea-pat')
+                    }
+                    branches('*/main')
+                  }
+                }
+                scriptPath('Jenkinsfile')
+              }
+            }
+          }
+          // Pipeline for bstein/soteria: polls the main branch and runs the
+          // Jenkinsfile at the repository root.
+          // NOTE(review): the job name is capitalised while the repository and the
+          // sibling job names are lower-case — confirm this is intentional.
+          pipelineJob('Soteria') {
+            properties {
+              pipelineTriggers {
+                triggers {
+                  scmTrigger {
+                    // Poll schedule in Jenkins cron syntax.
+                    scmpoll_spec('H/5 * * * *')
+                    ignorePostCommitHooks(false)
+                  }
+                }
+              }
+            }
+            definition {
+              cpsScm {
+                scm {
+                  git {
+                    remote {
+                      url('https://scm.bstein.dev/bstein/soteria.git')
+                      credentials('gitea-pat')
+                    }
+                    branches('*/main')
+                  }
+                }
+                scriptPath('Jenkinsfile')
+              }
+            }
+          }
          pipelineJob('data-prepper') {
            properties {
              pipelineTriggers {
--- a/services/jenkins/deployment.yaml
+++ b/services/jenkins/deployment.yaml
@ -48,7 +48,7 @@ spec:
          TITAN_IAC_WEBHOOK_TOKEN={{ .Data.data.titan_iac_quality_gate }}
          GIT_NOTIFY_TOKEN_BSTEIN_DEV_HOME={{ .Data.data.git_notify_bstein_dev_home }}
          {{ end }}
-        bstein.dev/restarted-at: "2026-01-20T14:52:41Z"
+        bstein.dev/restarted-at: "2026-02-02T15:10:33Z"
    spec:
      serviceAccountName: jenkins
      nodeSelector:
--- a/services/jenkins/dind-pvc.yaml
+++ b/services/jenkins/dind-pvc.yaml
@ -0,0 +1,13 @@
+# services/jenkins/dind-pvc.yaml
+# 30Gi ReadWriteOnce claim named jenkins-dind-cache on the "astreae" storage class.
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  namespace: jenkins
+  name: jenkins-dind-cache
+spec:
+  storageClassName: astreae
+  accessModes: [ReadWriteOnce]
+  resources:
+    requests:
+      storage: 30Gi
--- a/services/jenkins/kustomization.yaml
+++ b/services/jenkins/kustomization.yaml
@ -8,6 +8,7 @@ resources:
  - vault-serviceaccount.yaml
  - pvc.yaml
  - cache-pvc.yaml
+  - dind-pvc.yaml
  - plugins-pvc.yaml
  - configmap-jcasc.yaml
  - configmap-plugins.yaml
--- a/services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml
+++ b/services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml
@ -1,12 +1,12 @@
 # services/keycloak/oneoffs/portal-e2e-execute-actions-email-test-job.yaml
-# One-off job for sso/keycloak-portal-e2e-execute-actions-email-14.
-# Purpose: keycloak portal e2e execute actions email 14 (see container args/env in this file).
+# One-off job for sso/keycloak-portal-e2e-execute-actions-email-18.
+# Purpose: keycloak portal e2e execute actions email 18 (see container args/env in this file).
 # Run by setting spec.suspend to false, reconcile, then set it back to true.
 # Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: keycloak-portal-e2e-execute-actions-email-14
+  name: keycloak-portal-e2e-execute-actions-email-18
  namespace: sso
 spec:
  suspend: true
@ -70,7 +70,7 @@ spec:
            - name: E2E_PROBE_USERNAME
              value: robotuser
            - name: E2E_PROBE_EMAIL
-              value: robotuser@bstein.dev
+              value: brad.stein+robot@gmail.com
            - name: EXECUTE_ACTIONS_CLIENT_ID
              value: bstein-dev-home
            - name: EXECUTE_ACTIONS_REDIRECT_URI
--- a/services/keycloak/oneoffs/realm-settings-job.yaml
+++ b/services/keycloak/oneoffs/realm-settings-job.yaml
@ -1,12 +1,12 @@
 # services/keycloak/oneoffs/realm-settings-job.yaml
-# One-off job for sso/keycloak-realm-settings-36.
-# Purpose: keycloak realm settings 36 (see container args/env in this file).
+# One-off job for sso/keycloak-realm-settings-38.
+# Purpose: keycloak realm settings 38 (see container args/env in this file).
 # Run by setting spec.suspend to false, reconcile, then set it back to true.
 # Safe to delete the finished Job/pod; it should not run continuously.
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: keycloak-realm-settings-36
+  name: keycloak-realm-settings-38
  namespace: sso
 spec:
  suspend: true
@ -64,7 +64,7 @@ spec:
            - name: KEYCLOAK_REALM
              value: atlas
            - name: KEYCLOAK_SMTP_HOST
-              value: mail.bstein.dev
+              value: smtp.postmarkapp.com
            - name: KEYCLOAK_SMTP_PORT
              value: "587"
            - name: KEYCLOAK_SMTP_FROM
--- a/services/maintenance/ariadne-deployment.yaml
+++ b/services/maintenance/ariadne-deployment.yaml
@ -18,6 +18,7 @@ spec:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics"
+        maintenance.bstein.dev/restart-rev: "20260207-2"
        vault.hashicorp.com/agent-inject: "true"
        vault.hashicorp.com/role: "maintenance"
        vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db"
@ -105,7 +106,7 @@ spec:
        node-role.kubernetes.io/worker: "true"
      containers:
        - name: ariadne
-          image: registry.bstein.dev/bstein/ariadne:0.1.0-0
+          image: registry.bstein.dev/bstein/ariadne:latest
          imagePullPolicy: Always
          command: ["/bin/sh", "-c"]
          args:
@ -285,7 +286,7 @@ spec:
            - name: ARIADNE_SCHEDULE_MAILU_SYNC
              value: "30 4 * * *"
            - name: ARIADNE_SCHEDULE_NEXTCLOUD_SYNC
-              value: "0 5 * * *"
+              value: "*/15 * * * *"
            - name: ARIADNE_SCHEDULE_NEXTCLOUD_CRON
              value: "*/5 * * * *"
            - name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE
@ -293,23 +294,23 @@ spec:
            - name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC
              value: "0 * * * *"
            - name: ARIADNE_SCHEDULE_WGER_USER_SYNC
-              value: "0 5 * * *"
+              value: "*/15 * * * *"
            - name: ARIADNE_SCHEDULE_WGER_ADMIN
              value: "15 3 * * *"
            - name: ARIADNE_SCHEDULE_FIREFLY_USER_SYNC
-              value: "0 6 * * *"
+              value: "*/15 * * * *"
            - name: ARIADNE_SCHEDULE_FIREFLY_CRON
              value: "0 3 * * *"
            - name: ARIADNE_SCHEDULE_POD_CLEANER
-              value: "0 * * * *"
+              value: "*/30 * * * *"
            - name: ARIADNE_SCHEDULE_OPENSEARCH_PRUNE
              value: "23 3 * * *"
            - name: ARIADNE_SCHEDULE_IMAGE_SWEEPER
-              value: "30 4 * * 0"
+              value: "0 */4 * * *"
            - name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH
-              value: "0 * * * *"
+              value: "*/15 * * * *"
            - name: ARIADNE_SCHEDULE_VAULT_OIDC
-              value: "0 * * * *"
+              value: "*/15 * * * *"
            - name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME
              value: "*/5 * * * *"
            - name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE
@ -319,9 +320,9 @@ spec:
            - name: ARIADNE_SCHEDULE_COMMS_SEED_ROOM
              value: "*/10 * * * *"
            - name: ARIADNE_SCHEDULE_CLUSTER_STATE
-              value: "*/15 * * * *"
+              value: "*/10 * * * *"
            - name: ARIADNE_CLUSTER_STATE_KEEP
-              value: "168"
+              value: "720"
            - name: WELCOME_EMAIL_ENABLED
              value: "true"
            - name: K8S_API_TIMEOUT_SEC
@ -330,12 +331,20 @@ spec:
              value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428
            - name: ARIADNE_CLUSTER_STATE_VM_TIMEOUT_SEC
              value: "5"
+            - name: ARIADNE_ALERTMANAGER_URL
+              value: http://alertmanager.monitoring.svc.cluster.local
            - name: OPENSEARCH_URL
              value: http://opensearch-master.logging.svc.cluster.local:9200
            - name: OPENSEARCH_LIMIT_BYTES
              value: "1099511627776"
            - name: OPENSEARCH_INDEX_PATTERNS
              value: kube-*,journald-*,trace-analytics-*
+            - name: METIS_BASE_URL
+              value: http://metis.maintenance.svc.cluster.local
+            - name: METIS_TIMEOUT_SEC
+              value: "15"
+            - name: ARIADNE_SCHEDULE_METIS_SENTINEL_WATCH
+              value: "*/30 * * * *"
            - name: METRICS_PATH
              value: "/metrics"
          resources:
--- a/services/maintenance/ariadne-rbac.yaml
+++ b/services/maintenance/ariadne-rbac.yaml
@ -29,6 +29,29 @@ rules:
      - get
      - list
      - watch
+  - apiGroups: ["apps"]
+    resources:
+      - deployments
+      - statefulsets
+      - daemonsets
+    verbs:
+      - get
+      - list
+      - watch
+  - apiGroups: ["longhorn.io"]
+    resources:
+      - volumes
+    verbs:
+      - get
+      - list
+      - watch
+  - apiGroups: [""]
+    resources:
+      - events
+    verbs:
+      - get
+      - list
+      - watch
  - apiGroups: [""]
    resources:
      - pods/exec
@ -56,3 +79,17 @@ roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: ariadne-job-spawner
+
+---
+# Binds the maintenance/ariadne ServiceAccount to the system:auth-delegator
+# ClusterRole.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: ariadne-auth-delegator
+roleRef:
+  kind: ClusterRole
+  name: system:auth-delegator
+  apiGroup: rbac.authorization.k8s.io
+subjects:
+  - kind: ServiceAccount
+    namespace: maintenance
+    name: ariadne
--- a/services/maintenance/image.yaml
+++ b/services/maintenance/image.yaml
@ -21,3 +21,72 @@ spec:
  policy:
    semver:
      range: ">=0.1.0-0"
+---
+# Scans registry.bstein.dev/bstein/metis for tags every minute, authenticating
+# with the harbor-regcred secret.
+apiVersion: image.toolkit.fluxcd.io/v1beta2
+kind: ImageRepository
+metadata:
+  namespace: maintenance
+  name: metis
+spec:
+  interval: 1m0s
+  image: registry.bstein.dev/bstein/metis
+  secretRef:
+    name: harbor-regcred
+---
+# Picks a tag from the "metis" repository using the semver range below; the
+# result feeds the maintenance:metis:tag setter.
+apiVersion: image.toolkit.fluxcd.io/v1beta2
+kind: ImagePolicy
+metadata:
+  namespace: maintenance
+  name: metis
+spec:
+  policy:
+    semver:
+      range: ">=0.1.0-0"
+  imageRepositoryRef:
+    name: metis
+---
+# Scans registry.bstein.dev/bstein/metis-sentinel for tags every minute,
+# authenticating with the harbor-regcred secret.
+apiVersion: image.toolkit.fluxcd.io/v1beta2
+kind: ImageRepository
+metadata:
+  namespace: maintenance
+  name: metis-sentinel
+spec:
+  interval: 1m0s
+  image: registry.bstein.dev/bstein/metis-sentinel
+  secretRef:
+    name: harbor-regcred
+---
+# Picks a tag from the "metis-sentinel" repository using the semver range below;
+# the result feeds the maintenance:metis-sentinel:tag setter.
+apiVersion: image.toolkit.fluxcd.io/v1beta2
+kind: ImagePolicy
+metadata:
+  namespace: maintenance
+  name: metis-sentinel
+spec:
+  policy:
+    semver:
+      range: ">=0.1.0-0"
+  imageRepositoryRef:
+    name: metis-sentinel
+---
+# Scans registry.bstein.dev/bstein/soteria for tags every minute, authenticating
+# with the harbor-regcred secret.
+apiVersion: image.toolkit.fluxcd.io/v1beta2
+kind: ImageRepository
+metadata:
+  namespace: maintenance
+  name: soteria
+spec:
+  interval: 1m0s
+  image: registry.bstein.dev/bstein/soteria
+  secretRef:
+    name: harbor-regcred
+---
+# Picks a tag from the "soteria" repository using the semver range below; the
+# result feeds the maintenance:soteria:tag setter.
+apiVersion: image.toolkit.fluxcd.io/v1beta2
+kind: ImagePolicy
+metadata:
+  namespace: maintenance
+  name: soteria
+spec:
+  policy:
+    semver:
+      range: ">=0.1.0-0"
+  imageRepositoryRef:
+    name: soteria
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@ -5,28 +5,50 @@ resources:
  - namespace.yaml
  - image.yaml
  - secretproviderclass.yaml
+  - soteria-configmap.yaml
+  - metis-configmap.yaml
+  - metis-data-pvc.yaml
  - vault-serviceaccount.yaml
  - vault-sync-deployment.yaml
  - ariadne-serviceaccount.yaml
  - ariadne-rbac.yaml
  - disable-k3s-traefik-serviceaccount.yaml
  - k3s-traefik-cleanup-rbac.yaml
+  - metis-serviceaccount.yaml
+  - metis-rbac.yaml
+  - metis-token-sync-serviceaccount.yaml
+  - metis-token-sync-rbac.yaml
  - node-nofile-serviceaccount.yaml
  - pod-cleaner-rbac.yaml
+  - soteria-serviceaccount.yaml
+  - soteria-rbac.yaml
  - ariadne-deployment.yaml
+  - metis-deployment.yaml
  - oneoffs/ariadne-migrate-job.yaml
  - ariadne-service.yaml
+  - soteria-deployment.yaml
  - disable-k3s-traefik-daemonset.yaml
  - oneoffs/k3s-traefik-cleanup-job.yaml
  - node-nofile-daemonset.yaml
+  - metis-sentinel-daemonset.yaml
+  - metis-k3s-token-sync-cronjob.yaml
  - k3s-agent-restart-daemonset.yaml
  - pod-cleaner-cronjob.yaml
  - node-image-sweeper-serviceaccount.yaml
  - node-image-sweeper-daemonset.yaml
  - image-sweeper-cronjob.yaml
+  - metis-service.yaml
+  - metis-ingress.yaml
+  - soteria-service.yaml
 images:
  - name: registry.bstein.dev/bstein/ariadne
-    newTag: 0.1.0-59 # {"$imagepolicy": "maintenance:ariadne:tag"}
+    newTag: 0.1.0-22 # {"$imagepolicy": "maintenance:ariadne:tag"}
+  - name: registry.bstein.dev/bstein/metis
+    newTag: 0.1.0-0 # {"$imagepolicy": "maintenance:metis:tag"}
+  - name: registry.bstein.dev/bstein/metis-sentinel
+    newTag: 0.1.0-0 # {"$imagepolicy": "maintenance:metis-sentinel:tag"}
+  - name: registry.bstein.dev/bstein/soteria
+    newTag: 0.1.0-11 # {"$imagepolicy": "maintenance:soteria:tag"}
 configMapGenerator:
  - name: disable-k3s-traefik-script
    namespace: maintenance
--- a/services/maintenance/metis-configmap.yaml
+++ b/services/maintenance/metis-configmap.yaml
@ -0,0 +1,20 @@
+# services/maintenance/metis-configmap.yaml
+# Environment for Metis. Loaded via envFrom by the metis Deployment and by both
+# containers of the metis-sentinel DaemonSet. All values are quoted so they are
+# always read as strings.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  namespace: maintenance
+  name: metis
+data:
+  # Server settings
+  METIS_BIND_ADDR: ":8080"
+  METIS_DATA_DIR: "/var/lib/metis"
+  METIS_INVENTORY_PATH: "/app/inventory.titan-rpi4.yaml"
+  METIS_ALLOWED_GROUPS: "admin,maintainer"
+  # Flash host selection and device size cap
+  METIS_LOCAL_HOST: "titan-22"
+  METIS_DEFAULT_FLASH_HOST: "titan-22"
+  METIS_FLASH_HOSTS: "titan-22"
+  METIS_MAX_DEVICE_BYTES: "300000000000"
+  # Sentinel settings
+  METIS_SENTINEL_PUSH_URL: "http://metis.maintenance.svc.cluster.local/internal/sentinel/snapshot"
+  METIS_SENTINEL_INTERVAL_SEC: "1800"
+  METIS_SENTINEL_NSENTER: "1"
+  # Base image and its checksum
+  METIS_IMAGE_RPI4_ARMBIAN_LONGHORN: "https://armbian.chi.auroradev.org/dl/rpi4b/archive/Armbian_26.2.1_Rpi4b_noble_current_6.18.9_minimal.img.xz"
+  METIS_IMAGE_RPI4_ARMBIAN_LONGHORN_SHA256: "sha256:c450687adf4cc6a59725c43aefd58baf42ec71bdd379227d403cdde281768e46"
--- a/services/maintenance/metis-data-pvc.yaml
+++ b/services/maintenance/metis-data-pvc.yaml
@ -0,0 +1,13 @@
+# services/maintenance/metis-data-pvc.yaml
+# 40Gi ReadWriteOnce claim named metis-data on the "local-path" storage class.
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  namespace: maintenance
+  name: metis-data
+spec:
+  storageClassName: local-path
+  accessModes: [ReadWriteOnce]
+  resources:
+    requests:
+      storage: 40Gi
--- a/services/maintenance/metis-deployment.yaml
+++ b/services/maintenance/metis-deployment.yaml
@ -0,0 +1,47 @@
+# services/maintenance/metis-deployment.yaml
+# Single-replica Metis server pinned to node titan-22. Configuration is loaded
+# from the "metis" ConfigMap.
+# NOTE(review): METIS_DATA_DIR is /var/lib/metis and a metis-data PVC is defined,
+# but no volume is mounted here — confirm whether that is intentional.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  namespace: maintenance
+  name: metis
+spec:
+  replicas: 1
+  revisionHistoryLimit: 3
+  selector:
+    matchLabels:
+      app: metis
+  template:
+    metadata:
+      annotations:
+        prometheus.io/path: "/metrics"
+        prometheus.io/port: "8080"
+        prometheus.io/scrape: "true"
+      labels:
+        app: metis
+    spec:
+      serviceAccountName: metis
+      nodeSelector:
+        kubernetes.io/arch: amd64
+        kubernetes.io/hostname: titan-22
+        node-role.kubernetes.io/worker: "true"
+      containers:
+        - name: metis
+          # The kustomization's images entry for this repository sets the tag.
+          image: registry.bstein.dev/bstein/metis:latest
+          imagePullPolicy: Always
+          ports:
+            - containerPort: 8080
+              name: http
+          envFrom:
+            - configMapRef:
+                name: metis
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop:
+                - ALL
+          resources:
+            limits:
+              cpu: 500m
+              memory: 512Mi
+            requests:
+              cpu: 100m
+              memory: 128Mi
--- a/services/maintenance/metis-ingress.yaml
+++ b/services/maintenance/metis-ingress.yaml
@ -0,0 +1,27 @@
+# services/maintenance/metis-ingress.yaml
+# Routes https://metis.bstein.dev to the metis Service on port 80 through
+# Traefik, with the sso-oauth2-proxy-forward-auth middleware attached.
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  namespace: maintenance
+  name: metis
+  annotations:
+    cert-manager.io/cluster-issuer: letsencrypt
+    kubernetes.io/ingress.class: traefik
+    traefik.ingress.kubernetes.io/router.entrypoints: websecure
+    traefik.ingress.kubernetes.io/router.middlewares: sso-oauth2-proxy-forward-auth@kubernetescrd
+    traefik.ingress.kubernetes.io/router.tls: "true"
+spec:
+  rules:
+    - host: metis.bstein.dev
+      http:
+        paths:
+          - pathType: Prefix
+            path: /
+            backend:
+              service:
+                name: metis
+                port:
+                  number: 80
+  tls:
+    - secretName: metis-tls
+      hosts:
+        - metis.bstein.dev
--- a/services/maintenance/metis-k3s-token-sync-cronjob.yaml
+++ b/services/maintenance/metis-k3s-token-sync-cronjob.yaml
@ -0,0 +1,51 @@
+# services/maintenance/metis-k3s-token-sync-cronjob.yaml
+# Copies the k3s node-token from the host's k3s server directory into the
+# maintenance/metis-runtime Secret (key: k3s_token) at minute 11 of every
+# sixth hour.
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: metis-k3s-token-sync
+  namespace: maintenance
+spec:
+  schedule: "11 */6 * * *"
+  concurrencyPolicy: Forbid
+  successfulJobsHistoryLimit: 1
+  failedJobsHistoryLimit: 2
+  jobTemplate:
+    spec:
+      template:
+        spec:
+          serviceAccountName: metis-token-sync
+          restartPolicy: OnFailure
+          # The script reads the host's k3s server directory, so the pod is
+          # pinned to control-plane nodes and tolerates their taints.
+          nodeSelector:
+            kubernetes.io/arch: arm64
+            node-role.kubernetes.io/control-plane: "true"
+          tolerations:
+            - key: node-role.kubernetes.io/control-plane
+              operator: Exists
+              effect: NoSchedule
+            - key: node-role.kubernetes.io/master
+              operator: Exists
+              effect: NoSchedule
+          containers:
+            - name: sync
+              image: registry.bstein.dev/bstein/kubectl:1.35.0
+              imagePullPolicy: IfNotPresent
+              command:
+                - /bin/sh
+                - -c
+              args:
+                - |
+                  # The script runs under /bin/sh; `set -o pipefail` aborts
+                  # shells that do not implement it, so only -e/-u are used.
+                  set -eu
+                  token="$(tr -d '\n' < /host/var/lib/rancher/k3s/server/node-token)"
+                  # Never overwrite the Secret with an empty value.
+                  if [ -z "${token}" ]; then
+                    echo "node-token is empty; refusing to update metis-runtime" >&2
+                    exit 1
+                  fi
+                  # Render first, then apply: a failed render stops the script
+                  # under `set -e` without needing pipefail.
+                  manifest="$(kubectl -n maintenance create secret generic metis-runtime \
+                    --from-literal=k3s_token="${token}" \
+                    --dry-run=client -o yaml)"
+                  printf '%s\n' "${manifest}" | kubectl apply -f -
+              securityContext:
+                # NOTE(review): presumably root is needed to read the host token file — confirm.
+                runAsUser: 0
+              volumeMounts:
+                - name: k3s-server
+                  mountPath: /host/var/lib/rancher/k3s/server
+                  readOnly: true
+          volumes:
+            - name: k3s-server
+              hostPath:
+                path: /var/lib/rancher/k3s/server
--- a/services/maintenance/metis-rbac.yaml
+++ b/services/maintenance/metis-rbac.yaml
@ -0,0 +1,27 @@
+# services/maintenance/metis-rbac.yaml
+# Cluster-wide read and delete access to Node objects.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: metis-node-manager
+rules:
+  - apiGroups: [""]
+    resources: [nodes]
+    verbs: [get, list, watch, delete]
+---
+# Grants metis-node-manager to the maintenance/metis ServiceAccount.
+# NOTE(review): the metis-sentinel DaemonSet runs under the same ServiceAccount,
+# so its pods also hold node delete — confirm that is intended.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: metis-node-manager
+roleRef:
+  kind: ClusterRole
+  name: metis-node-manager
+  apiGroup: rbac.authorization.k8s.io
+subjects:
+  - kind: ServiceAccount
+    namespace: maintenance
+    name: metis
--- a/services/maintenance/metis-sentinel-daemonset.yaml
+++ b/services/maintenance/metis-sentinel-daemonset.yaml
@ -0,0 +1,133 @@
+# services/maintenance/metis-sentinel-daemonset.yaml
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: metis-sentinel
+  namespace: maintenance
+spec:
+  selector:
+    matchLabels:
+      app: metis-sentinel
+  updateStrategy:
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        app: metis-sentinel
+      annotations:
+        # NOTE(review): the container command below is a shell loop that runs
+        # metis-sentinel once per interval; confirm something in the pod
+        # actually serves /metrics on 8080.
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "8080"
+        prometheus.io/path: "/metrics"
+    spec:
+      # Same ServiceAccount as the metis Deployment.
+      serviceAccountName: metis
+      nodeSelector:
+        kubernetes.io/os: linux
+        node-role.kubernetes.io/worker: "true"
+      containers:
+        # Collector: runs metis-sentinel in a loop and writes each result to
+        # <out_dir>/<node>-<timestamp>.json via a .tmp file that is renamed on
+        # success and removed on failure.
+        - name: metis-sentinel
+          image: registry.bstein.dev/bstein/metis-sentinel:latest
+          imagePullPolicy: Always
+          command:
+            - /bin/sh
+            - -c
+          args:
+            - |
+              set -eu
+              out_dir="${METIS_SENTINEL_OUT:-/var/run/metis-sentinel}"
+              interval="${METIS_SENTINEL_INTERVAL_SEC:-120}"
+              mkdir -p "${out_dir}"
+              while true; do
+                ts="$(date -u +%Y%m%dT%H%M%SZ)"
+                node="${METIS_SENTINEL_NODE:-unknown}"
+                tmp="${out_dir}/${node}-${ts}.json.tmp"
+                out="${out_dir}/${node}-${ts}.json"
+                if metis-sentinel > "${tmp}"; then
+                  mv "${tmp}" "${out}"
+                else
+                  rm -f "${tmp}" || true
+                fi
+                sleep "${interval}"
+              done
+          # METIS_SENTINEL_INTERVAL_SEC comes from the metis ConfigMap.
+          envFrom:
+            - configMapRef:
+                name: metis
+          env:
+            # Node name used in snapshot file names.
+            - name: METIS_SENTINEL_NODE
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
+          ports:
+            - name: http
+              containerPort: 8080
+          volumeMounts:
+            - name: sentinel-output
+              mountPath: /var/run/metis-sentinel
+          resources:
+            requests:
+              cpu: 25m
+              memory: 64Mi
+            limits:
+              cpu: 250m
+              memory: 256Mi
+          # NOTE(review): the ConfigMap sets METIS_SENTINEL_NSENTER=1 while this
+          # container drops all capabilities — confirm the binary can do what
+          # that setting asks for.
+          securityContext:
+            allowPrivilegeEscalation: false
+            runAsUser: 0
+            capabilities:
+              drop: ["ALL"]
+        # Pusher: POSTs every *.json snapshot in the shared directory to
+        # METIS_SENTINEL_PUSH_URL and deletes each file once curl succeeds.
+        # Files that fail to upload stay in place for the next pass; with an
+        # empty push URL nothing is sent.
+        - name: sentinel-pusher
+          image: curlimages/curl:8.12.1
+          imagePullPolicy: IfNotPresent
+          command:
+            - /bin/sh
+            - -c
+          args:
+            - |
+              set -eu
+              out_dir="${METIS_SENTINEL_OUT:-/var/run/metis-sentinel}"
+              push_url="${METIS_SENTINEL_PUSH_URL:-}"
+              interval="${METIS_SENTINEL_PUSH_INTERVAL_SEC:-120}"
+              timeout="${METIS_SENTINEL_PUSH_TIMEOUT_SEC:-10}"
+              mkdir -p "${out_dir}"
+              while true; do
+                for snapshot in "${out_dir}"/*.json; do
+                  [ -f "${snapshot}" ] || continue
+                  if [ -z "${push_url}" ]; then
+                    break
+                  fi
+                  if curl -fsS --connect-timeout "${timeout}" --max-time "${timeout}" \
+                    -X POST \
+                    -H "Content-Type: application/json" \
+                    -H "X-Metis-Node: ${METIS_SENTINEL_NODE:-unknown}" \
+                    --data-binary "@${snapshot}" \
+                    "${push_url}"; then
+                    rm -f "${snapshot}"
+                  fi
+                done
+                sleep "${interval}"
+              done
+          # METIS_SENTINEL_PUSH_URL comes from the metis ConfigMap; the push
+          # interval and timeout are not set there, so the script defaults apply.
+          envFrom:
+            - configMapRef:
+                name: metis
+          env:
+            # Sent as the X-Metis-Node request header.
+            - name: METIS_SENTINEL_NODE
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
+          volumeMounts:
+            - name: sentinel-output
+              mountPath: /var/run/metis-sentinel
+          resources:
+            requests:
+              cpu: 10m
+              memory: 32Mi
+            limits:
+              cpu: 100m
+              memory: 128Mi
+          securityContext:
+            allowPrivilegeEscalation: false
+            runAsUser: 0
+            capabilities:
+              drop: ["ALL"]
+      volumes:
+        # Hand-off directory shared by the two containers.
+        - name: sentinel-output
+          emptyDir: {}
--- a/services/maintenance/metis-service.yaml
+++ b/services/maintenance/metis-service.yaml
@ -0,0 +1,18 @@
+# services/maintenance/metis-service.yaml
+apiVersion: v1
+kind: Service  # Stable in-cluster endpoint for pods labelled app=metis.
+metadata:
+  name: metis
+  namespace: maintenance
+  annotations:  # Scrape hints; presumably consumed by an annotation-based discovery job (verify against the scrape config).
+    prometheus.io/scrape: "true"
+    prometheus.io/port: "80"  # Same number as the Service port below, not a container port number.
+    prometheus.io/path: "/metrics"
+spec:
+  type: ClusterIP  # Cluster-internal virtual IP only; no NodePort/LoadBalancer exposure.
+  selector:
+    app: metis
+  ports:
+    - name: http
+      port: 80
+      targetPort: http  # Named port: resolved against the selected pods' container port called "http".
--- a/services/maintenance/metis-serviceaccount.yaml
+++ b/services/maintenance/metis-serviceaccount.yaml
@ -0,0 +1,6 @@
+# services/maintenance/metis-serviceaccount.yaml
+apiVersion: v1
+kind: ServiceAccount  # Dedicated identity "metis"; no pull secrets or token settings are declared here.
+metadata:
+  name: metis
+  namespace: maintenance
--- a/services/maintenance/metis-token-sync-rbac.yaml
+++ b/services/maintenance/metis-token-sync-rbac.yaml
@ -0,0 +1,30 @@
+# services/maintenance/metis-token-sync-rbac.yaml
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role  # Namespaced Secret read/write permissions (no delete) for the identity bound below.
+metadata:
+  name: metis-token-sync
+  namespace: maintenance
+rules:
+  - apiGroups: [""]  # "" is the core API group, where Secrets live.
+    resources:
+      - secrets  # NOTE(review): no resourceNames, so this covers every Secret in the namespace; confirm that scope is intended.
+    verbs:
+      - get
+      - list
+      - create
+      - update
+      - patch
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding  # Grants the metis-token-sync Role to the ServiceAccount of the same name.
+metadata:
+  name: metis-token-sync
+  namespace: maintenance
+subjects:
+  - kind: ServiceAccount
+    name: metis-token-sync
+    namespace: maintenance
+roleRef:  # Points at the Role defined in this same file.
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: metis-token-sync
--- a/services/maintenance/metis-token-sync-serviceaccount.yaml
+++ b/services/maintenance/metis-token-sync-serviceaccount.yaml
@ -0,0 +1,6 @@
+# services/maintenance/metis-token-sync-serviceaccount.yaml
+apiVersion: v1
+kind: ServiceAccount  # Subject of the metis-token-sync RoleBinding (Secret access in this namespace).
+metadata:
+  name: metis-token-sync
+  namespace: maintenance
--- a/services/maintenance/node-image-sweeper-daemonset.yaml
+++ b/services/maintenance/node-image-sweeper-daemonset.yaml
@ -10,6 +10,8 @@ spec:
      app: node-image-sweeper
  updateStrategy:
    type: RollingUpdate
+    rollingUpdate:
+      maxUnavailable: 100%
  template:
    metadata:
      labels:
@ -29,6 +31,21 @@ spec:
        - name: node-image-sweeper
          image: python:3.12.9-alpine3.20
          command: ["/bin/sh", "/scripts/node_image_sweeper.sh"]
+          env:  # Tunables read by node_image_sweeper.sh; each overrides the script's built-in default.
+            - name: SWEEP_INTERVAL_SEC
+              value: "21600"  # Seconds slept between sweeps in loop mode (6 hours).
+            - name: HIGH_USAGE_PERCENT
+              value: "70"  # At or above this /host usage %, the shorter cutoff window below is used.
+            - name: EMERGENCY_USAGE_PERCENT
+              value: "80"  # At or above this usage %, the extra journal/log/apt cleanup runs (script default is 85).
+            - name: BASE_THRESHOLD_DAYS
+              value: "14"  # Cutoff window in days under normal usage; converted to a timestamp for the prune selection.
+            - name: HIGH_USAGE_THRESHOLD_DAYS
+              value: "3"  # Cutoff window in days once HIGH_USAGE_PERCENT is reached.
+            - name: LOG_RETENTION_DAYS
+              value: "7"  # Emergency pass deletes /var/log *.gz and pod *.log files older than this many days.
+            - name: JOURNAL_MAX_SIZE
+              value: "200M"  # Passed to journalctl --vacuum-size during the emergency pass.
          securityContext:
            privileged: true
            runAsUser: 0
--- a/services/maintenance/scripts/node_image_sweeper.sh
+++ b/services/maintenance/scripts/node_image_sweeper.sh
@ -2,26 +2,39 @@
 set -eu

 ONE_SHOT=${ONE_SHOT:-false}
-THRESHOLD_DAYS=14
+SWEEP_INTERVAL_SEC=${SWEEP_INTERVAL_SEC:-21600}
+BASE_THRESHOLD_DAYS=${BASE_THRESHOLD_DAYS:-14}
+HIGH_USAGE_THRESHOLD_DAYS=${HIGH_USAGE_THRESHOLD_DAYS:-3}
+HIGH_USAGE_PERCENT=${HIGH_USAGE_PERCENT:-70}
+EMERGENCY_USAGE_PERCENT=${EMERGENCY_USAGE_PERCENT:-85}
+LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-7}
+JOURNAL_MAX_SIZE=${JOURNAL_MAX_SIZE:-200M}
+SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"

-usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
-if [ -n "${usage}" ] && [ "${usage}" -ge 70 ]; then
-  THRESHOLD_DAYS=3
-fi
+sweep_once() {
+  usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
+  threshold_days="${BASE_THRESHOLD_DAYS}"
+  if [ -n "${usage}" ] && [ "${usage}" -ge "${HIGH_USAGE_PERCENT}" ]; then
+    threshold_days="${HIGH_USAGE_THRESHOLD_DAYS}"
+  fi

-cutoff=$(python3 - <<'PY'
-import time, os
-print(int(time.time()) - int(os.environ.get("THRESHOLD_DAYS", "14")) * 86400)
+  cutoff=$(THRESHOLD_DAYS="${threshold_days}" python3 - <<'PY'
+import os
+import time
+
+days = int(os.environ.get("THRESHOLD_DAYS", "14"))
+print(int(time.time()) - days * 86400)
 PY
 )

-RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
-IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
+  RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
+  IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')

-SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
-
-prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
-import json, os, sys, time
+  prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
+import json
+import os
+import sys
+import time

 try:
    data = json.load(sys.stdin)
@ -74,19 +87,33 @@ for p in prune:
 PY
 )

-if [ -n "${prune_list}" ]; then
-    printf "%s" "${prune_list}" | while read -r image_id; do
+  if [ -n "${prune_list}" ]; then
+    printf "%s\n" "${prune_list}" | while read -r image_id; do  # "\n" required: $(...) stripped it, and read fails on an unterminated last line, skipping the final ID
      if [ -n "${image_id}" ]; then
-        chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true
+        chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true  # NOTE(review): --prune appears to remove every unused image, not only this ID; confirm intent
      fi
    done
-fi
+  fi

-find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
-find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true
+  find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
+  find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true
+
+  if [ -n "${usage}" ] && [ "${usage}" -ge "${EMERGENCY_USAGE_PERCENT}" ]; then
+    # Emergency pass for rootfs pressure on SD-backed nodes.
+    chroot /host /bin/sh -c "journalctl --vacuum-size='${JOURNAL_MAX_SIZE}' >/dev/null 2>&1 || true"
+    find /host/var/log -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
+    find /host/var/log/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
+    chroot /host /bin/sh -c "if command -v apt-get >/dev/null 2>&1; then apt-get clean >/dev/null 2>&1 || true; fi"
+  fi
+}
+
+sweep_once

 if [ "${ONE_SHOT}" = "true" ]; then
  exit 0
 fi

-sleep infinity
+while true; do  # Long-running mode: periodic sweeps take the place of the former "sleep infinity".
+  sleep "${SWEEP_INTERVAL_SEC}"  # The first sweep already ran above, so wait before repeating.
+  sweep_once
+done
--- a/services/maintenance/soteria-configmap.yaml
+++ b/services/maintenance/soteria-configmap.yaml
@ -0,0 +1,10 @@
+# services/maintenance/soteria-configmap.yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: soteria
+  namespace: maintenance
+data:  # Injected into the soteria container as environment variables via envFrom.
+  SOTERIA_BACKUP_DRIVER: "longhorn"
+  SOTERIA_LONGHORN_URL: "http://longhorn-backend.longhorn-system.svc:9500"  # In-cluster DNS name: Service longhorn-backend in namespace longhorn-system.
+  SOTERIA_LONGHORN_BACKUP_MODE: "incremental"
--- a/services/maintenance/soteria-deployment.yaml
+++ b/services/maintenance/soteria-deployment.yaml
@ -0,0 +1,73 @@
+# services/maintenance/soteria-deployment.yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: soteria
+  namespace: maintenance
+spec:
+  replicas: 1
+  revisionHistoryLimit: 3  # Keep only the three most recent old ReplicaSets for rollback.
+  selector:
+    matchLabels:
+      app: soteria
+  template:
+    metadata:
+      labels:
+        app: soteria
+    spec:
+      serviceAccountName: soteria
+      nodeSelector:  # Hard requirement: arm64 nodes carrying the worker role label.
+        kubernetes.io/arch: arm64
+        node-role.kubernetes.io/worker: "true"
+      affinity:
+        nodeAffinity:
+          preferredDuringSchedulingIgnoredDuringExecution:  # Soft preference only: rpi5 (weight 90) is favoured over rpi4 (weight 50).
+            - weight: 90
+              preference:
+                matchExpressions:
+                  - key: hardware
+                    operator: In
+                    values: ["rpi5"]
+            - weight: 50
+              preference:
+                matchExpressions:
+                  - key: hardware
+                    operator: In
+                    values: ["rpi4"]
+      containers:
+        - name: soteria
+          image: registry.bstein.dev/bstein/soteria:latest  # NOTE(review): mutable tag; with Always below, each pod start may pull a different build. Confirm this is intended.
+          imagePullPolicy: Always
+          ports:
+            - name: http  # Port name used by both probes below and by the soteria Service's targetPort.
+              containerPort: 8080
+          envFrom:  # Every key of ConfigMap "soteria" becomes an environment variable.
+            - configMapRef:
+                name: soteria
+          livenessProbe:  # Container is restarted when GET /healthz keeps failing.
+            httpGet:
+              path: /healthz
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 10
+            timeoutSeconds: 2
+          readinessProbe:  # Pod is removed from Service endpoints while GET /readyz fails.
+            httpGet:
+              path: /readyz
+              port: http
+            initialDelaySeconds: 2
+            periodSeconds: 5
+            timeoutSeconds: 2
+          resources:
+            requests:
+              cpu: 50m
+              memory: 64Mi
+            limits:
+              cpu: 200m
+              memory: 256Mi
+          securityContext:  # Non-root (UID 65532), no privilege escalation, all Linux capabilities dropped.
+            allowPrivilegeEscalation: false
+            runAsNonRoot: true
+            runAsUser: 65532
+            capabilities:
+              drop: ["ALL"]
--- a/services/maintenance/soteria-rbac.yaml
+++ b/services/maintenance/soteria-rbac.yaml
@ -0,0 +1,22 @@
+# services/maintenance/soteria-rbac.yaml
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole  # Read-only (get/list) access to PVCs and PVs.
+metadata:
+  name: soteria
+rules:
+  - apiGroups: [""]  # "" is the core API group.
+    resources: ["persistentvolumeclaims", "persistentvolumes"]  # PersistentVolumes are cluster-scoped, which a namespaced Role cannot grant.
+    verbs: ["get", "list"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding  # Binds the ClusterRole to ServiceAccount soteria (namespace maintenance) cluster-wide.
+metadata:
+  name: soteria
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: soteria
+subjects:
+  - kind: ServiceAccount
+    name: soteria
+    namespace: maintenance
--- a/services/maintenance/soteria-service.yaml
+++ b/services/maintenance/soteria-service.yaml
@ -0,0 +1,14 @@
+# services/maintenance/soteria-service.yaml
+apiVersion: v1
+kind: Service  # Stable in-cluster endpoint for pods labelled app=soteria.
+metadata:
+  name: soteria
+  namespace: maintenance
+spec:
+  type: ClusterIP  # Cluster-internal virtual IP only; no NodePort/LoadBalancer exposure.
+  selector:
+    app: soteria
+  ports:
+    - name: http
+      port: 80
+      targetPort: http  # Named port: maps to the soteria container's "http" port (8080 in the Deployment).
--- a/services/maintenance/soteria-serviceaccount.yaml
+++ b/services/maintenance/soteria-serviceaccount.yaml
@ -0,0 +1,8 @@
+# services/maintenance/soteria-serviceaccount.yaml
+apiVersion: v1
+kind: ServiceAccount  # Identity used by the soteria Deployment and subject of the soteria ClusterRoleBinding.
+metadata:
+  name: soteria
+  namespace: maintenance
+imagePullSecrets:  # Pods running as this ServiceAccount get this registry credential attached automatically.
+  - name: harbor-regcred  # Secret is not defined in this file; it must exist in the maintenance namespace.
--- a/services/monitoring/dashboards/atlas-gpu.json
+++ b/services/monitoring/dashboards/atlas-gpu.json
@ -20,7 +20,7 @@
      },
      "targets": [
        {
-          "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * 
scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
+          "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) 
(label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
          "refId": "A",
          "legendFormat": "{{namespace}}"
        }
@ -89,7 +89,7 @@
      },
      "targets": [
        {
-          "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
+          "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
          "refId": "A",
          "legendFormat": "{{namespace}}"
        }
--- a/services/monitoring/dashboards/atlas-jobs.json
+++ b/services/monitoring/dashboards/atlas-jobs.json
@ -1125,7 +1125,7 @@
    {
      "id": 17,
      "type": "stat",
-      "title": "Ariadne CI Coverage (%)",
+      "title": "Platform CI Coverage (%)",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
@ -1138,7 +1138,7 @@
      },
      "targets": [
        {
-          "expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}",
+          "expr": "ariadne_ci_coverage_percent{repo=~\"ariadne|metis\"}",
          "refId": "A",
          "legendFormat": "{{branch}}",
          "instant": true
@ -1183,12 +1183,13 @@
          "values": false
        },
        "textMode": "value"
-      }
+      },
+      "description": "Internal source panel for Atlas Overview automation test rollups."
    },
    {
      "id": 18,
      "type": "table",
-      "title": "Ariadne CI Tests (latest)",
+      "title": "Platform CI Tests (latest)",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
@ -1201,7 +1202,7 @@
      },
      "targets": [
        {
-          "expr": "ariadne_ci_tests_total{repo=\"ariadne\"}",
+          "expr": "ariadne_ci_tests_total{repo=~\"ariadne|metis\"}",
          "refId": "A",
          "instant": true
        }
@ -1233,7 +1234,8 @@
            "order": "desc"
          }
        }
-      ]
+      ],
+      "description": "Atlas Overview test panels depend on these internal repo-tagged CI series."
    }
  ],
  "time": {
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@ -1677,7 +1677,7 @@
    {
      "id": 42,
      "type": "timeseries",
-      "title": "Ariadne Test Success Rate",
+      "title": "Platform Test Success Rate",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
@ -1690,7 +1690,7 @@
      },
      "targets": [
        {
-          "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)",
+          "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)",
          "refId": "A"
        }
      ],
@ -1709,12 +1709,13 @@
        "tooltip": {
          "mode": "multi"
        }
-      }
+      },
+      "description": "Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. Add new test series there first so they roll up here."
    },
    {
      "id": 43,
      "type": "bargauge",
-      "title": "Tests with Failures (24h)",
+      "title": "Platform Tests with Failures (24h)",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
@ -1727,7 +1728,7 @@
      },
      "targets": [
        {
-          "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))",
+          "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h])))",
          "refId": "A",
          "legendFormat": "{{result}}",
          "instant": true
@ -1814,7 +1815,8 @@
            "order": "desc"
          }
        }
-      ]
+      ],
+      "description": "This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query."
    },
    {
      "id": 11,
@ -1901,7 +1903,7 @@
      },
      "targets": [
        {
-          "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * 
scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
+          "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) 
(label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
          "refId": "A",
          "legendFormat": "{{namespace}}"
        }
--- a/services/monitoring/grafana-alerting-config.yaml
+++ b/services/monitoring/grafana-alerting-config.yaml
@ -22,7 +22,24 @@ data:
      - orgId: 1
        receiver: email-admins
        group_by:
+          - grafana_folder
          - alertname
+        group_wait: 1m
+        group_interval: 30m
+        repeat_interval: 12h
+        routes:
+          - receiver: email-admins
+            object_matchers:
+              - [severity, "=", "critical"]
+            group_wait: 30s
+            group_interval: 5m
+            repeat_interval: 2h
+          - receiver: email-admins
+            object_matchers:
+              - [severity, "=", "warning"]
+            group_wait: 5m
+            group_interval: 2h
+            repeat_interval: 24h
  rules.yaml: |
    apiVersion: 1
    groups:
@ -32,7 +49,7 @@ data:
        interval: 1m
        rules:
          - uid: disk-pressure-root
-            title: "Node rootfs high (>80%)"
+            title: "Node rootfs high (>85%)"
            condition: C
            for: "10m"
            data:
@ -66,7 +83,7 @@ data:
                  type: threshold
                  conditions:
                    - evaluator:
-                        params: [80]
+                        params: [85]
                        type: gt
                      operator:
                        type: and
@ -76,7 +93,7 @@ data:
            noDataState: NoData
            execErrState: Error
            annotations:
-              summary: "{{ $labels.node }} rootfs >80% for 10m"
+              summary: "{{ $labels.node }} rootfs >85% for 10m"
            labels:
              severity: warning
          - uid: disk-growth-1h
@ -145,7 +162,7 @@ data:
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
-                  expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")
+                  expr: avg_over_time((1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100)[10m:1m] * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")
                  legendFormat: '{{instance}}'
                  datasource:
                    type: prometheus
@ -286,8 +303,8 @@ data:
              summary: "node-image-sweeper not fully ready"
            labels:
              severity: warning
-          - uid: maint-cron-stale
-            title: "Maintenance CronJobs stale (>3h since success)"
+          - uid: maint-ariadne-image-sweeper-stale
+            title: "Ariadne image sweeper stale (schedule >8d)"
            condition: C
            for: "5m"
            data:
@ -297,10 +314,10 @@ data:
                  to: 0
                datasourceUid: atlas-vm
                model:
-                  expr: time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"}) and on(cronjob) (kube_cronjob_spec_suspend{namespace="maintenance",cronjob="image-sweeper"} == 0)
+                  expr: time() - ariadne_schedule_last_success_timestamp_seconds{task="schedule.image_sweeper"}
                  intervalMs: 60000
                  maxDataPoints: 43200
-                  legendFormat: '{{cronjob}}'
+                  legendFormat: '{{task}}'
                  datasource:
                    type: prometheus
                    uid: atlas-vm
@ -321,17 +338,166 @@ data:
                  type: threshold
                  conditions:
                    - evaluator:
-                        params: [10800]
+                        params: [691200]
                        type: gt
                      operator:
                        type: and
                      reducer:
                        type: last
                      type: query
-            noDataState: NoData
+            noDataState: OK
            execErrState: Error
            annotations:
-              summary: "Maintenance cronjob stale >3h since last success"
+              summary: "Ariadne image sweeper stale >8d since last success"
+            labels:
+              severity: warning
+          # Legacy rule kept under its original uid but disabled (see title/summary).
+          # Query A is the constant vector(0) and condition C only trips when the
+          # reduced value is > 1, so the threshold can never be met. noDataState and
+          # execErrState are both OK, so missing data or a query error does not
+          # raise it either.
+          - uid: maint-cron-stale
+            title: "Maintenance CronJobs stale (legacy disabled)"
+            condition: C
+            for: "5m"
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: atlas-vm
+                model:
+                  expr: vector(0)
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  legendFormat: legacy
+                  datasource:
+                    type: prometheus
+                    uid: atlas-vm
+              - refId: B
+                datasourceUid: __expr__
+                model:
+                  expression: A
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  reducer: last
+                  type: reduce
+              - refId: C
+                datasourceUid: __expr__
+                model:
+                  expression: B
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  type: threshold
+                  conditions:
+                    - evaluator:
+                        params: [1]
+                        type: gt
+                      operator:
+                        type: and
+                      reducer:
+                        type: last
+                      type: query
+            noDataState: OK
+            execErrState: OK
+            annotations:
+              summary: "Legacy cronjob alert disabled"
+            labels:
+              severity: info
+      - orgId: 1
+        name: ariadne
+        folder: Alerts
+        interval: 1m
+        rules:
+          # Warning alert per task: query A takes max by (task) of
+          # ariadne_schedule_last_status for tasks named "schedule.<something>",
+          # B reduces each series to its last value, and C trips when that value is
+          # below 1. The condition must hold for 10m before the alert fires.
+          # Missing series count as OK; a failing query surfaces as Error.
+          # NOTE(review): presumably a status of 1 means the last run succeeded --
+          # confirm against the Ariadne exporter.
+          - uid: ariadne-schedule-error
+            title: "Ariadne schedule task failed"
+            condition: C
+            for: "10m"
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: atlas-vm
+                model:
+                  expr: max by (task) (ariadne_schedule_last_status{task=~"schedule\\..+"})
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  legendFormat: '{{task}}'
+                  datasource:
+                    type: prometheus
+                    uid: atlas-vm
+              - refId: B
+                datasourceUid: __expr__
+                model:
+                  expression: A
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  reducer: last
+                  type: reduce
+              - refId: C
+                datasourceUid: __expr__
+                model:
+                  expression: B
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  type: threshold
+                  conditions:
+                    - evaluator:
+                        params: [1]
+                        type: lt
+                      operator:
+                        type: and
+                      reducer:
+                        type: last
+                      type: query
+            noDataState: OK
+            execErrState: Error
+            annotations:
+              summary: "Ariadne schedule failed ({{ $labels.task }})"
+            labels:
+              severity: warning
+          # Warning alert per task: query A computes time() minus
+          # ariadne_schedule_next_run_timestamp_seconds for tasks named
+          # "schedule.<something>", i.e. how far in the past the advertised next run
+          # is. C trips above 900 seconds (the 15m in the title) and the condition
+          # must hold for 10m. Missing series count as OK; a failing query surfaces
+          # as Error.
+          - uid: ariadne-scheduler-stalled
+            title: "Ariadne scheduler behind (>15m)"
+            condition: C
+            for: "10m"
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: atlas-vm
+                model:
+                  expr: time() - ariadne_schedule_next_run_timestamp_seconds{task=~"schedule\\..+"}
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  legendFormat: '{{task}}'
+                  datasource:
+                    type: prometheus
+                    uid: atlas-vm
+              - refId: B
+                datasourceUid: __expr__
+                model:
+                  expression: A
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  reducer: last
+                  type: reduce
+              - refId: C
+                datasourceUid: __expr__
+                model:
+                  expression: B
+                  intervalMs: 60000
+                  maxDataPoints: 43200
+                  type: threshold
+                  conditions:
+                    - evaluator:
+                        params: [900]
+                        type: gt
+                      operator:
+                        type: and
+                      reducer:
+                        type: last
+                      type: query
+            noDataState: OK
+            execErrState: Error
+            annotations:
+              summary: "Ariadne scheduler behind for {{ $labels.task }}"
            labels:
              severity: warning
      - orgId: 1
@ -352,7 +518,7 @@ data:
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
-                  expr: POSTMARK_OUTBOUND_BOUNCE_RATE{window="1d"}
+                  expr: max(postmark_outbound_bounce_rate{window="1d"}) or on() vector(0)
                  legendFormat: bounce 1d
                  datasource:
                    type: prometheus
@ -381,7 +547,7 @@ data:
                      reducer:
                        type: last
                      type: query
-            noDataState: NoData
+            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Postmark 1d bounce rate >5%"
@ -400,7 +566,7 @@ data:
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
-                  expr: POSTMARK_API_UP
+                  expr: max(postmark_api_up) or on() vector(0)
                  legendFormat: api up
                  datasource:
                    type: prometheus
@ -429,7 +595,7 @@ data:
                      reducer:
                        type: last
                      type: query
-            noDataState: NoData
+            noDataState: OK
            execErrState: Error
            annotations:
              summary: "Postmark exporter reports API down"
--- a/services/monitoring/grafana-dashboard-gpu.yaml
+++ b/services/monitoring/grafana-dashboard-gpu.yaml
@ -29,7 +29,7 @@ data:
          },
          "targets": [
            {
-              "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * 
scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
+              "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) 
(label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
              "refId": "A",
              "legendFormat": "{{namespace}}"
            }
@ -98,7 +98,7 @@ data:
          },
          "targets": [
            {
-              "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
+              "expr": "sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))",
              "refId": "A",
              "legendFormat": "{{namespace}}"
            }
--- a/services/monitoring/grafana-dashboard-jobs.yaml
+++ b/services/monitoring/grafana-dashboard-jobs.yaml
@ -1134,7 +1134,7 @@ data:
        {
          "id": 17,
          "type": "stat",
-          "title": "Ariadne CI Coverage (%)",
+          "title": "Platform CI Coverage (%)",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
@ -1147,7 +1147,7 @@ data:
          },
          "targets": [
            {
-              "expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}",
+              "expr": "ariadne_ci_coverage_percent{repo=~\"ariadne|metis\"}",
              "refId": "A",
              "legendFormat": "{{branch}}",
              "instant": true
@ -1192,12 +1192,13 @@ data:
              "values": false
            },
            "textMode": "value"
-          }
+          },
+          "description": "Internal source panel for Atlas Overview automation test rollups."
        },
        {
          "id": 18,
          "type": "table",
-          "title": "Ariadne CI Tests (latest)",
+          "title": "Platform CI Tests (latest)",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
@ -1210,7 +1211,7 @@ data:
          },
          "targets": [
            {
-              "expr": "ariadne_ci_tests_total{repo=\"ariadne\"}",
+              "expr": "ariadne_ci_tests_total{repo=~\"ariadne|metis\"}",
              "refId": "A",
              "instant": true
            }
@ -1242,7 +1243,8 @@ data:
                "order": "desc"
              }
            }
-          ]
+          ],
+          "description": "Atlas Overview test panels depend on these internal repo-tagged CI series."
        }
      ],
      "time": {
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@ -1686,7 +1686,7 @@ data:
        {
          "id": 42,
          "type": "timeseries",
-          "title": "Ariadne Test Success Rate",
+          "title": "Platform Test Success Rate",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
@ -1699,7 +1699,7 @@ data:
          },
          "targets": [
            {
-              "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)",
+              "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)",
              "refId": "A"
            }
          ],
@ -1718,12 +1718,13 @@ data:
            "tooltip": {
              "mode": "multi"
            }
-          }
+          },
+          "description": "Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. Add new test series there first so they roll up here."
        },
        {
          "id": 43,
          "type": "bargauge",
-          "title": "Tests with Failures (24h)",
+          "title": "Platform Tests with Failures (24h)",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
@ -1736,7 +1737,7 @@ data:
          },
          "targets": [
            {
-              "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))",
+              "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h])))",
              "refId": "A",
              "legendFormat": "{{result}}",
              "instant": true
@ -1823,7 +1824,8 @@ data:
                "order": "desc"
              }
            }
-          ]
+          ],
+          "description": "This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query."
        },
        {
          "id": 11,
@ -1910,7 +1912,7 @@ data:
          },
          "targets": [
            {
-              "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * 
scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))) / clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia.com/gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() (kube_node_labels{label_accelerator=~\".+\"} or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
+              "expr": "(100 * (sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) / clamp_min((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) 
(label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") * scalar((sum(sum by (namespace) ((sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))) / on(node) group_left() clamp_min(sum by (node) (sum by (namespace,node) (kube_pod_container_resource_requests{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\",$namespace_scope_gpu} * on(namespace,pod) group_left(node) kube_pod_info * on(node) group_left() ((max by (node) (kube_node_status_allocatable{resource=~\"nvidia[.]com/gpu.*|nvidia_com_gpu.*\"} > bool 0)) or kube_node_labels{label_jetson=\"true\"}))), 1) * on(node) group_left() (avg by (node) (label_replace(label_replace(DCGM_FI_DEV_GPU_UTIL, \"pod\", \"$1\", \"Hostname\", \"(.*)\"), \"namespace\", \"monitoring\", \"\", \"\") * on(namespace,pod) group_left(node) kube_pod_info{namespace=\"monitoring\"}) or max by (node) (jetson_gr3d_freq_percent{node!=\"\"})))) or on() vector(0)) == bool 0))",
              "refId": "A",
              "legendFormat": "{{namespace}}"
            }
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@ -286,7 +286,7 @@ spec:
    podAnnotations:
      vault.hashicorp.com/agent-inject: "true"
      vault.hashicorp.com/role: "monitoring"
-      monitoring.bstein.dev/restart-rev: "1"
+      monitoring.bstein.dev/restart-rev: "6"
      vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
      vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
        {{ with secret "kv/data/atlas/monitoring/grafana-admin" }}
--- a/services/vault/k8s-auth-config-cronjob.yaml
+++ b/services/vault/k8s-auth-config-cronjob.yaml
@ -43,6 +43,12 @@ spec:
                  value: /var/run/secrets/vault-token-reviewer/token
                - name: VAULT_K8S_ROLE_TTL
                  value: 1h
+                - name: VAULT_K8S_BOUND_AUDIENCES
+                  value: "https://kubernetes.default.svc,https://kubernetes.default.svc.cluster.local,k3s"
+                - name: VAULT_K8S_ISSUER
+                  value: https://kubernetes.default.svc.cluster.local
+                - name: VAULT_K8S_DISABLE_ISS_VALIDATION
+                  value: "false"
              volumeMounts:
                - name: k8s-auth-config-script
                  mountPath: /scripts
--- a/services/vault/scripts/vault_k8s_auth_configure.sh
+++ b/services/vault/scripts/vault_k8s_auth_configure.sh
@ -53,6 +53,8 @@ ensure_token
 k8s_host="https://${KUBERNETES_SERVICE_HOST}:443"
 k8s_ca="$(cat /var/run/secrets/kubernetes.io/serviceaccount/ca.crt)"
 k8s_token="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
+k8s_issuer="${VAULT_K8S_ISSUER:-}"
+disable_iss_validation="${VAULT_K8S_DISABLE_ISS_VALIDATION:-true}"
 role_ttl="${VAULT_K8S_ROLE_TTL:-1h}"
 token_reviewer_jwt="${VAULT_K8S_TOKEN_REVIEWER_JWT:-}"

@ -68,11 +70,36 @@ if ! vault_cmd auth list -format=json | grep -q '"kubernetes/"'; then
  vault_cmd auth enable kubernetes
 fi

+ensure_default_policy_login() {
+  default_policy="$(vault_cmd policy read default)"
+  if printf '%s' "${default_policy}" | grep -q 'auth/kubernetes/login'; then
+    return
+  fi
+  log "updating default policy to allow kubernetes login"
+  default_policy="${default_policy}
+path \"auth/kubernetes/login\" {
+  capabilities = [\"create\", \"update\"]
+}
+"
+  printf '%s\n' "${default_policy}" | vault_cmd policy write default -
+}
+
 log "configuring kubernetes auth"
-vault_cmd write auth/kubernetes/config \
+# issuer and disable_iss_validation are only sent when VAULT_K8S_ISSUER is set;
+# with no issuer the write carries just the reviewer JWT, API host and CA cert,
+# and VAULT_K8S_DISABLE_ISS_VALIDATION is not applied.
+if [ -n "${k8s_issuer}" ]; then
+  vault_cmd write auth/kubernetes/config \
+    token_reviewer_jwt="${token_reviewer_jwt}" \
+    kubernetes_host="${k8s_host}" \
+    kubernetes_ca_cert="${k8s_ca}" \
+    issuer="${k8s_issuer}" \
+    disable_iss_validation="${disable_iss_validation}"
+else
+  vault_cmd write auth/kubernetes/config \
    token_reviewer_jwt="${token_reviewer_jwt}" \
    kubernetes_host="${k8s_host}" \
    kubernetes_ca_cert="${k8s_ca}"
+fi
+
+ensure_default_policy_login

 write_raw_policy() {
  name="$1"
@ -87,6 +114,7 @@ write_policy_and_role() {
  service_accounts="$3"
  read_paths="$4"
  write_paths="$5"
+  audiences="${VAULT_K8S_BOUND_AUDIENCES:-}"

  policy_body=""
  for path in ${read_paths}; do
@ -109,11 +137,42 @@ path \"kv/metadata/atlas/${path}\" {
 }
 "
  done
+  # The maintenance role additionally gets Vault administration paths: reading
+  # sys/auth, managing sys/auth/* (including sudo), the kubernetes and oidc
+  # auth backends, and listing/reading/writing ACL policies.
+  # NOTE(review): create/update on sys/policies/acl/* permits writing arbitrary
+  # policies, so a holder of this role can broaden its own access -- confirm
+  # this scope is intended for every service account bound to the role.
+  if [ "${role}" = "maintenance" ]; then
+    policy_body="${policy_body}
+path \"sys/auth\" {
+  capabilities = [\"read\"]
+}
+path \"sys/auth/*\" {
+  capabilities = [\"create\", \"update\", \"read\", \"sudo\"]
+}
+path \"auth/kubernetes/*\" {
+  capabilities = [\"create\", \"update\", \"read\"]
+}
+path \"auth/oidc/*\" {
+  capabilities = [\"create\", \"update\", \"read\"]
+}
+path \"sys/policies/acl\" {
+  capabilities = [\"list\"]
+}
+path \"sys/policies/acl/*\" {
+  capabilities = [\"create\", \"update\", \"read\"]
+}
+"
+  fi

  log "writing policy ${role}"
  printf '%s\n' "${policy_body}" | vault_cmd policy write "${role}" -

  log "writing role ${role}"
+  # When VAULT_K8S_BOUND_AUDIENCES is non-empty, write the role with an audience
+  # binding and return early; otherwise fall through to the write below.
+  # NOTE(review): Vault's Kubernetes auth role API documents this field as
+  # `audience` (a single string) -- confirm `bound_service_account_audiences`
+  # is accepted by the deployed Vault, otherwise the binding may be ignored.
+  if [ -n "${audiences}" ]; then
+    vault_cmd write "auth/kubernetes/role/${role}" \
+      bound_service_account_audiences="${audiences}" \
+      bound_service_account_names="${service_accounts}" \
+      bound_service_account_namespaces="${namespace}" \
+      policies="${role}" \
+      ttl="${role_ttl}"
+    return
+  fi
  vault_cmd write "auth/kubernetes/role/${role}" \
    bound_service_account_names="${service_accounts}" \
    bound_service_account_namespaces="${namespace}" \
@ -218,6 +277,8 @@ write_policy_and_role "nextcloud" "nextcloud" "nextcloud-vault" \
  "nextcloud/* shared/keycloak-admin shared/postmark-relay" ""
 write_policy_and_role "comms" "comms" "comms-vault,atlasbot" \
  "comms/* shared/chat-ai-keys-runtime shared/harbor-pull" ""
+write_policy_and_role "ai" "ai" "atlasbot" \
+  "comms/* shared/chat-ai-keys-runtime shared/harbor-pull" ""
 write_policy_and_role "jenkins" "jenkins" "jenkins,jenkins-vault-sync" \
  "jenkins/* shared/harbor-pull" ""
 write_policy_and_role "monitoring" "monitoring" "monitoring-vault-sync" \
@ -231,7 +292,7 @@ write_policy_and_role "crypto" "crypto" "crypto-vault-sync" \
 write_policy_and_role "health" "health" "health-vault-sync" \
  "health/*" ""
 write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync" \
-  "maintenance/ariadne-db portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin vault/vault-oidc-config shared/harbor-pull" ""
+  "maintenance/ariadne-db maintenance/soteria-restic portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin vault/vault-oidc-config shared/harbor-pull" ""
 write_policy_and_role "finance" "finance" "finance-vault" \
  "finance/* shared/postmark-relay" ""
 write_policy_and_role "finance-secrets" "finance" "finance-secrets-ensure" \