Compare commits

main...feature/postgres-migration

No commits in common. "main" and "feature/postgres-migration" have entirely different histories.

402 changed files with 7785 additions and 33247 deletions
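Because the two branches share no merge base, every file is compared tree-to-tree rather than against a common ancestor, which accounts for the large totals above. A quick local check, sketched here on the assumption that both branches are fetched under the names shown:

# prints nothing and exits non-zero when the branches have no common ancestor
git merge-base main feature/postgres-migration || echo "no common ancestor"

# direct tree-to-tree comparison; the totals should match the 402-file summary above
git diff --stat main feature/postgres-migration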

2
.gitignore vendored
View File

@ -6,5 +6,3 @@ __pycache__/
*.py[cod]
.pytest_cache
.venv
.venv-ci
tmp/

77
Jenkinsfile vendored
View File

@ -1,77 +0,0 @@
// Mirror of ci/Jenkinsfile.titan-iac for multibranch discovery.
pipeline {
agent {
kubernetes {
defaultContainer 'python'
yaml """
apiVersion: v1
kind: Pod
spec:
nodeSelector:
hardware: rpi5
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
containers:
- name: python
image: python:3.12-slim
command:
- cat
tty: true
"""
}
}
environment {
PIP_DISABLE_PIP_VERSION_CHECK = '1'
PYTHONUNBUFFERED = '1'
}
stages {
stage('Checkout') {
steps {
checkout scm
}
}
stage('Install deps') {
steps {
sh 'pip install --no-cache-dir -r ci/requirements.txt'
}
}
stage('Glue tests') {
steps {
sh 'pytest -q ci/tests/glue'
}
}
stage('Resolve Flux branch') {
steps {
script {
env.FLUX_BRANCH = sh(
returnStdout: true,
script: "awk '/branch:/{print $2; exit}' clusters/atlas/flux-system/gotk-sync.yaml"
).trim()
if (!env.FLUX_BRANCH) {
error('Flux branch not found in gotk-sync.yaml')
}
echo "Flux branch: ${env.FLUX_BRANCH}"
}
}
}
stage('Promote') {
when {
expression {
def branch = env.BRANCH_NAME ?: (env.GIT_BRANCH ?: '').replaceFirst('origin/', '')
return env.FLUX_BRANCH && branch == env.FLUX_BRANCH
}
}
steps {
withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) {
sh '''
set +x
git config user.email "jenkins@bstein.dev"
git config user.name "jenkins"
git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git
git push origin HEAD:${FLUX_BRANCH}
'''
}
}
}
}
}

View File

@ -1,76 +0,0 @@
pipeline {
agent {
kubernetes {
defaultContainer 'python'
yaml """
apiVersion: v1
kind: Pod
spec:
nodeSelector:
hardware: rpi5
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
containers:
- name: python
image: python:3.12-slim
command:
- cat
tty: true
"""
}
}
environment {
PIP_DISABLE_PIP_VERSION_CHECK = '1'
PYTHONUNBUFFERED = '1'
}
stages {
stage('Checkout') {
steps {
checkout scm
}
}
stage('Install deps') {
steps {
sh 'pip install --no-cache-dir -r ci/requirements.txt'
}
}
stage('Glue tests') {
steps {
sh 'pytest -q ci/tests/glue'
}
}
stage('Resolve Flux branch') {
steps {
script {
env.FLUX_BRANCH = sh(
returnStdout: true,
script: "awk '/branch:/{print $2; exit}' clusters/atlas/flux-system/gotk-sync.yaml"
).trim()
if (!env.FLUX_BRANCH) {
error('Flux branch not found in gotk-sync.yaml')
}
echo "Flux branch: ${env.FLUX_BRANCH}"
}
}
}
stage('Promote') {
when {
expression {
def branch = env.BRANCH_NAME ?: (env.GIT_BRANCH ?: '').replaceFirst('origin/', '')
return env.FLUX_BRANCH && branch == env.FLUX_BRANCH
}
}
steps {
withCredentials([usernamePassword(credentialsId: 'gitea-pat', usernameVariable: 'GIT_USER', passwordVariable: 'GIT_TOKEN')]) {
sh '''
set +x
git config user.email "jenkins@bstein.dev"
git config user.name "jenkins"
git remote set-url origin https://${GIT_USER}:${GIT_TOKEN}@scm.bstein.dev/bstein/titan-iac.git
git push origin HEAD:${FLUX_BRANCH}
'''
}
}
}
}
}

View File

@ -1,4 +0,0 @@
pytest==8.3.4
kubernetes==30.1.0
PyYAML==6.0.2
requests==2.32.3

View File

@ -1,16 +0,0 @@
max_success_age_hours: 48
allow_suspended:
- bstein-dev-home/vaultwarden-cred-sync
- comms/othrys-room-reset
- comms/pin-othrys-invite
- comms/seed-othrys-room
- finance/firefly-user-sync
- health/wger-admin-ensure
- health/wger-user-sync
- mailu-mailserver/mailu-sync-nightly
- nextcloud/nextcloud-mail-sync
ariadne_schedule_tasks:
- schedule.mailu_sync
- schedule.nextcloud_sync
- schedule.vaultwarden_sync
- schedule.wger_admin

View File

@ -1,46 +0,0 @@
from __future__ import annotations
from datetime import datetime, timezone
from pathlib import Path
import yaml
from kubernetes import client, config
CONFIG_PATH = Path(__file__).with_name("config.yaml")
def _load_config() -> dict:
with CONFIG_PATH.open("r", encoding="utf-8") as handle:
return yaml.safe_load(handle) or {}
def _load_kube():
try:
config.load_incluster_config()
except config.ConfigException:
config.load_kube_config()
def test_glue_cronjobs_recent_success():
cfg = _load_config()
max_age_hours = int(cfg.get("max_success_age_hours", 48))
allow_suspended = set(cfg.get("allow_suspended", []))
_load_kube()
batch = client.BatchV1Api()
cronjobs = batch.list_cron_job_for_all_namespaces(label_selector="atlas.bstein.dev/glue=true").items
assert cronjobs, "No glue cronjobs found with atlas.bstein.dev/glue=true"
now = datetime.now(timezone.utc)
for cronjob in cronjobs:
name = f"{cronjob.metadata.namespace}/{cronjob.metadata.name}"
if cronjob.spec.suspend:
assert name in allow_suspended, f"{name} is suspended but not in allow_suspended"
continue
last_success = cronjob.status.last_successful_time
assert last_success is not None, f"{name} has no lastSuccessfulTime"
age_hours = (now - last_success).total_seconds() / 3600
assert age_hours <= max_age_hours, f"{name} last success {age_hours:.1f}h ago"

View File

@ -1,48 +0,0 @@
from __future__ import annotations
import os
from pathlib import Path
import requests
import yaml
VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server:8428").rstrip("/")
CONFIG_PATH = Path(__file__).with_name("config.yaml")
def _load_config() -> dict:
with CONFIG_PATH.open("r", encoding="utf-8") as handle:
return yaml.safe_load(handle) or {}
def _query(promql: str) -> list[dict]:
response = requests.get(f"{VM_URL}/api/v1/query", params={"query": promql}, timeout=10)
response.raise_for_status()
payload = response.json()
return payload.get("data", {}).get("result", [])
def test_glue_metrics_present():
series = _query('kube_cronjob_labels{label_atlas_bstein_dev_glue="true"}')
assert series, "No glue cronjob label series found"
def test_glue_metrics_success_join():
query = (
"kube_cronjob_status_last_successful_time "
'and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue="true"}'
)
series = _query(query)
assert series, "No glue cronjob last success series found"
def test_ariadne_schedule_metrics_present():
cfg = _load_config()
expected = cfg.get("ariadne_schedule_tasks", [])
if not expected:
return
series = _query("ariadne_schedule_next_run_timestamp_seconds")
tasks = {item.get("metric", {}).get("task") for item in series}
missing = [task for task in expected if task not in tasks]
assert not missing, f"Missing Ariadne schedule metrics for: {', '.join(missing)}"

View File

@ -0,0 +1,13 @@
# clusters/atlas/applications/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../../services/crypto
- ../../services/gitea
- ../../services/jellyfin
- ../../services/comms
- ../../services/monitoring
- ../../services/logging
- ../../services/pegasus
- ../../services/vault
- ../../services/bstein-dev-home

View File

@ -1,17 +0,0 @@
# clusters/atlas/flux-system/applications/bstein-dev-home-migrations/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: bstein-dev-home-migrations
namespace: flux-system
spec:
interval: 10m
path: ./services/bstein-dev-home/oneoffs/migrations
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: bstein-dev-home
wait: false
suspend: true

View File

@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: bstein-dev-home
namespace: bstein-dev-home
namespace: flux-system
spec:
interval: 1m0s
sourceRef:
@ -13,14 +13,14 @@ spec:
git:
checkout:
ref:
branch: feature/ariadne
branch: main
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(bstein-dev-home): automated image update"
messageTemplate: "chore(bstein-dev-home): update images to {{range .Updated.Images}}{{.}}{{end}}"
push:
branch: feature/ariadne
branch: main
update:
strategy: Setters
path: services/bstein-dev-home

View File

@ -1,4 +1,4 @@
# clusters/atlas/flux-system/applications/comms/kustomization.yaml
# clusters/atlas/flux-system/applications/communication/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:

View File

@ -1,24 +0,0 @@
# clusters/atlas/flux-system/applications/finance/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: finance
namespace: flux-system
spec:
interval: 10m
path: ./services/finance
prune: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: finance
healthChecks:
- apiVersion: apps/v1
kind: Deployment
name: actual-budget
namespace: finance
- apiVersion: apps/v1
kind: Deployment
name: firefly
namespace: finance
wait: false

View File

@ -13,6 +13,11 @@ spec:
kind: GitRepository
name: flux-system
namespace: flux-system
healthChecks:
- apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
name: harbor
namespace: harbor
wait: false
dependsOn:
- name: core

View File

@ -1,25 +0,0 @@
# clusters/atlas/flux-system/applications/health/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: health
namespace: flux-system
spec:
interval: 10m
path: ./services/health
prune: true
sourceRef:
kind: GitRepository
name: flux-system
targetNamespace: health
dependsOn:
- name: keycloak
- name: postgres
- name: traefik
- name: vault
healthChecks:
- apiVersion: apps/v1
kind: Deployment
name: wger
namespace: health
wait: false

View File

@ -12,12 +12,10 @@ resources:
- pegasus/image-automation.yaml
- bstein-dev-home/kustomization.yaml
- bstein-dev-home/image-automation.yaml
- bstein-dev-home-migrations/kustomization.yaml
- harbor/kustomization.yaml
- harbor/image-automation.yaml
- jellyfin/kustomization.yaml
- xmr-miner/kustomization.yaml
- wallet-monero-temp/kustomization.yaml
- sui-metrics/kustomization.yaml
- openldap/kustomization.yaml
- keycloak/kustomization.yaml
@ -29,5 +27,3 @@ resources:
- nextcloud-mail-sync/kustomization.yaml
- outline/kustomization.yaml
- planka/kustomization.yaml
- finance/kustomization.yaml
- health/kustomization.yaml

View File

@ -3,7 +3,7 @@ apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: pegasus
namespace: jellyfin
namespace: flux-system
spec:
interval: 1m0s
sourceRef:

View File

@ -1,19 +0,0 @@
# clusters/atlas/flux-system/applications/wallet-monero-temp/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: wallet-monero-temp
namespace: flux-system
spec:
interval: 10m
path: ./services/crypto/wallet-monero-temp
targetNamespace: crypto
prune: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
dependsOn:
- name: crypto
- name: xmr-miner
wait: true

View File

@ -1,4 +1,3 @@
# clusters/atlas/flux-system/gotk-components.yaml
---
# This manifest was generated by flux. DO NOT EDIT.
# Flux Version: v2.7.5

View File

@ -1,4 +1,3 @@
# clusters/atlas/flux-system/gotk-sync.yaml
# This manifest was generated by flux. DO NOT EDIT.
---
apiVersion: source.toolkit.fluxcd.io/v1
@ -9,7 +8,7 @@ metadata:
spec:
interval: 1m0s
ref:
branch: feature/ariadne
branch: feature/sso-hardening
secretRef:
name: flux-system-gitea
url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git

View File

@ -1,17 +0,0 @@
# clusters/atlas/flux-system/platform/cert-manager-cleanup/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: cert-manager-cleanup
namespace: flux-system
spec:
interval: 30m
path: ./infrastructure/cert-manager/cleanup
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
targetNamespace: cert-manager
wait: true

View File

@ -1,19 +0,0 @@
# clusters/atlas/flux-system/platform/cert-manager/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: cert-manager
namespace: flux-system
spec:
interval: 30m
path: ./infrastructure/cert-manager
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
targetNamespace: cert-manager
dependsOn:
- name: helm
wait: true

View File

@ -4,17 +4,12 @@ kind: Kustomization
resources:
- core/kustomization.yaml
- helm/kustomization.yaml
- cert-manager/kustomization.yaml
- metallb/kustomization.yaml
- traefik/kustomization.yaml
- gitops-ui/kustomization.yaml
- monitoring/kustomization.yaml
- logging/kustomization.yaml
- maintenance/kustomization.yaml
- maintenance/image-automation.yaml
- longhorn-adopt/kustomization.yaml
- longhorn/kustomization.yaml
- longhorn-ui/kustomization.yaml
- postgres/kustomization.yaml
- ../platform/vault-csi/kustomization.yaml
- ../platform/vault-injector/kustomization.yaml

View File

@ -1,17 +0,0 @@
# clusters/atlas/flux-system/platform/longhorn-adopt/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: longhorn-adopt
namespace: flux-system
spec:
interval: 30m
path: ./infrastructure/longhorn/adopt
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
targetNamespace: longhorn-system
wait: true

View File

@ -15,5 +15,4 @@ spec:
namespace: flux-system
dependsOn:
- name: core
- name: longhorn
wait: true

View File

@ -1,20 +0,0 @@
# clusters/atlas/flux-system/platform/longhorn/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: longhorn
namespace: flux-system
spec:
interval: 30m
path: ./infrastructure/longhorn/core
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
targetNamespace: longhorn-system
dependsOn:
- name: helm
- name: longhorn-adopt
wait: false

View File

@ -1,26 +0,0 @@
# clusters/atlas/flux-system/platform/maintenance/image-automation.yaml
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: maintenance
namespace: maintenance
spec:
interval: 1m0s
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
git:
checkout:
ref:
branch: feature/ariadne
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(maintenance): automated image update"
push:
branch: feature/ariadne
update:
strategy: Setters
path: services/maintenance

View File

@ -8,7 +8,6 @@ spec:
interval: 10m
path: ./services/maintenance
prune: true
force: true
sourceRef:
kind: GitRepository
name: flux-system

View File

@ -1,16 +0,0 @@
# clusters/atlas/flux-system/platform/vault-injector/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: vault-injector
namespace: flux-system
spec:
interval: 30m
path: ./infrastructure/vault-injector
targetNamespace: vault
prune: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
wait: true

View File

@ -0,0 +1,8 @@
# clusters/atlas/platform/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ../../../infrastructure/modules/base
- ../../../infrastructure/modules/profiles/atlas-ha
- ../../../infrastructure/sources/cert-manager/letsencrypt.yaml
- ../../../infrastructure/metallb

View File

@ -1,5 +0,0 @@
FROM python:3.11-slim
ENV PIP_DISABLE_PIP_VERSION_CHECK=1
RUN pip install --no-cache-dir requests psycopg2-binary

View File

@ -1,9 +0,0 @@
FROM registry.bstein.dev/infra/harbor-core:v2.14.1-arm64
USER root
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
USER harbor
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/harbor/entrypoint.sh"]

View File

@ -1,9 +0,0 @@
FROM registry.bstein.dev/infra/harbor-jobservice:v2.14.1-arm64
USER root
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
USER harbor
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/harbor/entrypoint.sh"]

View File

@ -1,9 +0,0 @@
FROM registry.bstein.dev/infra/harbor-registry:v2.14.1-arm64
USER root
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
USER harbor
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/home/harbor/entrypoint.sh"]

View File

@ -1,9 +0,0 @@
FROM registry.bstein.dev/infra/harbor-registryctl:v2.14.1-arm64
USER root
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
USER harbor
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/home/harbor/start.sh"]

View File

@ -1,10 +0,0 @@
FROM ghcr.io/element-hq/lk-jwt-service:0.3.0 AS base
FROM alpine:3.20
RUN apk add --no-cache ca-certificates
COPY --from=base /lk-jwt-service /lk-jwt-service
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/lk-jwt-service"]

View File

@ -1,10 +0,0 @@
FROM quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 AS base
FROM alpine:3.20
RUN apk add --no-cache ca-certificates
COPY --from=base /bin/oauth2-proxy /bin/oauth2-proxy
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/bin/oauth2-proxy"]

View File

@ -1,10 +0,0 @@
FROM registry.bstein.dev/streaming/pegasus:1.2.32 AS base
FROM alpine:3.20
RUN apk add --no-cache ca-certificates
COPY --from=base /pegasus /pegasus
COPY dockerfiles/vault-entrypoint.sh /entrypoint.sh
RUN chmod 0755 /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
CMD ["/pegasus"]

View File

@ -1,34 +0,0 @@
#!/bin/sh
set -eu
if [ -n "${VAULT_ENV_FILE:-}" ]; then
if [ -f "${VAULT_ENV_FILE}" ]; then
# shellcheck disable=SC1090
. "${VAULT_ENV_FILE}"
else
echo "Vault env file not found: ${VAULT_ENV_FILE}" >&2
exit 1
fi
fi
if [ -n "${VAULT_COPY_FILES:-}" ]; then
old_ifs="$IFS"
IFS=','
for pair in ${VAULT_COPY_FILES}; do
src="${pair%%:*}"
dest="${pair#*:}"
if [ -z "${src}" ] || [ -z "${dest}" ]; then
echo "Vault copy entry malformed: ${pair}" >&2
exit 1
fi
if [ ! -f "${src}" ]; then
echo "Vault file not found: ${src}" >&2
exit 1
fi
mkdir -p "$(dirname "${dest}")"
cp "${src}" "${dest}"
done
IFS="$old_ifs"
fi
exec "$@"

View File

@ -1,40 +0,0 @@
# infrastructure/cert-manager/cleanup/cert-manager-cleanup-job.yaml
apiVersion: batch/v1
kind: Job
metadata:
name: cert-manager-cleanup-2
namespace: cert-manager
spec:
backoffLimit: 1
template:
spec:
serviceAccountName: cert-manager-cleanup
restartPolicy: Never
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node-role.kubernetes.io/worker
operator: Exists
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
containers:
- name: cleanup
image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
command: ["/usr/bin/env", "bash"]
args: ["/scripts/cert_manager_cleanup.sh"]
volumeMounts:
- name: script
mountPath: /scripts
readOnly: true
volumes:
- name: script
configMap:
name: cert-manager-cleanup-script
defaultMode: 0555

View File

@ -1,58 +0,0 @@
# infrastructure/cert-manager/cleanup/cert-manager-cleanup-rbac.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: cert-manager-cleanup
namespace: cert-manager
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: cert-manager-cleanup
rules:
- apiGroups: [""]
resources:
- pods
- services
- endpoints
- configmaps
- secrets
- serviceaccounts
verbs: ["get", "list", "watch", "delete"]
- apiGroups: ["apps"]
resources:
- deployments
- daemonsets
- statefulsets
- replicasets
verbs: ["get", "list", "watch", "delete"]
- apiGroups: ["batch"]
resources:
- jobs
- cronjobs
verbs: ["get", "list", "watch", "delete"]
- apiGroups: ["rbac.authorization.k8s.io"]
resources:
- roles
- rolebindings
- clusterroles
- clusterrolebindings
verbs: ["get", "list", "watch", "delete"]
- apiGroups: ["admissionregistration.k8s.io"]
resources:
- validatingwebhookconfigurations
- mutatingwebhookconfigurations
verbs: ["get", "list", "watch", "delete"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: cert-manager-cleanup
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: cert-manager-cleanup
subjects:
- kind: ServiceAccount
name: cert-manager-cleanup
namespace: cert-manager

View File

@ -1,15 +0,0 @@
# infrastructure/cert-manager/cleanup/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- cert-manager-cleanup-rbac.yaml
- cert-manager-cleanup-job.yaml
configMapGenerator:
- name: cert-manager-cleanup-script
namespace: cert-manager
files:
- cert_manager_cleanup.sh=scripts/cert_manager_cleanup.sh
options:
disableNameSuffixHash: true

View File

@ -1,5 +0,0 @@
# infrastructure/cert-manager/cleanup/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: cert-manager

View File

@ -1,37 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
namespace="cert-manager"
selectors=(
"app.kubernetes.io/name=cert-manager"
"app.kubernetes.io/instance=cert-manager"
"app.kubernetes.io/instance=certmanager-prod"
)
delete_namespaced() {
local selector="$1"
kubectl -n "${namespace}" delete deployment,daemonset,statefulset,replicaset \
--selector "${selector}" --ignore-not-found --wait=false
kubectl -n "${namespace}" delete pod,service,endpoints,serviceaccount,configmap,secret \
--selector "${selector}" --ignore-not-found --wait=false
kubectl -n "${namespace}" delete role,rolebinding \
--selector "${selector}" --ignore-not-found --wait=false
kubectl -n "${namespace}" delete job,cronjob \
--selector "${selector}" --ignore-not-found --wait=false
}
delete_cluster_scoped() {
local selector="$1"
kubectl delete clusterrole,clusterrolebinding \
--selector "${selector}" --ignore-not-found --wait=false
kubectl delete mutatingwebhookconfiguration,validatingwebhookconfiguration \
--selector "${selector}" --ignore-not-found --wait=false
}
for selector in "${selectors[@]}"; do
delete_namespaced "${selector}"
delete_cluster_scoped "${selector}"
done
kubectl delete mutatingwebhookconfiguration cert-manager-webhook --ignore-not-found --wait=false
kubectl delete validatingwebhookconfiguration cert-manager-webhook --ignore-not-found --wait=false

View File

@ -1,67 +0,0 @@
# infrastructure/cert-manager/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: cert-manager
namespace: cert-manager
spec:
interval: 30m
chart:
spec:
chart: cert-manager
version: v1.17.0
sourceRef:
kind: HelmRepository
name: jetstack
namespace: flux-system
install:
crds: CreateReplace
remediation: { retries: 3 }
timeout: 10m
upgrade:
crds: CreateReplace
remediation:
retries: 3
remediateLastFailure: true
cleanupOnFail: true
timeout: 10m
values:
installCRDs: true
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- rpi4
webhook:
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- rpi4
cainjector:
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- rpi4

View File

@ -1,6 +0,0 @@
# infrastructure/cert-manager/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- helmrelease.yaml

View File

@ -1,5 +0,0 @@
# infrastructure/cert-manager/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: cert-manager

View File

@ -1,47 +0,0 @@
# infrastructure/core/coredns-custom.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: coredns-custom
namespace: kube-system
data:
bstein-dev.server: |
bstein.dev:53 {
errors
cache 30
hosts {
192.168.22.9 alerts.bstein.dev
192.168.22.9 auth.bstein.dev
192.168.22.9 bstein.dev
10.43.6.87 budget.bstein.dev
192.168.22.9 call.live.bstein.dev
192.168.22.9 cd.bstein.dev
192.168.22.9 chat.ai.bstein.dev
192.168.22.9 ci.bstein.dev
192.168.22.9 cloud.bstein.dev
192.168.22.9 health.bstein.dev
192.168.22.9 kit.live.bstein.dev
192.168.22.9 live.bstein.dev
192.168.22.9 logs.bstein.dev
192.168.22.9 longhorn.bstein.dev
192.168.22.4 mail.bstein.dev
192.168.22.9 matrix.live.bstein.dev
192.168.22.9 metrics.bstein.dev
192.168.22.9 monero.bstein.dev
10.43.6.87 money.bstein.dev
192.168.22.9 notes.bstein.dev
192.168.22.9 office.bstein.dev
192.168.22.9 pegasus.bstein.dev
3.136.224.193 pm-bounces.bstein.dev
3.150.68.49 pm-bounces.bstein.dev
18.189.137.81 pm-bounces.bstein.dev
192.168.22.9 registry.bstein.dev
192.168.22.9 scm.bstein.dev
192.168.22.9 secret.bstein.dev
192.168.22.9 sso.bstein.dev
192.168.22.9 stream.bstein.dev
192.168.22.9 tasks.bstein.dev
192.168.22.9 vault.bstein.dev
fallthrough
}
}

View File

@ -1,141 +0,0 @@
# infrastructure/core/coredns-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: coredns
namespace: kube-system
labels:
k8s-app: kube-dns
kubernetes.io/name: CoreDNS
spec:
progressDeadlineSeconds: 600
replicas: 2
revisionHistoryLimit: 0
selector:
matchLabels:
k8s-app: kube-dns
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 25%
maxUnavailable: 1
template:
metadata:
labels:
k8s-app: kube-dns
spec:
containers:
- name: coredns
image: registry.bstein.dev/infra/coredns:1.12.1
imagePullPolicy: IfNotPresent
args:
- -conf
- /etc/coredns/Corefile
ports:
- containerPort: 53
name: dns
protocol: UDP
- containerPort: 53
name: dns-tcp
protocol: TCP
- containerPort: 9153
name: metrics
protocol: TCP
livenessProbe:
httpGet:
path: /health
port: 8080
scheme: HTTP
initialDelaySeconds: 60
periodSeconds: 10
timeoutSeconds: 1
successThreshold: 1
failureThreshold: 3
readinessProbe:
httpGet:
path: /ready
port: 8181
scheme: HTTP
periodSeconds: 2
timeoutSeconds: 1
successThreshold: 1
failureThreshold: 3
resources:
limits:
memory: 170Mi
requests:
cpu: 100m
memory: 70Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
add:
- NET_BIND_SERVICE
drop:
- all
readOnlyRootFilesystem: true
volumeMounts:
- name: config-volume
mountPath: /etc/coredns
readOnly: true
- name: custom-config-volume
mountPath: /etc/coredns/custom
readOnly: true
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi5
- rpi4
- key: node-role.kubernetes.io/worker
operator: In
values:
- "true"
dnsPolicy: Default
nodeSelector:
kubernetes.io/os: linux
priorityClassName: system-cluster-critical
restartPolicy: Always
schedulerName: default-scheduler
serviceAccountName: coredns
tolerations:
- key: CriticalAddonsOnly
operator: Exists
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
- key: node-role.kubernetes.io/master
operator: Exists
effect: NoSchedule
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: DoNotSchedule
labelSelector:
matchLabels:
k8s-app: kube-dns
- maxSkew: 1
topologyKey: topology.kubernetes.io/zone
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
k8s-app: kube-dns
volumes:
- name: config-volume
configMap:
name: coredns
defaultMode: 420
items:
- key: Corefile
path: Corefile
- key: NodeHosts
path: NodeHosts
- name: custom-config-volume
configMap:
name: coredns-custom
optional: true
defaultMode: 420

View File

@ -4,8 +4,5 @@ kind: Kustomization
resources:
- ../modules/base
- ../modules/profiles/atlas-ha
- coredns-custom.yaml
- coredns-deployment.yaml
- ntp-sync-daemonset.yaml
- ../sources/cert-manager/letsencrypt.yaml
- ../sources/cert-manager/letsencrypt-prod.yaml

View File

@ -1,50 +0,0 @@
# infrastructure/core/ntp-sync-daemonset.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: ntp-sync
namespace: kube-system
labels:
app: ntp-sync
spec:
selector:
matchLabels:
app: ntp-sync
template:
metadata:
labels:
app: ntp-sync
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node-role.kubernetes.io/control-plane
operator: DoesNotExist
- key: node-role.kubernetes.io/master
operator: DoesNotExist
containers:
- name: ntp-sync
image: public.ecr.aws/docker/library/busybox:1.36.1
imagePullPolicy: IfNotPresent
command: ["/bin/sh", "-c"]
args:
- |
set -eu
while true; do
ntpd -q -p pool.ntp.org || true
sleep 300
done
securityContext:
capabilities:
add: ["SYS_TIME"]
runAsUser: 0
runAsGroup: 0
resources:
requests:
cpu: 10m
memory: 16Mi
limits:
cpu: 50m
memory: 64Mi

View File

@ -1,15 +0,0 @@
# infrastructure/longhorn/adopt/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- longhorn-adopt-rbac.yaml
- longhorn-helm-adopt-job.yaml
configMapGenerator:
- name: longhorn-helm-adopt-script
namespace: longhorn-system
files:
- longhorn_helm_adopt.sh=scripts/longhorn_helm_adopt.sh
options:
disableNameSuffixHash: true

View File

@ -1,56 +0,0 @@
# infrastructure/longhorn/adopt/longhorn-adopt-rbac.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: longhorn-helm-adopt
namespace: longhorn-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: longhorn-helm-adopt
rules:
- apiGroups: [""]
resources:
- configmaps
- services
- serviceaccounts
- secrets
verbs: ["get", "list", "watch", "patch", "update"]
- apiGroups: ["apps"]
resources:
- deployments
- daemonsets
verbs: ["get", "list", "watch", "patch", "update"]
- apiGroups: ["batch"]
resources:
- jobs
verbs: ["get", "list", "watch", "patch", "update"]
- apiGroups: ["rbac.authorization.k8s.io"]
resources:
- roles
- rolebindings
- clusterroles
- clusterrolebindings
verbs: ["get", "list", "watch", "patch", "update"]
- apiGroups: ["apiextensions.k8s.io"]
resources:
- customresourcedefinitions
verbs: ["get", "list", "watch", "patch", "update"]
- apiGroups: ["scheduling.k8s.io"]
resources:
- priorityclasses
verbs: ["get", "list", "watch", "patch", "update"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: longhorn-helm-adopt
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: longhorn-helm-adopt
subjects:
- kind: ServiceAccount
name: longhorn-helm-adopt
namespace: longhorn-system

View File

@ -1,40 +0,0 @@
# infrastructure/longhorn/adopt/longhorn-helm-adopt-job.yaml
apiVersion: batch/v1
kind: Job
metadata:
name: longhorn-helm-adopt-2
namespace: longhorn-system
spec:
backoffLimit: 1
template:
spec:
serviceAccountName: longhorn-helm-adopt
restartPolicy: Never
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node-role.kubernetes.io/worker
operator: Exists
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
containers:
- name: adopt
image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
command: ["/usr/bin/env", "bash"]
args: ["/scripts/longhorn_helm_adopt.sh"]
volumeMounts:
- name: script
mountPath: /scripts
readOnly: true
volumes:
- name: script
configMap:
name: longhorn-helm-adopt-script
defaultMode: 0555

View File

@ -1,5 +0,0 @@
# infrastructure/longhorn/adopt/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: longhorn-system

View File

@ -1,52 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
release_name="longhorn"
release_namespace="longhorn-system"
selector="app.kubernetes.io/instance=${release_name}"
annotate_and_label() {
local scope="$1"
local kind="$2"
if [ "${scope}" = "namespaced" ]; then
kubectl -n "${release_namespace}" annotate "${kind}" -l "${selector}" \
meta.helm.sh/release-name="${release_name}" \
meta.helm.sh/release-namespace="${release_namespace}" \
--overwrite >/dev/null 2>&1 || true
kubectl -n "${release_namespace}" label "${kind}" -l "${selector}" \
app.kubernetes.io/managed-by=Helm --overwrite >/dev/null 2>&1 || true
else
kubectl annotate "${kind}" -l "${selector}" \
meta.helm.sh/release-name="${release_name}" \
meta.helm.sh/release-namespace="${release_namespace}" \
--overwrite >/dev/null 2>&1 || true
kubectl label "${kind}" -l "${selector}" \
app.kubernetes.io/managed-by=Helm --overwrite >/dev/null 2>&1 || true
fi
}
namespaced_kinds=(
configmap
service
serviceaccount
deployment
daemonset
job
role
rolebinding
)
cluster_kinds=(
clusterrole
clusterrolebinding
customresourcedefinition
priorityclass
)
for kind in "${namespaced_kinds[@]}"; do
annotate_and_label "namespaced" "${kind}"
done
for kind in "${cluster_kinds[@]}"; do
annotate_and_label "cluster" "${kind}"
done

View File

@ -1,80 +0,0 @@
# infrastructure/longhorn/core/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: longhorn
namespace: longhorn-system
spec:
interval: 30m
chart:
spec:
chart: longhorn
version: 1.8.2
sourceRef:
kind: HelmRepository
name: longhorn
namespace: flux-system
install:
crds: Skip
remediation: { retries: 3 }
timeout: 15m
upgrade:
crds: Skip
remediation:
retries: 3
remediateLastFailure: true
cleanupOnFail: true
timeout: 15m
values:
service:
ui:
type: NodePort
nodePort: 30824
privateRegistry:
createSecret: false
registrySecret: longhorn-registry
image:
pullPolicy: Always
longhorn:
engine:
repository: registry.bstein.dev/infra/longhorn-engine
tag: v1.8.2
manager:
repository: registry.bstein.dev/infra/longhorn-manager
tag: v1.8.2
ui:
repository: registry.bstein.dev/infra/longhorn-ui
tag: v1.8.2
instanceManager:
repository: registry.bstein.dev/infra/longhorn-instance-manager
tag: v1.8.2
shareManager:
repository: registry.bstein.dev/infra/longhorn-share-manager
tag: v1.8.2
backingImageManager:
repository: registry.bstein.dev/infra/longhorn-backing-image-manager
tag: v1.8.2
supportBundleKit:
repository: registry.bstein.dev/infra/longhorn-support-bundle-kit
tag: v0.0.56
csi:
attacher:
repository: registry.bstein.dev/infra/longhorn-csi-attacher
tag: v4.9.0
provisioner:
repository: registry.bstein.dev/infra/longhorn-csi-provisioner
tag: v5.3.0
nodeDriverRegistrar:
repository: registry.bstein.dev/infra/longhorn-csi-node-driver-registrar
tag: v2.14.0
resizer:
repository: registry.bstein.dev/infra/longhorn-csi-resizer
tag: v1.13.2
snapshotter:
repository: registry.bstein.dev/infra/longhorn-csi-snapshotter
tag: v8.2.0
livenessProbe:
repository: registry.bstein.dev/infra/longhorn-livenessprobe
tag: v2.16.0
defaultSettings:
systemManagedPodsImagePullPolicy: Always

View File

@ -1,18 +0,0 @@
# infrastructure/longhorn/core/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- vault-serviceaccount.yaml
- secretproviderclass.yaml
- vault-sync-deployment.yaml
- helmrelease.yaml
- longhorn-settings-ensure-job.yaml
configMapGenerator:
- name: longhorn-settings-ensure-script
files:
- longhorn_settings_ensure.sh=scripts/longhorn_settings_ensure.sh
generatorOptions:
disableNameSuffixHash: true

View File

@ -1,36 +0,0 @@
# infrastructure/longhorn/core/longhorn-settings-ensure-job.yaml
apiVersion: batch/v1
kind: Job
metadata:
name: longhorn-settings-ensure-4
namespace: longhorn-system
spec:
backoffLimit: 0
ttlSecondsAfterFinished: 3600
template:
spec:
serviceAccountName: longhorn-service-account
restartPolicy: Never
volumes:
- name: longhorn-settings-ensure-script
configMap:
name: longhorn-settings-ensure-script
defaultMode: 0555
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
- key: node-role.kubernetes.io/worker
operator: Exists
containers:
- name: apply
image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
command: ["/scripts/longhorn_settings_ensure.sh"]
volumeMounts:
- name: longhorn-settings-ensure-script
mountPath: /scripts
readOnly: true

View File

@ -1,5 +0,0 @@
# infrastructure/longhorn/core/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: longhorn-system

View File

@ -1,42 +0,0 @@
#!/usr/bin/env sh
set -eu
# Longhorn blocks direct CR patches for some settings; use the internal API instead.
api_base="http://longhorn-backend.longhorn-system.svc:9500/v1/settings"
wait_for_api() {
attempts=30
while [ "${attempts}" -gt 0 ]; do
if curl -fsS "${api_base}" >/dev/null 2>&1; then
return 0
fi
attempts=$((attempts - 1))
sleep 2
done
echo "Longhorn API not ready after retries." >&2
return 1
}
update_setting() {
name="$1"
value="$2"
current="$(curl -fsS "${api_base}/${name}" || true)"
if echo "${current}" | grep -Fq "\"value\":\"${value}\""; then
echo "Setting ${name} already set."
return 0
fi
echo "Setting ${name} -> ${value}"
curl -fsS -X PUT \
-H "Content-Type: application/json" \
-d "{\"value\":\"${value}\"}" \
"${api_base}/${name}" >/dev/null
}
wait_for_api
update_setting default-engine-image "registry.bstein.dev/infra/longhorn-engine:v1.8.2"
update_setting default-instance-manager-image "registry.bstein.dev/infra/longhorn-instance-manager:v1.8.2"
update_setting default-backing-image-manager-image "registry.bstein.dev/infra/longhorn-backing-image-manager:v1.8.2"
update_setting support-bundle-manager-image "registry.bstein.dev/infra/longhorn-support-bundle-kit:v0.0.56"

View File

@ -1,21 +0,0 @@
# infrastructure/longhorn/core/secretproviderclass.yaml
apiVersion: secrets-store.csi.x-k8s.io/v1
kind: SecretProviderClass
metadata:
name: longhorn-vault
namespace: longhorn-system
spec:
provider: vault
parameters:
vaultAddress: "http://vault.vault.svc.cluster.local:8200"
roleName: "longhorn"
objects: |
- objectName: "harbor-pull__dockerconfigjson"
secretPath: "kv/data/atlas/shared/harbor-pull"
secretKey: "dockerconfigjson"
secretObjects:
- secretName: longhorn-registry
type: kubernetes.io/dockerconfigjson
data:
- objectName: harbor-pull__dockerconfigjson
key: .dockerconfigjson

View File

@ -1,6 +0,0 @@
# infrastructure/longhorn/core/vault-serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: longhorn-vault-sync
namespace: longhorn-system

View File

@ -1,45 +0,0 @@
# infrastructure/longhorn/core/vault-sync-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: longhorn-vault-sync
namespace: longhorn-system
spec:
replicas: 1
selector:
matchLabels:
app: longhorn-vault-sync
template:
metadata:
labels:
app: longhorn-vault-sync
spec:
serviceAccountName: longhorn-vault-sync
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 80
preference:
matchExpressions:
- key: hardware
operator: In
values: ["rpi5", "rpi4"]
containers:
- name: sync
image: alpine:3.20
command: ["/bin/sh", "-c"]
args:
- "sleep infinity"
volumeMounts:
- name: vault-secrets
mountPath: /vault/secrets
readOnly: true
volumes:
- name: vault-secrets
csi:
driver: secrets-store.csi.k8s.io
readOnly: true
volumeAttributes:
secretProviderClass: longhorn-vault

View File

@ -2,7 +2,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- serviceaccount.yaml
- oauth2-proxy-longhorn.yaml
- middleware.yaml
- ingress.yaml
- oauth2-proxy-longhorn.yaml

View File

@ -32,18 +32,7 @@ spec:
metadata:
labels:
app: oauth2-proxy-longhorn
annotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "longhorn"
vault.hashicorp.com/agent-inject-secret-oidc-config: "kv/data/atlas/longhorn/oauth2-proxy"
vault.hashicorp.com/agent-inject-template-oidc-config: |
{{- with secret "kv/data/atlas/longhorn/oauth2-proxy" -}}
client_id = "{{ .Data.data.client_id }}"
client_secret = "{{ .Data.data.client_secret }}"
cookie_secret = "{{ .Data.data.cookie_secret }}"
{{- end -}}
spec:
serviceAccountName: longhorn-vault
nodeSelector:
node-role.kubernetes.io/worker: "true"
affinity:
@ -61,7 +50,6 @@ spec:
imagePullPolicy: IfNotPresent
args:
- --provider=oidc
- --config=/vault/secrets/oidc-config
- --redirect-url=https://longhorn.bstein.dev/oauth2/callback
- --oidc-issuer-url=https://sso.bstein.dev/realms/atlas
- --scope=openid profile email groups
@ -81,6 +69,22 @@ spec:
- --skip-jwt-bearer-tokens=true
- --oidc-groups-claim=groups
- --cookie-domain=longhorn.bstein.dev
env:
- name: OAUTH2_PROXY_CLIENT_ID
valueFrom:
secretKeyRef:
name: oauth2-proxy-longhorn-oidc
key: client_id
- name: OAUTH2_PROXY_CLIENT_SECRET
valueFrom:
secretKeyRef:
name: oauth2-proxy-longhorn-oidc
key: client_secret
- name: OAUTH2_PROXY_COOKIE_SECRET
valueFrom:
secretKeyRef:
name: oauth2-proxy-longhorn-oidc
key: cookie_secret
ports:
- containerPort: 4180
name: http

View File

@ -1,6 +0,0 @@
# infrastructure/longhorn/ui-ingress/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: longhorn-vault
namespace: longhorn-system

View File

@ -1,47 +0,0 @@
# infrastructure/metallb/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: metallb
namespace: metallb-system
spec:
interval: 30m
chart:
spec:
chart: metallb
version: 0.15.3
sourceRef:
kind: HelmRepository
name: metallb
namespace: flux-system
install:
crds: CreateReplace
remediation: { retries: 3 }
timeout: 10m
upgrade:
crds: CreateReplace
remediation:
retries: 3
remediateLastFailure: true
cleanupOnFail: true
timeout: 10m
values:
loadBalancerClass: metallb
prometheus:
metricsPort: 7472
controller:
logLevel: info
webhookMode: enabled
tlsMinVersion: VersionTLS12
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi4
- rpi5
speaker:
logLevel: info

View File

@ -3,5 +3,8 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- helmrelease.yaml
- metallb-rendered.yaml
- ippool.yaml
patchesStrategicMerge:
- patches/node-placement.yaml
- patches/speaker-loglevel.yaml

File diff suppressed because it is too large

View File

@ -0,0 +1,27 @@
# infrastructure/metallb/patches/node-placement.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: metallb-controller
namespace: metallb-system
spec:
template:
spec:
containers:
- name: controller
args:
- --port=7472
- --log-level=info
- --webhook-mode=enabled
- --tls-min-version=VersionTLS12
- --lb-class=metallb
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi4
- rpi5

View File

@ -0,0 +1,15 @@
# infrastructure/metallb/patches/speaker-loglevel.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: metallb-speaker
namespace: metallb-system
spec:
template:
spec:
containers:
- name: speaker
args:
- --port=7472
- --log-level=info
- --lb-class=metallb

View File

@ -1,24 +0,0 @@
# infrastructure/modules/base/storageclass/asteria-encrypted.yaml
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: asteria-encrypted
parameters:
diskSelector: asteria
fromBackup: ""
numberOfReplicas: "2"
staleReplicaTimeout: "30"
fsType: "ext4"
replicaAutoBalance: "least-effort"
dataLocality: "disabled"
encrypted: "true"
csi.storage.k8s.io/provisioner-secret-name: ${pvc.name}
csi.storage.k8s.io/provisioner-secret-namespace: ${pvc.namespace}
csi.storage.k8s.io/node-publish-secret-name: ${pvc.name}
csi.storage.k8s.io/node-publish-secret-namespace: ${pvc.namespace}
csi.storage.k8s.io/node-stage-secret-name: ${pvc.name}
csi.storage.k8s.io/node-stage-secret-namespace: ${pvc.namespace}
provisioner: driver.longhorn.io
reclaimPolicy: Retain
allowVolumeExpansion: true
volumeBindingMode: Immediate

View File

@ -3,5 +3,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- asteria.yaml
- asteria-encrypted.yaml
- astreae.yaml

View File

@ -11,5 +11,5 @@ spec:
roleName: "postgres"
objects: |
- objectName: "postgres_password"
secretPath: "kv/data/atlas/postgres/postgres-db"
secretPath: "kv/data/postgres"
secretKey: "POSTGRES_PASSWORD"

View File

@ -4,10 +4,6 @@ kind: Service
metadata:
name: postgres-service
namespace: postgres
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9187"
prometheus.io/path: "/metrics"
spec:
clusterIP: None
ports:
@ -15,9 +11,5 @@ spec:
port: 5432
protocol: TCP
targetPort: 5432
- name: metrics
port: 9187
protocol: TCP
targetPort: 9187
selector:
app: postgres

View File

@ -58,23 +58,6 @@ spec:
- name: vault-secrets
mountPath: /mnt/vault
readOnly: true
- name: postgres-exporter
image: quay.io/prometheuscommunity/postgres-exporter:v0.15.0
ports:
- name: metrics
containerPort: 9187
protocol: TCP
env:
- name: DATA_SOURCE_URI
value: "localhost:5432/postgres?sslmode=disable"
- name: DATA_SOURCE_USER
value: postgres
- name: DATA_SOURCE_PASS_FILE
value: /mnt/vault/postgres_password
volumeMounts:
- name: vault-secrets
mountPath: /mnt/vault
readOnly: true
volumes:
- name: vault-secrets
csi:

View File

@ -1,11 +1,10 @@
# infrastructure/sources/cert-manager/letsencrypt-prod.yaml
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
name: letsencrypt-prod
spec:
acme:
email: brad@bstein.dev
email: brad.stein@gmail.com
server: https://acme-v02.api.letsencrypt.org/directory
privateKeySecretRef:
name: letsencrypt-prod-account-key

View File

@ -1,11 +1,10 @@
# infrastructure/sources/cert-manager/letsencrypt.yaml
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
name: letsencrypt
spec:
acme:
email: brad@bstein.dev
email: brad.stein@gmail.com
server: https://acme-v02.api.letsencrypt.org/directory
privateKeySecretRef:
name: letsencrypt-account-key

View File

@ -1,9 +0,0 @@
# infrastructure/sources/helm/ananace.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: ananace
namespace: flux-system
spec:
interval: 1h
url: https://ananace.gitlab.io/charts

View File

@ -2,18 +2,15 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ananace.yaml
- fluent-bit.yaml
- grafana.yaml
- hashicorp.yaml
- jetstack.yaml
- jenkins.yaml
- mailu.yaml
- metallb.yaml
- opentelemetry.yaml
- opensearch.yaml
- harbor.yaml
- longhorn.yaml
- prometheus.yaml
- victoria-metrics.yaml
- secrets-store-csi.yaml

View File

@ -1,9 +0,0 @@
# infrastructure/sources/helm/longhorn.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: longhorn
namespace: flux-system
spec:
interval: 30m
url: https://charts.longhorn.io

View File

@ -1,9 +0,0 @@
# infrastructure/sources/helm/metallb.yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: metallb
namespace: flux-system
spec:
interval: 1h
url: https://metallb.github.io/metallb

File diff suppressed because it is too large

View File

@ -27,8 +27,6 @@ items:
creationTimestamp: null
labels:
app: traefik
app.kubernetes.io/instance: traefik-kube-system
app.kubernetes.io/name: traefik
spec:
containers:
- args:

View File

@ -5,7 +5,6 @@ metadata:
name: traefik
namespace: flux-system
resources:
- crds.yaml
- deployment.yaml
- serviceaccount.yaml
- clusterrole.yaml

View File

@ -3,10 +3,9 @@ apiVersion: v1
kind: Service
metadata:
name: traefik
namespace: traefik
namespace: kube-system
annotations:
metallb.universe.tf/address-pool: communication-pool
metallb.universe.tf/allow-shared-ip: traefik
spec:
type: LoadBalancer
loadBalancerClass: metallb
@ -21,4 +20,5 @@ spec:
targetPort: websecure
protocol: TCP
selector:
app: traefik
app.kubernetes.io/instance: traefik-kube-system
app.kubernetes.io/name: traefik

View File

@ -17,5 +17,4 @@ spec:
values:
syncSecret:
enabled: true
enableSecretRotation: true
rotationPollInterval: 2m
enableSecretRotation: false

View File

@ -1,43 +0,0 @@
# infrastructure/vault-injector/helmrelease.yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: vault-injector
namespace: vault
spec:
interval: 30m
chart:
spec:
chart: vault
version: 0.31.0
sourceRef:
kind: HelmRepository
name: hashicorp
namespace: flux-system
install:
remediation: { retries: 3 }
timeout: 10m
upgrade:
remediation:
retries: 3
remediateLastFailure: true
cleanupOnFail: true
timeout: 10m
values:
global:
externalVaultAddr: http://vault.vault.svc.cluster.local:8200
tlsDisable: true
server:
enabled: false
csi:
enabled: false
injector:
enabled: true
replicas: 1
agentImage:
repository: hashicorp/vault
tag: "1.17.6"
webhook:
failurePolicy: Ignore
nodeSelector:
node-role.kubernetes.io/worker: "true"

View File

@ -1,5 +0,0 @@
# infrastructure/vault-injector/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helmrelease.yaml

View File

@ -1,8 +1,8 @@
{
"counts": {
"helmrelease_host_hints": 19,
"http_endpoints": 45,
"services": 47,
"workloads": 74
"helmrelease_host_hints": 7,
"http_endpoints": 35,
"services": 44,
"workloads": 49
}
}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

View File

@ -17,11 +17,6 @@ flowchart LR
host_bstein_dev --> svc_bstein_dev_home_bstein_dev_home_backend
wl_bstein_dev_home_bstein_dev_home_backend["bstein-dev-home/bstein-dev-home-backend (Deployment)"]
svc_bstein_dev_home_bstein_dev_home_backend --> wl_bstein_dev_home_bstein_dev_home_backend
host_budget_bstein_dev["budget.bstein.dev"]
svc_finance_actual_budget["finance/actual-budget (Service)"]
host_budget_bstein_dev --> svc_finance_actual_budget
wl_finance_actual_budget["finance/actual-budget (Deployment)"]
svc_finance_actual_budget --> wl_finance_actual_budget
host_call_live_bstein_dev["call.live.bstein.dev"]
svc_comms_element_call["comms/element-call (Service)"]
host_call_live_bstein_dev --> svc_comms_element_call
@ -42,11 +37,6 @@ flowchart LR
host_cloud_bstein_dev --> svc_nextcloud_nextcloud
wl_nextcloud_nextcloud["nextcloud/nextcloud (Deployment)"]
svc_nextcloud_nextcloud --> wl_nextcloud_nextcloud
host_health_bstein_dev["health.bstein.dev"]
svc_health_wger["health/wger (Service)"]
host_health_bstein_dev --> svc_health_wger
wl_health_wger["health/wger (Deployment)"]
svc_health_wger --> wl_health_wger
host_kit_live_bstein_dev["kit.live.bstein.dev"]
svc_comms_livekit_token_service["comms/livekit-token-service (Service)"]
host_kit_live_bstein_dev --> svc_comms_livekit_token_service
@ -57,22 +47,15 @@ flowchart LR
wl_comms_livekit["comms/livekit (Deployment)"]
svc_comms_livekit --> wl_comms_livekit
host_live_bstein_dev["live.bstein.dev"]
svc_comms_othrys_element_element_web["comms/othrys-element-element-web (Service)"]
host_live_bstein_dev --> svc_comms_othrys_element_element_web
wl_comms_othrys_element_element_web["comms/othrys-element-element-web (Deployment)"]
svc_comms_othrys_element_element_web --> wl_comms_othrys_element_element_web
host_live_bstein_dev --> svc_comms_matrix_wellknown
svc_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Service)"]
host_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
host_live_bstein_dev --> svc_comms_matrix_guest_register
wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
host_live_bstein_dev --> svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
host_logs_bstein_dev["logs.bstein.dev"]
svc_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Service)"]
host_logs_bstein_dev --> svc_logging_oauth2_proxy_logs
wl_logging_oauth2_proxy_logs["logging/oauth2-proxy-logs (Deployment)"]
svc_logging_oauth2_proxy_logs --> wl_logging_oauth2_proxy_logs
wl_comms_othrys_synapse_matrix_synapse["comms/othrys-synapse-matrix-synapse (Deployment)"]
svc_comms_othrys_synapse_matrix_synapse --> wl_comms_othrys_synapse_matrix_synapse
host_longhorn_bstein_dev["longhorn.bstein.dev"]
svc_longhorn_system_oauth2_proxy_longhorn["longhorn-system/oauth2-proxy-longhorn (Service)"]
host_longhorn_bstein_dev --> svc_longhorn_system_oauth2_proxy_longhorn
@ -82,25 +65,21 @@ flowchart LR
svc_mailu_mailserver_mailu_front["mailu-mailserver/mailu-front (Service)"]
host_mail_bstein_dev --> svc_mailu_mailserver_mailu_front
host_matrix_live_bstein_dev["matrix.live.bstein.dev"]
svc_comms_matrix_authentication_service["comms/matrix-authentication-service (Service)"]
host_matrix_live_bstein_dev --> svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service["comms/matrix-authentication-service (Deployment)"]
svc_comms_matrix_authentication_service --> wl_comms_matrix_authentication_service
host_matrix_live_bstein_dev --> svc_comms_matrix_wellknown
host_matrix_live_bstein_dev --> svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register["comms/matrix-guest-register (Service)"]
host_matrix_live_bstein_dev --> svc_comms_matrix_guest_register
wl_comms_matrix_guest_register["comms/matrix-guest-register (Deployment)"]
svc_comms_matrix_guest_register --> wl_comms_matrix_guest_register
host_monero_bstein_dev["monero.bstein.dev"]
svc_crypto_monerod["crypto/monerod (Service)"]
host_monero_bstein_dev --> svc_crypto_monerod
wl_crypto_monerod["crypto/monerod (Deployment)"]
svc_crypto_monerod --> wl_crypto_monerod
host_money_bstein_dev["money.bstein.dev"]
svc_finance_firefly["finance/firefly (Service)"]
host_money_bstein_dev --> svc_finance_firefly
wl_finance_firefly["finance/firefly (Deployment)"]
svc_finance_firefly --> wl_finance_firefly
host_notes_bstein_dev["notes.bstein.dev"]
svc_outline_outline["outline/outline (Service)"]
host_notes_bstein_dev --> svc_outline_outline
wl_outline_outline["outline/outline (Deployment)"]
svc_outline_outline --> wl_outline_outline
host_office_bstein_dev["office.bstein.dev"]
svc_nextcloud_collabora["nextcloud/collabora (Service)"]
host_office_bstein_dev --> svc_nextcloud_collabora
@ -131,11 +110,6 @@ flowchart LR
host_stream_bstein_dev --> svc_jellyfin_jellyfin
wl_jellyfin_jellyfin["jellyfin/jellyfin (Deployment)"]
svc_jellyfin_jellyfin --> wl_jellyfin_jellyfin
host_tasks_bstein_dev["tasks.bstein.dev"]
svc_planka_planka["planka/planka (Service)"]
host_tasks_bstein_dev --> svc_planka_planka
wl_planka_planka["planka/planka (Deployment)"]
svc_planka_planka --> wl_planka_planka
host_vault_bstein_dev["vault.bstein.dev"]
svc_vaultwarden_vaultwarden_service["vaultwarden/vaultwarden-service (Service)"]
host_vault_bstein_dev --> svc_vaultwarden_vaultwarden_service
@ -159,30 +133,23 @@ flowchart LR
wl_comms_livekit_token_service
svc_comms_livekit
wl_comms_livekit
svc_comms_othrys_element_element_web
wl_comms_othrys_element_element_web
svc_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_guest_register
wl_comms_matrix_guest_register
wl_comms_othrys_synapse_matrix_synapse
svc_comms_matrix_authentication_service
wl_comms_matrix_authentication_service
svc_comms_matrix_guest_register
wl_comms_matrix_guest_register
end
subgraph crypto[crypto]
svc_crypto_monerod
wl_crypto_monerod
end
subgraph finance[finance]
svc_finance_actual_budget
wl_finance_actual_budget
svc_finance_firefly
wl_finance_firefly
end
subgraph gitea[gitea]
svc_gitea_gitea
wl_gitea_gitea
end
subgraph health[health]
svc_health_wger
wl_health_wger
end
subgraph jellyfin[jellyfin]
svc_jellyfin_pegasus
wl_jellyfin_pegasus
@ -193,10 +160,6 @@ flowchart LR
svc_jenkins_jenkins
wl_jenkins_jenkins
end
subgraph logging[logging]
svc_logging_oauth2_proxy_logs
wl_logging_oauth2_proxy_logs
end
subgraph longhorn_system[longhorn-system]
svc_longhorn_system_oauth2_proxy_longhorn
wl_longhorn_system_oauth2_proxy_longhorn
@ -210,14 +173,6 @@ flowchart LR
svc_nextcloud_collabora
wl_nextcloud_collabora
end
subgraph outline[outline]
svc_outline_outline
wl_outline_outline
end
subgraph planka[planka]
svc_planka_planka
wl_planka_planka
end
subgraph sso[sso]
svc_sso_oauth2_proxy
wl_sso_oauth2_proxy

View File

@ -70,7 +70,6 @@ WORKER_NODES = [
"titan-13",
"titan-14",
"titan-15",
"titan-16",
"titan-17",
"titan-18",
"titan-19",
@ -86,17 +85,19 @@ WORKER_TOTAL = len(WORKER_NODES)
CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
WORKER_SUFFIX = f"/{WORKER_TOTAL}"
# Namespaces considered infrastructure (excluded from workload counts)
INFRA_PATTERNS = [
"kube-.*",
".*-system",
"traefik",
INFRA_NAMESPACES = [
"kube-system",
"longhorn-system",
"metallb-system",
"monitoring",
"logging",
"cert-manager",
"flux-system",
"traefik",
"maintenance",
"postgres",
]
INFRA_REGEX = f"^({'|'.join(INFRA_PATTERNS)})$"
INFRA_REGEX = f"^({'|'.join(INFRA_NAMESPACES)})$"
# Namespaces allowed on control plane without counting as workloads
CP_ALLOWED_NS = INFRA_REGEX
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
@ -208,66 +209,7 @@ def namespace_ram_raw(scope_var):
def namespace_gpu_usage_instant(scope_var):
return gpu_usage_by_namespace(scope_var)
def jetson_gpu_util_by_node():
return 'max by (node) (jetson_gr3d_freq_percent{node!=""})'
def dcgm_gpu_util_by_node():
dcgm_pod = 'label_replace(DCGM_FI_DEV_GPU_UTIL, "pod", "$1", "Hostname", "(.*)")'
dcgm_ns = 'label_replace(' + dcgm_pod + ', "namespace", "monitoring", "", "")'
return (
"avg by (node) ("
f"{dcgm_ns} * on(namespace,pod) group_left(node) "
'kube_pod_info{namespace="monitoring"}'
")"
)
def gpu_util_by_node():
return f"{dcgm_gpu_util_by_node()} or {jetson_gpu_util_by_node()}"
def gpu_util_by_hostname():
return 'label_replace(' + gpu_util_by_node() + ', "Hostname", "$1", "node", "(.*)")'
def gpu_node_labels():
return 'kube_node_labels{label_accelerator=~".+"} or kube_node_labels{label_jetson="true"}'
def gpu_requests_by_namespace_node(scope_var):
return (
"sum by (namespace,node) ("
f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} '
"* on(namespace,pod) group_left(node) kube_pod_info "
f"* on(node) group_left() ({gpu_node_labels()})"
")"
)
def gpu_usage_by_namespace(scope_var):
requests_by_ns = gpu_requests_by_namespace_node(scope_var)
total_by_node = f"sum by (node) ({requests_by_ns})"
return (
"sum by (namespace) ("
f"({requests_by_ns}) / clamp_min({total_by_node}, 1) "
f"* on(node) group_left() ({gpu_util_by_node()})"
")"
)
def jetson_gpu_usage_by_namespace(scope_var):
requests_by_ns = jetson_gpu_requests(scope_var)
total_by_node = f"sum by (node) ({requests_by_ns})"
return (
"sum by (namespace) ("
f"({requests_by_ns}) / clamp_min({total_by_node}, 1) "
f"* on(node) group_left() {jetson_gpu_util_by_node()}"
")"
)
return f"sum(DCGM_FI_DEV_GPU_UTIL{{{namespace_gpu_selector(scope_var)}}}) by (namespace)"
def namespace_share_expr(resource_expr):
@ -287,7 +229,7 @@ def namespace_gpu_share_expr(scope_var):
usage = namespace_gpu_usage_instant(scope_var)
total = f"(sum({usage}) or on() vector(0))"
share = f"100 * ({usage}) / clamp_min({total}, 1)"
idle = 'label_replace(vector(100), "namespace", "idle", "", "") * scalar(' + total + " == bool 0)"
idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)"
return f"({share}) or ({idle})"
@ -377,76 +319,6 @@ NAMESPACE_SCOPE_WORKLOAD = f'namespace!~"{INFRA_REGEX}"'
NAMESPACE_SCOPE_ALL = 'namespace=~".*"'
NAMESPACE_SCOPE_INFRA = f'namespace=~"{INFRA_REGEX}"'
NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"]
GLUE_LABEL = 'label_atlas_bstein_dev_glue="true"'
GLUE_JOBS = f"kube_cronjob_labels{{{GLUE_LABEL}}}"
GLUE_FILTER = f"and on(namespace,cronjob) {GLUE_JOBS}"
GLUE_LAST_SUCCESS = f"(kube_cronjob_status_last_successful_time {GLUE_FILTER})"
GLUE_LAST_SCHEDULE = f"(kube_cronjob_status_last_schedule_time {GLUE_FILTER})"
GLUE_SUSPENDED = f"(kube_cronjob_spec_suspend {GLUE_FILTER}) == 1"
GLUE_ACTIVE = f"(kube_cronjob_status_active {GLUE_FILTER})"
GLUE_LAST_SUCCESS_AGE = f"(time() - {GLUE_LAST_SUCCESS})"
GLUE_LAST_SCHEDULE_AGE = f"(time() - {GLUE_LAST_SCHEDULE})"
GLUE_LAST_SUCCESS_AGE_HOURS = f"({GLUE_LAST_SUCCESS_AGE}) / 3600"
GLUE_LAST_SCHEDULE_AGE_HOURS = f"({GLUE_LAST_SCHEDULE_AGE}) / 3600"
GLUE_STALE_WINDOW_SEC = 36 * 3600
GLUE_STALE = f"({GLUE_LAST_SUCCESS_AGE} > bool {GLUE_STALE_WINDOW_SEC})"
GLUE_MISSING = f"({GLUE_JOBS} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time)"
GLUE_STALE_ACTIVE = f"({GLUE_STALE} unless on(namespace,cronjob) {GLUE_SUSPENDED})"
GLUE_MISSING_ACTIVE = f"({GLUE_MISSING} unless on(namespace,cronjob) {GLUE_SUSPENDED})"
GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE})) or on() vector(0)"
GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE}) or on() vector(0)"
GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED}) or on() vector(0)"
ARIADNE_TASK_ERRORS_RANGE = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[$__range]))'
ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))'
ARIADNE_TASK_ERRORS_1H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[1h]))'
ARIADNE_TASK_ERRORS_30D = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[30d]))'
ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))'
ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))'
ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))'
ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))'
ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))'
ARIADNE_TASK_ATTEMPTS_SERIES = 'sum(increase(ariadne_task_runs_total[$__interval]))'
ARIADNE_TASK_FAILURES_SERIES = 'sum(increase(ariadne_task_runs_total{status="error"}[$__interval]))'
ARIADNE_TASK_WARNINGS_SERIES = (
'sum(increase(ariadne_task_runs_total{status!~"ok|error"}[$__interval])) or on() vector(0)'
)
ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600"
ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600"
ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS = (
"(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600"
)
ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = (
"(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600"
)
ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}'
ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}'
ARIADNE_TEST_SUCCESS_RATE = (
"100 * "
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[30d])) '
"/ clamp_min("
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[30d])), 1)'
)
ARIADNE_TEST_FAILURES_24H = (
'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))'
)
POSTGRES_CONN_USED = (
'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") '
'or label_replace(max(pg_settings_max_connections), "conn", "max", "__name__", ".*")'
)
POSTGRES_CONN_HOTTEST = 'topk(1, sum by (datname) (pg_stat_activity_count))'
ONEOFF_JOB_OWNER = (
'label_replace(kube_job_owner{owner_kind="CronJob"}, "owner_name", "$1", "job_name", "(.*)")'
)
ONEOFF_JOB_PODS = f'(kube_pod_owner{{owner_kind="Job"}} unless on(namespace, owner_name) {ONEOFF_JOB_OWNER})'
ONEOFF_JOB_POD_AGE_HOURS = (
'((time() - kube_pod_start_time{pod!=""}) / 3600) '
f'* on(namespace,pod) group_left(owner_name) {ONEOFF_JOB_PODS} '
'* on(namespace,pod) group_left(phase) '
'max by (namespace,pod,phase) (kube_pod_status_phase{phase=~"Running|Succeeded"})'
)
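# Rough reading of the one-off job expressions above: ONEOFF_JOB_PODS keeps only pods
# whose owning Job has no CronJob parent (i.e. ad-hoc Jobs), and ONEOFF_JOB_POD_AGE_HOURS
# reports their age in hours for pods still in the Running or Succeeded phase; the
# "One-off Job Pods (age hours)" bar gauges below consume this expression.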
GLUE_LAST_SUCCESS_RANGE_HOURS = f"(time() - max_over_time({GLUE_LAST_SUCCESS}[$__range])) / 3600"
GLUE_LAST_SCHEDULE_RANGE_HOURS = f"(time() - max_over_time({GLUE_LAST_SCHEDULE}[$__range])) / 3600"
GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
GPU_NODE_REGEX = "|".join(GPU_NODES)
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
@ -624,7 +496,6 @@ def timeseries_panel(
grid,
*,
unit="none",
max_value=None,
legend=None,
legend_display="table",
legend_placement="bottom",
@ -649,8 +520,6 @@ def timeseries_panel(
"tooltip": {"mode": "multi"},
},
}
if max_value is not None:
panel["fieldConfig"]["defaults"]["max"] = max_value
if legend:
panel["targets"][0]["legendFormat"] = legend
if legend_calcs:
@ -802,22 +671,13 @@ def bargauge_panel(
grid,
*,
unit="none",
legend=None,
links=None,
limit=None,
sort_order="desc",
thresholds=None,
decimals=None,
instant=False,
overrides=None,
):
"""Return a bar gauge panel with label-aware reduction."""
cleaned_expr = expr.strip()
if not cleaned_expr.startswith(("sort(", "sort_desc(")):
if sort_order == "desc":
expr = f"sort_desc({expr})"
elif sort_order == "asc":
expr = f"sort({expr})"
panel = {
"id": panel_id,
"type": "bargauge",
@ -825,12 +685,7 @@ def bargauge_panel(
"datasource": PROM_DS,
"gridPos": grid,
"targets": [
{
"expr": expr,
"refId": "A",
"legendFormat": legend or "{{node}}",
**({"instant": True} if instant else {}),
}
{"expr": expr, "refId": "A", "legendFormat": "{{node}}", **({"instant": True} if instant else {})}
],
"fieldConfig": {
"defaults": {
@ -860,8 +715,6 @@ def bargauge_panel(
},
},
}
if overrides:
panel["fieldConfig"]["overrides"].extend(overrides)
if decimals is not None:
panel["fieldConfig"]["defaults"]["decimals"] = decimals
if links:
@ -870,7 +723,7 @@ def bargauge_panel(
panel["transformations"] = [
{
"id": "sortBy",
"options": {"fields": ["Value"], "order": sort_order},
"options": {"fields": ["Value"], "order": "desc"},
}
]
if limit:
@ -910,15 +763,6 @@ def build_overview():
{"color": "red", "value": 3},
],
}
age_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 6},
{"color": "orange", "value": 24},
{"color": "red", "value": 48},
],
}
row1_stats = [
{
@ -1121,7 +965,7 @@ def build_overview():
30,
"Mail Sent (1d)",
'max(postmark_outbound_sent{window="1d"})',
{"h": 3, "w": 4, "x": 0, "y": 8},
{"h": 2, "w": 6, "x": 0, "y": 8},
unit="none",
links=link_to("atlas-mail"),
)
@ -1132,7 +976,7 @@ def build_overview():
"type": "stat",
"title": "Mail Bounces (1d)",
"datasource": PROM_DS,
"gridPos": {"h": 3, "w": 4, "x": 8, "y": 8},
"gridPos": {"h": 2, "w": 6, "x": 12, "y": 8},
"targets": [
{
"expr": 'max(postmark_outbound_bounce_rate{window="1d"})',
@ -1178,7 +1022,7 @@ def build_overview():
32,
"Mail Success Rate (1d)",
'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)',
{"h": 3, "w": 4, "x": 4, "y": 8},
{"h": 2, "w": 6, "x": 6, "y": 8},
unit="percent",
thresholds=mail_success_thresholds,
decimals=1,
@ -1190,38 +1034,13 @@ def build_overview():
33,
"Mail Limit Used (30d)",
"max(postmark_sending_limit_used_percent)",
{"h": 3, "w": 4, "x": 12, "y": 8},
{"h": 2, "w": 6, "x": 18, "y": 8},
unit="percent",
thresholds=mail_limit_thresholds,
decimals=1,
links=link_to("atlas-mail"),
)
)
panels.append(
stat_panel(
34,
"Postgres Connections Used",
POSTGRES_CONN_USED,
{"h": 3, "w": 4, "x": 16, "y": 8},
decimals=0,
text_mode="name_and_value",
legend="{{conn}}",
instant=True,
)
)
panels.append(
stat_panel(
35,
"Postgres Hottest Connections",
POSTGRES_CONN_HOTTEST,
{"h": 3, "w": 4, "x": 20, "y": 8},
unit="none",
decimals=0,
text_mode="name_and_value",
legend="{{datname}}",
instant=True,
)
)
storage_panels = [
(23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"),
@ -1235,104 +1054,13 @@ def build_overview():
panel_id,
title,
expr,
{"h": 3, "w": 6, "x": 6 * idx, "y": 11},
{"h": 6, "w": 6, "x": 6 * idx, "y": 10},
unit=unit,
thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
links=link_to("atlas-storage"),
)
)
panels.append(
bargauge_panel(
40,
"One-off Job Pods (age hours)",
ONEOFF_JOB_POD_AGE_HOURS,
{"h": 6, "w": 6, "x": 0, "y": 14},
unit="h",
instant=True,
legend="{{namespace}}/{{pod}}",
thresholds=age_thresholds,
limit=8,
decimals=2,
)
)
panels.append(
{
"id": 41,
"type": "timeseries",
"title": "Ariadne Attempts / Failures",
"datasource": PROM_DS,
"gridPos": {"h": 6, "w": 6, "x": 6, "y": 14},
"targets": [
{"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"},
{"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"},
],
"fieldConfig": {
"defaults": {"unit": "none"},
"overrides": [
{
"matcher": {"id": "byName", "options": "Attempts"},
"properties": [
{"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}}
],
},
{
"matcher": {"id": "byName", "options": "Failures"},
"properties": [
{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}
],
},
],
},
"options": {
"legend": {"displayMode": "table", "placement": "right"},
"tooltip": {"mode": "multi"},
},
}
)
panels.append(
timeseries_panel(
42,
"Ariadne Test Success Rate",
ARIADNE_TEST_SUCCESS_RATE,
{"h": 6, "w": 6, "x": 12, "y": 14},
unit="percent",
max_value=100,
legend=None,
legend_display="list",
)
)
panels.append(
bargauge_panel(
43,
"Tests with Failures (24h)",
ARIADNE_TEST_FAILURES_24H,
{"h": 6, "w": 6, "x": 18, "y": 14},
unit="none",
instant=True,
legend="{{result}}",
overrides=[
{
"matcher": {"id": "byName", "options": "error"},
"properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}}],
},
{
"matcher": {"id": "byName", "options": "failed"},
"properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}],
},
],
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 5},
{"color": "red", "value": 10},
],
},
)
)
cpu_scope = "$namespace_scope_cpu"
gpu_scope = "$namespace_scope_gpu"
ram_scope = "$namespace_scope_ram"
@ -1342,9 +1070,9 @@ def build_overview():
11,
"Namespace CPU Share",
namespace_cpu_share_expr(cpu_scope),
{"h": 9, "w": 8, "x": 0, "y": 20},
{"h": 9, "w": 8, "x": 0, "y": 16},
links=namespace_scope_links("namespace_scope_cpu"),
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
description="Values are normalized within the selected scope; use panel links to switch scope.",
)
)
panels.append(
@ -1352,9 +1080,9 @@ def build_overview():
12,
"Namespace GPU Share",
namespace_gpu_share_expr(gpu_scope),
{"h": 9, "w": 8, "x": 8, "y": 20},
{"h": 9, "w": 8, "x": 8, "y": 16},
links=namespace_scope_links("namespace_scope_gpu"),
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
description="Values are normalized within the selected scope; use panel links to switch scope.",
)
)
panels.append(
@ -1362,9 +1090,9 @@ def build_overview():
13,
"Namespace RAM Share",
namespace_ram_share_expr(ram_scope),
{"h": 9, "w": 8, "x": 16, "y": 20},
{"h": 9, "w": 8, "x": 16, "y": 16},
links=namespace_scope_links("namespace_scope_ram"),
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
description="Values are normalized within the selected scope; use panel links to switch scope.",
)
)
@ -1374,7 +1102,7 @@ def build_overview():
14,
"Worker Node CPU",
node_cpu_expr(worker_filter),
{"h": 12, "w": 12, "x": 0, "y": 36},
{"h": 12, "w": 12, "x": 0, "y": 32},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
@ -1388,7 +1116,7 @@ def build_overview():
15,
"Worker Node RAM",
node_mem_expr(worker_filter),
{"h": 12, "w": 12, "x": 12, "y": 36},
{"h": 12, "w": 12, "x": 12, "y": 32},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
@ -1403,7 +1131,7 @@ def build_overview():
16,
"Control plane CPU",
node_cpu_expr(CONTROL_ALL_REGEX),
{"h": 10, "w": 12, "x": 0, "y": 48},
{"h": 10, "w": 12, "x": 0, "y": 44},
unit="percent",
legend="{{node}}",
legend_display="table",
@ -1415,7 +1143,7 @@ def build_overview():
17,
"Control plane RAM",
node_mem_expr(CONTROL_ALL_REGEX),
{"h": 10, "w": 12, "x": 12, "y": 48},
{"h": 10, "w": 12, "x": 12, "y": 44},
unit="percent",
legend="{{node}}",
legend_display="table",
@ -1428,7 +1156,7 @@ def build_overview():
28,
"Node Pod Share",
'(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100',
{"h": 10, "w": 12, "x": 0, "y": 58},
{"h": 10, "w": 12, "x": 0, "y": 54},
)
)
panels.append(
@ -1436,7 +1164,7 @@ def build_overview():
29,
"Top Nodes by Pod Count",
'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))',
{"h": 10, "w": 12, "x": 12, "y": 58},
{"h": 10, "w": 12, "x": 12, "y": 54},
unit="none",
limit=12,
decimals=0,
@ -1458,7 +1186,7 @@ def build_overview():
18,
"Cluster Ingress Throughput",
NET_INGRESS_EXPR,
{"h": 7, "w": 8, "x": 0, "y": 29},
{"h": 7, "w": 8, "x": 0, "y": 25},
unit="Bps",
legend="Ingress (Traefik)",
legend_display="list",
@ -1471,7 +1199,7 @@ def build_overview():
19,
"Cluster Egress Throughput",
NET_EGRESS_EXPR,
{"h": 7, "w": 8, "x": 8, "y": 29},
{"h": 7, "w": 8, "x": 8, "y": 25},
unit="Bps",
legend="Egress (Traefik)",
legend_display="list",
@ -1484,7 +1212,7 @@ def build_overview():
20,
"Intra-Cluster Throughput",
NET_INTERNAL_EXPR,
{"h": 7, "w": 8, "x": 16, "y": 29},
{"h": 7, "w": 8, "x": 16, "y": 25},
unit="Bps",
legend="Internal traffic",
legend_display="list",
@ -1498,7 +1226,7 @@ def build_overview():
21,
"Root Filesystem Usage",
root_usage_expr(),
{"h": 16, "w": 12, "x": 0, "y": 68},
{"h": 16, "w": 12, "x": 0, "y": 64},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
@ -1513,7 +1241,7 @@ def build_overview():
22,
"Nodes Closest to Full Root Disks",
f"topk(12, {root_usage_expr()})",
{"h": 16, "w": 12, "x": 12, "y": 68},
{"h": 16, "w": 12, "x": 12, "y": 64},
unit="percent",
thresholds=PERCENT_THRESHOLDS,
links=link_to("atlas-storage"),
@ -1999,7 +1727,7 @@ def build_storage_dashboard():
stat_panel(
31,
"Maintenance Cron Freshness (s)",
'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"})',
'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob=~"image-sweeper|grafana-smtp-sync"})',
{"h": 4, "w": 12, "x": 12, "y": 44},
unit="s",
thresholds={
@ -2408,285 +2136,6 @@ def build_mail_dashboard():
}
def build_jobs_dashboard():
panels = []
age_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 6},
{"color": "orange", "value": 24},
{"color": "red", "value": 48},
],
}
recent_error_thresholds = {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": 1},
{"color": "yellow", "value": 6},
{"color": "green", "value": 24},
],
}
task_error_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 3},
{"color": "red", "value": 5},
],
}
panels.append(
bargauge_panel(
1,
"Ariadne Task Errors (range)",
ARIADNE_TASK_ERRORS_RANGE,
{"h": 7, "w": 8, "x": 0, "y": 0},
unit="none",
instant=True,
legend="{{task}}",
thresholds=task_error_thresholds,
)
)
panels.append(
{
"id": 2,
"type": "timeseries",
"title": "Ariadne Attempts / Failures",
"datasource": PROM_DS,
"gridPos": {"h": 7, "w": 8, "x": 8, "y": 0},
"targets": [
{"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"},
{"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"},
],
"fieldConfig": {
"defaults": {"unit": "none"},
"overrides": [
{
"matcher": {"id": "byName", "options": "Attempts"},
"properties": [
{"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}}
],
},
{
"matcher": {"id": "byName", "options": "Failures"},
"properties": [
{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}
],
},
],
},
"options": {
"legend": {"displayMode": "table", "placement": "right"},
"tooltip": {"mode": "multi"},
},
}
)
panels.append(
bargauge_panel(
3,
"One-off Job Pods (age hours)",
ONEOFF_JOB_POD_AGE_HOURS,
{"h": 7, "w": 8, "x": 16, "y": 0},
unit="h",
instant=True,
legend="{{namespace}}/{{pod}}",
thresholds=age_thresholds,
limit=12,
decimals=2,
)
)
panels.append(
stat_panel(
4,
"Glue Jobs Stale (>36h)",
GLUE_STALE_COUNT,
{"h": 4, "w": 4, "x": 0, "y": 7},
unit="none",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 3},
],
},
)
)
panels.append(
stat_panel(
5,
"Glue Jobs Missing Success",
GLUE_MISSING_COUNT,
{"h": 4, "w": 4, "x": 4, "y": 7},
unit="none",
)
)
panels.append(
stat_panel(
6,
"Glue Jobs Suspended",
GLUE_SUSPENDED_COUNT,
{"h": 4, "w": 4, "x": 8, "y": 7},
unit="none",
)
)
panels.append(
stat_panel(
7,
"Ariadne Task Errors (1h)",
ARIADNE_TASK_ERRORS_1H_TOTAL,
{"h": 4, "w": 4, "x": 12, "y": 7},
unit="none",
)
)
panels.append(
stat_panel(
8,
"Ariadne Task Errors (24h)",
ARIADNE_TASK_ERRORS_24H_TOTAL,
{"h": 4, "w": 4, "x": 16, "y": 7},
unit="none",
)
)
panels.append(
stat_panel(
9,
"Ariadne Task Runs (1h)",
ARIADNE_TASK_RUNS_1H_TOTAL,
{"h": 4, "w": 4, "x": 20, "y": 7},
unit="none",
)
)
panels.append(
bargauge_panel(
10,
"Ariadne Schedule Last Error (hours ago)",
ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS,
{"h": 6, "w": 12, "x": 0, "y": 17},
unit="h",
instant=True,
legend="{{task}}",
thresholds=recent_error_thresholds,
decimals=2,
)
)
panels.append(
bargauge_panel(
11,
"Ariadne Schedule Last Success (hours ago)",
ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS,
{"h": 6, "w": 12, "x": 12, "y": 17},
unit="h",
instant=True,
legend="{{task}}",
thresholds=age_thresholds,
decimals=2,
)
)
panels.append(
bargauge_panel(
12,
"Glue Jobs Last Success (hours ago)",
GLUE_LAST_SUCCESS_RANGE_HOURS,
{"h": 6, "w": 12, "x": 0, "y": 23},
unit="h",
instant=True,
legend="{{namespace}}/{{cronjob}}",
thresholds=age_thresholds,
decimals=2,
)
)
panels.append(
bargauge_panel(
13,
"Glue Jobs Last Schedule (hours ago)",
GLUE_LAST_SCHEDULE_RANGE_HOURS,
{"h": 6, "w": 12, "x": 12, "y": 23},
unit="h",
instant=True,
legend="{{namespace}}/{{cronjob}}",
thresholds=age_thresholds,
decimals=2,
)
)
panels.append(
bargauge_panel(
14,
"Ariadne Task Errors (1h)",
ARIADNE_TASK_ERRORS_1H,
{"h": 6, "w": 12, "x": 0, "y": 29},
unit="none",
instant=True,
legend="{{task}}",
thresholds=task_error_thresholds,
)
)
panels.append(
bargauge_panel(
15,
"Ariadne Task Errors (30d)",
ARIADNE_TASK_ERRORS_30D,
{"h": 6, "w": 12, "x": 12, "y": 29},
unit="none",
instant=True,
legend="{{task}}",
thresholds=task_error_thresholds,
)
)
panels.append(
bargauge_panel(
16,
"Ariadne Access Requests",
ARIADNE_ACCESS_REQUESTS,
{"h": 6, "w": 8, "x": 0, "y": 11},
unit="none",
instant=True,
legend="{{status}}",
)
)
panels.append(
stat_panel(
17,
"Ariadne CI Coverage (%)",
ARIADNE_CI_COVERAGE,
{"h": 6, "w": 4, "x": 8, "y": 11},
unit="percent",
decimals=1,
instant=True,
legend="{{branch}}",
)
)
panels.append(
table_panel(
18,
"Ariadne CI Tests (latest)",
ARIADNE_CI_TESTS,
{"h": 6, "w": 12, "x": 12, "y": 11},
unit="none",
transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}],
instant=True,
)
)
return {
"uid": "atlas-jobs",
"title": "Atlas Jobs",
"folderUid": PRIVATE_FOLDER,
"editable": True,
"panels": panels,
"time": {"from": "now-7d", "to": "now"},
"annotations": {"list": []},
"schemaVersion": 39,
"style": "dark",
"tags": ["atlas", "jobs", "glue"],
}
def build_gpu_dashboard():
panels = []
gpu_scope = "$namespace_scope_gpu"
@ -2697,7 +2146,7 @@ def build_gpu_dashboard():
namespace_gpu_share_expr(gpu_scope),
{"h": 8, "w": 12, "x": 0, "y": 0},
links=namespace_scope_links("namespace_scope_gpu"),
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
description="Values are normalized within the selected scope; use panel links to switch scope.",
)
)
panels.append(
@ -2716,7 +2165,7 @@ def build_gpu_dashboard():
timeseries_panel(
3,
"GPU Util by Node",
gpu_util_by_hostname(),
'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})',
{"h": 8, "w": 12, "x": 0, "y": 8},
unit="percent",
legend="{{Hostname}}",
@ -2780,10 +2229,6 @@ DASHBOARDS = {
"builder": build_mail_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-mail.yaml",
},
"atlas-jobs": {
"builder": build_jobs_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-jobs.yaml",
},
"atlas-gpu": {
"builder": build_gpu_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gpu.yaml",

View File

@ -20,13 +20,11 @@ import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
import shutil
from typing import Any, Iterable
import yaml
REPO_ROOT = Path(__file__).resolve().parents[1]
DASHBOARD_DIR = REPO_ROOT / "services" / "monitoring" / "dashboards"
CLUSTER_SCOPED_KINDS = {
"Namespace",
@ -62,70 +60,6 @@ def _run(cmd: list[str], *, cwd: Path) -> str:
return res.stdout
def _sync_tree(source: Path, dest: Path) -> None:
if dest.exists():
shutil.rmtree(dest)
shutil.copytree(source, dest)
def _iter_dashboard_panels(dashboard: dict[str, Any]) -> Iterable[dict[str, Any]]:
panels = dashboard.get("panels") if isinstance(dashboard.get("panels"), list) else []
for panel in panels:
if not isinstance(panel, dict):
continue
if panel.get("type") == "row" and isinstance(panel.get("panels"), list):
yield from _iter_dashboard_panels({"panels": panel.get("panels")})
continue
yield panel
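# Note on the generator above: Grafana row panels nest their children under a "panels"
# key of their own, so rows are expanded recursively and only leaf panels are yielded.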
def _extract_metrics_index(dashboard_dir: Path) -> list[dict[str, Any]]:
index: list[dict[str, Any]] = []
for path in sorted(dashboard_dir.glob("*.json")):
try:
data = json.loads(path.read_text(encoding="utf-8"))
except json.JSONDecodeError:
continue
if not isinstance(data, dict):
continue
dash_title = data.get("title") or path.stem
dash_tags = data.get("tags") or []
for panel in _iter_dashboard_panels(data):
targets = panel.get("targets")
if not isinstance(targets, list):
continue
exprs: list[str] = []
for target in targets:
if not isinstance(target, dict):
continue
expr = target.get("expr")
if isinstance(expr, str) and expr.strip():
exprs.append(expr.strip())
if not exprs:
continue
datasource = panel.get("datasource") or {}
if isinstance(datasource, dict):
ds_uid = datasource.get("uid")
ds_type = datasource.get("type")
else:
ds_uid = None
ds_type = None
index.append(
{
"dashboard": dash_title,
"panel_title": panel.get("title") or "",
"panel_id": panel.get("id"),
"panel_type": panel.get("type"),
"description": panel.get("description") or "",
"tags": dash_tags,
"datasource_uid": ds_uid,
"datasource_type": ds_type,
"exprs": exprs,
}
)
return index
def kustomize_build(path: Path) -> str:
rel = path.relative_to(REPO_ROOT)
try:
@ -538,11 +472,6 @@ def main() -> int:
action="store_true",
help="Write generated files (otherwise just print a summary).",
)
ap.add_argument(
"--sync-comms",
action="store_true",
help="Mirror rendered knowledge into services/comms/knowledge for atlasbot.",
)
args = ap.parse_args()
out_dir = REPO_ROOT / args.out
@ -575,11 +504,8 @@ def main() -> int:
summary_path = out_dir / "catalog" / "atlas-summary.json"
diagram_path = out_dir / "diagrams" / "atlas-http.mmd"
runbooks_json_path = out_dir / "catalog" / "runbooks.json"
metrics_json_path = out_dir / "catalog" / "metrics.json"
catalog_rel = catalog_path.relative_to(REPO_ROOT).as_posix()
catalog_path.write_text(
f"# {catalog_rel}\n"
"# Generated by scripts/knowledge_render_atlas.py (do not edit by hand)\n"
+ yaml.safe_dump(catalog, sort_keys=False),
encoding="utf-8",
@ -589,14 +515,9 @@ def main() -> int:
diagram_path.write_text(diagram, encoding="utf-8")
# Render runbooks into JSON for lightweight, dependency-free consumption in-cluster.
runbook_dirs = [
out_dir / "runbooks",
out_dir / "software",
]
runbooks_dir = out_dir / "runbooks"
runbooks: list[dict[str, Any]] = []
for runbooks_dir in runbook_dirs:
if not runbooks_dir.exists():
continue
if runbooks_dir.exists():
for md_file in sorted(runbooks_dir.glob("*.md")):
raw = md_file.read_text(encoding="utf-8")
fm: dict[str, Any] = {}
@ -620,22 +541,12 @@ def main() -> int:
}
)
runbooks_json_path.write_text(json.dumps(runbooks, indent=2, sort_keys=False) + "\n", encoding="utf-8")
metrics_index = _extract_metrics_index(DASHBOARD_DIR)
metrics_json_path.write_text(
json.dumps(metrics_index, indent=2, sort_keys=False) + "\n", encoding="utf-8"
)
print(f"Wrote {catalog_path.relative_to(REPO_ROOT)}")
print(f"Wrote {catalog_json_path.relative_to(REPO_ROOT)}")
print(f"Wrote {summary_path.relative_to(REPO_ROOT)}")
print(f"Wrote {diagram_path.relative_to(REPO_ROOT)}")
print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}")
print(f"Wrote {metrics_json_path.relative_to(REPO_ROOT)}")
if args.sync_comms:
comms_dir = REPO_ROOT / "services" / "comms" / "knowledge"
_sync_tree(out_dir, comms_dir)
print(f"Synced {out_dir.relative_to(REPO_ROOT)} -> {comms_dir.relative_to(REPO_ROOT)}")
return 0

View File

@ -7,8 +7,6 @@ test accounts created via the bstein-dev-home onboarding portal.
Targets (best-effort):
- Keycloak users in realm "atlas"
- Atlas portal Postgres rows (access_requests + dependent tables)
- Mailu mailboxes created for test users
- Nextcloud Mail accounts created for test users
- Vaultwarden users/invites created by the portal
Safety:
@ -58,19 +56,6 @@ class VaultwardenUser:
status: int
@dataclass(frozen=True)
class MailuUser:
email: str
localpart: str
domain: str
@dataclass(frozen=True)
class NextcloudMailAccount:
account_id: str
email: str
def _run(cmd: list[str], *, input_bytes: bytes | None = None) -> str:
proc = subprocess.run(
cmd,
@ -85,19 +70,6 @@ def _run(cmd: list[str], *, input_bytes: bytes | None = None) -> str:
return proc.stdout.decode("utf-8", errors="replace")
def _run_capture(cmd: list[str], *, input_bytes: bytes | None = None) -> tuple[int, str, str]:
proc = subprocess.run(
cmd,
input=input_bytes,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=False,
)
stdout = proc.stdout.decode("utf-8", errors="replace")
stderr = proc.stderr.decode("utf-8", errors="replace")
return proc.returncode, stdout, stderr
def _kubectl_get_secret_value(namespace: str, name: str, key: str) -> str:
raw_b64 = _run(
[
@ -138,21 +110,6 @@ def _kubectl_first_pod(namespace: str) -> str:
return pod_name
def _kubectl_exec(namespace: str, target: str, cmd: list[str]) -> tuple[int, str, str]:
return _run_capture(
[
"kubectl",
"-n",
namespace,
"exec",
"-i",
target,
"--",
*cmd,
]
)
def _validate_prefixes(prefixes: list[str]) -> list[str]:
cleaned: list[str] = []
for prefix in prefixes:
@ -230,62 +187,6 @@ def _keycloak_delete_user(server: str, realm: str, token: str, user_id: str) ->
raise
def _sql_quote(value: str) -> str:
return "'" + value.replace("'", "''") + "'"
def _psql_exec(db_name: str, sql: str, *, user: str = "postgres") -> str:
postgres_pod = _kubectl_first_pod("postgres")
return _run(
[
"kubectl",
"-n",
"postgres",
"exec",
"-i",
postgres_pod,
"--",
"psql",
"-U",
user,
"-d",
db_name,
"-c",
sql,
]
)
def _psql_tsv(db_name: str, sql: str, *, user: str = "postgres") -> list[list[str]]:
postgres_pod = _kubectl_first_pod("postgres")
out = _run(
[
"kubectl",
"-n",
"postgres",
"exec",
"-i",
postgres_pod,
"--",
"psql",
"-U",
user,
"-d",
db_name,
"-At",
"-F",
"\t",
"-c",
sql,
]
)
rows: list[list[str]] = []
for line in out.splitlines():
parts = line.split("\t")
rows.append(parts)
return rows
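# Note on the helper above: `psql -At -F "\t"` prints unaligned, tuples-only output with
# tab separators, so each stdout line splits cleanly into one row of column strings.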
def _psql_json(portal_db_url: str, sql: str) -> list[dict[str, Any]]:
postgres_pod = _kubectl_first_pod("postgres")
out = _run(
@ -355,89 +256,6 @@ def _portal_delete_requests(portal_db_url: str, prefixes: list[str]) -> int:
return int(match.group(1)) if match else 0
def _mailu_list_users(prefixes: list[str], domain: str, db_name: str, protected: set[str]) -> list[MailuUser]:
if not prefixes or not domain:
return []
clauses = " OR ".join([f"localpart LIKE '{p}%'" for p in prefixes])
sql = (
'SELECT email, localpart, domain_name '
'FROM "user" '
f"WHERE domain_name = {_sql_quote(domain)} AND ({clauses}) "
"ORDER BY email;"
)
rows = _psql_tsv(db_name, sql)
users: list[MailuUser] = []
for row in rows:
if len(row) < 3:
continue
email = row[0].strip()
if not email or email in protected:
continue
users.append(MailuUser(email=email, localpart=row[1].strip(), domain=row[2].strip()))
return users
def _mailu_delete_users(db_name: str, emails: list[str]) -> int:
if not emails:
return 0
email_list = ",".join(_sql_quote(e) for e in emails)
sql = f'DELETE FROM "user" WHERE email IN ({email_list});'
out = _psql_exec(db_name, sql)
match = re.search(r"DELETE\\s+(\\d+)", out)
return int(match.group(1)) if match else 0
_NEXTCLOUD_ACCOUNT_RE = re.compile(r"^Account\s+(\d+):")
_EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+")
def _nextcloud_exec(cmd: list[str]) -> tuple[int, str, str]:
namespace = os.getenv("NEXTCLOUD_NAMESPACE", "nextcloud").strip() or "nextcloud"
target = os.getenv("NEXTCLOUD_EXEC_TARGET", "deploy/nextcloud").strip() or "deploy/nextcloud"
return _kubectl_exec(namespace, target, cmd)
def _parse_nextcloud_mail_accounts(export_output: str) -> list[NextcloudMailAccount]:
accounts: list[NextcloudMailAccount] = []
current_id = ""
for line in export_output.splitlines():
line = line.strip()
if not line:
continue
match = _NEXTCLOUD_ACCOUNT_RE.match(line)
if match:
current_id = match.group(1)
continue
if not current_id or "@" not in line:
continue
email_match = _EMAIL_RE.search(line)
if not email_match:
continue
accounts.append(NextcloudMailAccount(account_id=current_id, email=email_match.group(0)))
current_id = ""
return accounts
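# Sketch of the export format the parser above expects (assumed shape, not confirmed by
# this repo): `occ mail:account:export <user>` prints blocks roughly like
#   Account 42:
#   - E-Mail: someone@example.com
# and each "Account <id>:" header is paired with the first following line that contains
# an email address; headers with no email line after them are skipped.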
def _nextcloud_list_mail_accounts(username: str) -> list[NextcloudMailAccount]:
occ_path = os.getenv("NEXTCLOUD_OCC_PATH", "/var/www/html/occ").strip() or "/var/www/html/occ"
rc, out, err = _nextcloud_exec(["php", occ_path, "mail:account:export", username])
if rc != 0:
message = (err or out).strip()
lowered = message.lower()
if any(token in lowered for token in ("not found", "does not exist", "no such user", "unknown user")):
return []
raise RuntimeError(f"nextcloud mail export failed for {username}: {message}")
return _parse_nextcloud_mail_accounts(out)
def _nextcloud_delete_mail_account(account_id: str) -> None:
occ_path = os.getenv("NEXTCLOUD_OCC_PATH", "/var/www/html/occ").strip() or "/var/www/html/occ"
rc, out, err = _nextcloud_exec(["php", occ_path, "mail:account:delete", "-q", account_id])
if rc != 0:
message = (err or out).strip()
raise RuntimeError(f"nextcloud mail delete failed for account {account_id}: {message}")
def _vaultwarden_admin_cookie(admin_token: str, base_url: str) -> str:
data = urllib.parse.urlencode({"token": admin_token}).encode("utf-8")
req = urllib.request.Request(f"{base_url}/admin", data=data, method="POST")
@ -538,8 +356,6 @@ def main() -> int:
),
)
parser.add_argument("--skip-keycloak", action="store_true", help="Skip Keycloak user deletion.")
parser.add_argument("--skip-mailu", action="store_true", help="Skip Mailu mailbox cleanup.")
parser.add_argument("--skip-nextcloud-mail", action="store_true", help="Skip Nextcloud Mail account cleanup.")
parser.add_argument("--skip-portal-db", action="store_true", help="Skip portal DB cleanup.")
parser.add_argument("--skip-vaultwarden", action="store_true", help="Skip Vaultwarden cleanup.")
parser.add_argument(
@ -548,18 +364,6 @@ def main() -> int:
default=[],
help="Keycloak usernames that must never be deleted (repeatable).",
)
parser.add_argument(
"--protect-mailu-email",
action="append",
default=[],
help="Mailu emails that must never be deleted (repeatable).",
)
parser.add_argument(
"--protect-nextcloud-username",
action="append",
default=[],
help="Nextcloud usernames that must never be touched (repeatable).",
)
parser.add_argument(
"--protect-vaultwarden-email",
action="append",
@ -572,11 +376,7 @@ def main() -> int:
apply = bool(args.apply)
expected_confirm = ",".join(prefixes)
protected_keycloak = {"bstein", "robotuser", *[u.strip() for u in args.protect_keycloak_username if u.strip()]}
protected_mailu = {e.strip() for e in args.protect_mailu_email if e.strip()}
protected_nextcloud = {u.strip() for u in args.protect_nextcloud_username if u.strip()}
protected_vaultwarden = {e.strip() for e in args.protect_vaultwarden_email if e.strip()}
mailu_domain = os.getenv("MAILU_DOMAIN", "bstein.dev").strip() or "bstein.dev"
mailu_db_name = os.getenv("MAILU_DB_NAME", "mailu").strip() or "mailu"
if apply and args.confirm != expected_confirm:
raise SystemExit(
@ -588,29 +388,23 @@ def main() -> int:
print("mode:", "APPLY (destructive)" if apply else "DRY RUN (no changes)")
if protected_keycloak:
print("protected keycloak usernames:", ", ".join(sorted(protected_keycloak)))
if protected_mailu:
print("protected mailu emails:", ", ".join(sorted(protected_mailu)))
if protected_nextcloud:
print("protected nextcloud usernames:", ", ".join(sorted(protected_nextcloud)))
if protected_vaultwarden:
print("protected vaultwarden emails:", ", ".join(sorted(protected_vaultwarden)))
print()
portal_requests: list[PortalRequestRow] = []
if not args.skip_portal_db:
portal_db_url = _kubectl_get_secret_value("bstein-dev-home", "atlas-portal-db", "PORTAL_DATABASE_URL")
portal_requests = _portal_list_requests(portal_db_url, prefixes)
print(f"Portal DB: {len(portal_requests)} access_requests matched")
for row in portal_requests[:50]:
requests = _portal_list_requests(portal_db_url, prefixes)
print(f"Portal DB: {len(requests)} access_requests matched")
for row in requests[:50]:
print(f" {row.request_code}\t{row.status}\t{row.username}")
if len(portal_requests) > 50:
print(f" ... and {len(portal_requests) - 50} more")
if apply and portal_requests:
if len(requests) > 50:
print(f" ... and {len(requests) - 50} more")
if apply and requests:
deleted = _portal_delete_requests(portal_db_url, prefixes)
print(f"Portal DB: deleted {deleted} access_requests (cascade removes tasks/steps/artifacts).")
print()
keycloak_users: list[KeycloakUser] = []
if not args.skip_keycloak:
kc_server = os.getenv("KEYCLOAK_PUBLIC_URL", "https://sso.bstein.dev").rstrip("/")
kc_realm = os.getenv("KEYCLOAK_REALM", "atlas")
@ -627,63 +421,18 @@ def main() -> int:
if user.username in protected_keycloak:
continue
found[user.user_id] = user
keycloak_users = list(found.values())
keycloak_users.sort(key=lambda u: u.username)
print(f"Keycloak: {len(keycloak_users)} users matched")
for user in keycloak_users[:50]:
users = list(found.values())
users.sort(key=lambda u: u.username)
print(f"Keycloak: {len(users)} users matched")
for user in users[:50]:
email = user.email or "-"
print(f" {user.username}\t{email}\t{user.user_id}")
if len(keycloak_users) > 50:
print(f" ... and {len(keycloak_users) - 50} more")
if apply and keycloak_users:
for user in keycloak_users:
if len(users) > 50:
print(f" ... and {len(users) - 50} more")
if apply and users:
for user in users:
_keycloak_delete_user(kc_server, kc_realm, token, user.user_id)
print(f"Keycloak: deleted {len(keycloak_users)} users.")
print()
if not args.skip_mailu:
mailu_users = _mailu_list_users(prefixes, mailu_domain, mailu_db_name, protected_mailu)
print(f"Mailu: {len(mailu_users)} mailboxes matched (domain={mailu_domain})")
for user in mailu_users[:50]:
print(f" {user.email}\t{user.localpart}\t{user.domain}")
if len(mailu_users) > 50:
print(f" ... and {len(mailu_users) - 50} more")
if apply and mailu_users:
deleted = _mailu_delete_users(mailu_db_name, [u.email for u in mailu_users])
print(f"Mailu: deleted {deleted} mailboxes.")
print()
if not args.skip_nextcloud_mail:
nextcloud_usernames = {row.username for row in portal_requests if row.username}
nextcloud_usernames.update({u.username for u in keycloak_users if u.username})
nextcloud_usernames = {u for u in nextcloud_usernames if _starts_with_any(u, prefixes)}
nextcloud_usernames = {u for u in nextcloud_usernames if u not in protected_nextcloud}
matches: list[tuple[str, NextcloudMailAccount]] = []
for username in sorted(nextcloud_usernames):
accounts = _nextcloud_list_mail_accounts(username)
for account in accounts:
email = account.email.strip()
if not email:
continue
if not email.lower().endswith(f"@{mailu_domain.lower()}"):
continue
localpart = email.split("@", 1)[0]
if not _starts_with_any(localpart, prefixes):
continue
if email in protected_mailu:
continue
matches.append((username, account))
print(f"Nextcloud Mail: {len(matches)} accounts matched")
for username, account in matches[:50]:
print(f" {username}\t{account.account_id}\t{account.email}")
if len(matches) > 50:
print(f" ... and {len(matches) - 50} more")
if apply and matches:
for _, account in matches:
_nextcloud_delete_mail_account(account.account_id)
print(f"Nextcloud Mail: deleted {len(matches)} accounts.")
print(f"Keycloak: deleted {len(users)} users.")
print()
if not args.skip_vaultwarden:

View File

@ -55,11 +55,11 @@ class _FakeResponse:
class _FakeSession:
def __init__(self, put_resp, get_resps):
def __init__(self, put_resp, get_resp):
self.put_resp = put_resp
self.get_resps = list(get_resps)
self.get_resp = get_resp
self.put_called = False
self.get_calls = 0
self.get_called = False
def post(self, *args, **kwargs):
return _FakeResponse({"access_token": "dummy"})
@ -69,26 +69,22 @@ class _FakeSession:
return self.put_resp
def get(self, *args, **kwargs):
self.get_calls += 1
if self.get_resps:
return self.get_resps.pop(0)
return _FakeResponse({})
self.get_called = True
return self.get_resp
def test_kc_update_attributes_succeeds(monkeypatch):
sync = load_sync_module(monkeypatch)
current_resp = _FakeResponse({"attributes": {}})
ok_resp = _FakeResponse({"attributes": {"mailu_app_password": ["abc"]}})
sync.SESSION = _FakeSession(_FakeResponse({}), [current_resp, ok_resp])
sync.SESSION = _FakeSession(_FakeResponse({}), ok_resp)
sync.kc_update_attributes("token", {"id": "u1", "username": "u1"}, {"mailu_app_password": "abc"})
assert sync.SESSION.put_called and sync.SESSION.get_calls == 2
assert sync.SESSION.put_called and sync.SESSION.get_called
def test_kc_update_attributes_raises_without_attribute(monkeypatch):
sync = load_sync_module(monkeypatch)
current_resp = _FakeResponse({"attributes": {}})
missing_attr_resp = _FakeResponse({"attributes": {}}, status=200)
sync.SESSION = _FakeSession(_FakeResponse({}), [current_resp, missing_attr_resp])
sync.SESSION = _FakeSession(_FakeResponse({}), missing_attr_resp)
with pytest.raises(Exception):
sync.kc_update_attributes("token", {"id": "u1", "username": "u1"}, {"mailu_app_password": "abc"})
@ -148,25 +144,9 @@ def test_main_generates_password_and_upserts(monkeypatch):
sync = load_sync_module(monkeypatch)
monkeypatch.setattr(sync.bcrypt_sha256, "hash", lambda password: f"hash:{password}")
users = [
{
"id": "u1",
"username": "user1",
"email": "user1@example.com",
"attributes": {"mailu_enabled": ["true"]},
},
{
"id": "u2",
"username": "user2",
"email": "user2@example.com",
"attributes": {"mailu_app_password": ["keepme"], "mailu_enabled": ["true"]},
},
{
"id": "u3",
"username": "user3",
"email": "user3@example.com",
"attributes": {"mailu_email": ["user3@example.com"]},
},
{"id": "u4", "username": "user4", "email": "user4@other.com", "attributes": {}},
{"id": "u1", "username": "user1", "email": "user1@example.com", "attributes": {}},
{"id": "u2", "username": "user2", "email": "user2@example.com", "attributes": {"mailu_app_password": ["keepme"]}},
{"id": "u3", "username": "user3", "email": "user3@other.com", "attributes": {}},
]
updated = []
@ -205,6 +185,6 @@ def test_main_generates_password_and_upserts(monkeypatch):
sync.main()
# Only mail-enabled users (or legacy users with a mailbox) are synced and backfilled.
# Always backfill mailu_email, even if Keycloak recovery email is external.
assert len(updated) == 3
assert conns and len(conns[0]._cursor.executions) == 3

View File

@ -20,9 +20,8 @@ spec:
labels:
app: ollama
annotations:
ai.bstein.dev/model: qwen2.5:14b-instruct-q4_0
ai.bstein.dev/gpu: GPU pool (titan-22/24)
ai.bstein.dev/restartedAt: "2026-01-26T12:00:00Z"
ai.bstein.dev/model: qwen2.5-coder:7b-instruct-q4_0
ai.bstein.dev/gpu: GPU pool (titan-20/21/22/24)
spec:
affinity:
nodeAffinity:
@ -32,6 +31,8 @@ spec:
- key: kubernetes.io/hostname
operator: In
values:
- titan-20
- titan-21
- titan-22
- titan-24
runtimeClassName: nvidia
@ -41,7 +42,7 @@ spec:
claimName: ollama-models
initContainers:
- name: warm-model
image: ollama/ollama@sha256:2c9595c555fd70a28363489ac03bd5bf9e7c5bdf2890373c3a830ffd7252ce6d
image: ollama/ollama:latest
env:
- name: OLLAMA_HOST
value: 0.0.0.0
@ -52,7 +53,7 @@ spec:
- name: OLLAMA_MODELS
value: /root/.ollama
- name: OLLAMA_MODEL
value: qwen2.5:14b-instruct-q4_0
value: qwen2.5-coder:7b-instruct-q4_0
command:
- /bin/sh
- -c
@ -67,14 +68,14 @@ spec:
mountPath: /root/.ollama
resources:
requests:
cpu: 500m
memory: 2Gi
cpu: 250m
memory: 1Gi
nvidia.com/gpu.shared: 1
limits:
nvidia.com/gpu.shared: 1
containers:
- name: ollama
image: ollama/ollama@sha256:2c9595c555fd70a28363489ac03bd5bf9e7c5bdf2890373c3a830ffd7252ce6d
image: ollama/ollama:latest
imagePullPolicy: IfNotPresent
ports:
- name: http
@ -95,10 +96,10 @@ spec:
mountPath: /root/.ollama
resources:
requests:
cpu: "4"
memory: 16Gi
cpu: "2"
memory: 8Gi
nvidia.com/gpu.shared: 1
limits:
cpu: "8"
memory: 24Gi
cpu: "4"
memory: 12Gi
nvidia.com/gpu.shared: 1

Some files were not shown because too many files have changed in this diff.