iac: externalize ConfigMap scripts

2026-01-13 09:59:39 -03:00 · 2026-01-13 09:59:39 -03:00 · 6da576a707
commit 6da576a707
parent 17b733c65e
47 changed files with 1971 additions and 2032 deletions
--- a/services/bstein-dev-home/chat-ai-gateway-configmap.yaml
+++ b/services/bstein-dev-home/chat-ai-gateway-configmap.yaml
@ -1,78 +0,0 @@
-# services/bstein-dev-home/chat-ai-gateway-configmap.yaml
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: chat-ai-gateway
-  namespace: bstein-dev-home
-data:
-  gateway.py: |
-    import json
-    import os
-    from http.server import BaseHTTPRequestHandler, HTTPServer
-    from urllib import request, error
-
-    UPSTREAM = os.environ.get("UPSTREAM_URL", "http://bstein-dev-home-backend/api/chat")
-    KEY_MATRIX = os.environ.get("CHAT_KEY_MATRIX", "")
-    KEY_HOMEPAGE = os.environ.get("CHAT_KEY_HOMEPAGE", "")
-
-    ALLOWED = {k for k in (KEY_MATRIX, KEY_HOMEPAGE) if k}
-
-    class Handler(BaseHTTPRequestHandler):
-        def _send_json(self, code: int, payload: dict):
-            body = json.dumps(payload).encode()
-            self.send_response(code)
-            self.send_header("Content-Type", "application/json")
-            self.send_header("Content-Length", str(len(body)))
-            self.end_headers()
-            self.wfile.write(body)
-
-        def do_GET(self):  # noqa: N802
-            if self.path in ("/healthz", "/"):
-                return self._send_json(200, {"ok": True})
-            return self._send_json(404, {"error": "not_found"})
-
-        def do_POST(self):  # noqa: N802
-            if self.path != "/":
-                return self._send_json(404, {"error": "not_found"})
-
-            key = self.headers.get("x-api-key", "")
-            if not key or key not in ALLOWED:
-                return self._send_json(401, {"error": "unauthorized"})
-
-            length = int(self.headers.get("content-length", "0") or "0")
-            raw = self.rfile.read(length) if length else b"{}"
-
-            try:
-                upstream_req = request.Request(
-                    UPSTREAM,
-                    data=raw,
-                    headers={"Content-Type": "application/json"},
-                    method="POST",
-                )
-                with request.urlopen(upstream_req, timeout=90) as resp:
-                    data = resp.read()
-                    self.send_response(resp.status)
-                    for k, v in resp.headers.items():
-                        if k.lower() in ("content-length", "connection", "server", "date"):
-                            continue
-                        self.send_header(k, v)
-                    self.send_header("Content-Length", str(len(data)))
-                    self.end_headers()
-                    self.wfile.write(data)
-            except error.HTTPError as e:
-                data = e.read() if hasattr(e, "read") else b""
-                self.send_response(e.code)
-                self.send_header("Content-Type", "application/json")
-                self.send_header("Content-Length", str(len(data)))
-                self.end_headers()
-                self.wfile.write(data)
-            except Exception:
-                return self._send_json(502, {"error": "bad_gateway"})
-
-    def main():
-        port = int(os.environ.get("PORT", "8080"))
-        httpd = HTTPServer(("0.0.0.0", port), Handler)
-        httpd.serve_forever()
-
-    if __name__ == "__main__":
-        main()
--- a/services/bstein-dev-home/kustomization.yaml
+++ b/services/bstein-dev-home/kustomization.yaml
@ -7,7 +7,6 @@ resources:
  - image.yaml
  - rbac.yaml
  - portal-e2e-client-secret-sync-rbac.yaml
-  - chat-ai-gateway-configmap.yaml
  - chat-ai-gateway-deployment.yaml
  - chat-ai-gateway-service.yaml
  - frontend-deployment.yaml
@ -19,15 +18,21 @@ resources:
  - ingress.yaml

 configMapGenerator:
+  - name: chat-ai-gateway
+    namespace: bstein-dev-home
+    files:
+      - gateway.py=scripts/gateway.py
+    options:
+      disableNameSuffixHash: true
  - name: vaultwarden-cred-sync-script
    namespace: bstein-dev-home
    files:
-      - vaultwarden_cred_sync.py=../../scripts/vaultwarden_cred_sync.py
+      - vaultwarden_cred_sync.py=scripts/vaultwarden_cred_sync.py
    options:
      disableNameSuffixHash: true
  - name: portal-onboarding-e2e-tests
    namespace: bstein-dev-home
    files:
-      - test_portal_onboarding_flow.py=../../scripts/tests/test_portal_onboarding_flow.py
+      - test_portal_onboarding_flow.py=scripts/test_portal_onboarding_flow.py
    options:
      disableNameSuffixHash: true
--- a/services/bstein-dev-home/scripts/gateway.py
+++ b/services/bstein-dev-home/scripts/gateway.py
@ -0,0 +1,70 @@
+import json
+import os
+from http.server import BaseHTTPRequestHandler, HTTPServer
+from urllib import request, error
+
+UPSTREAM = os.environ.get("UPSTREAM_URL", "http://bstein-dev-home-backend/api/chat")
+KEY_MATRIX = os.environ.get("CHAT_KEY_MATRIX", "")
+KEY_HOMEPAGE = os.environ.get("CHAT_KEY_HOMEPAGE", "")
+
+ALLOWED = {k for k in (KEY_MATRIX, KEY_HOMEPAGE) if k}
+
+class Handler(BaseHTTPRequestHandler):
+    def _send_json(self, code: int, payload: dict):
+        body = json.dumps(payload).encode()
+        self.send_response(code)
+        self.send_header("Content-Type", "application/json")
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        self.wfile.write(body)
+
+    def do_GET(self):  # noqa: N802
+        if self.path in ("/healthz", "/"):
+            return self._send_json(200, {"ok": True})
+        return self._send_json(404, {"error": "not_found"})
+
+    def do_POST(self):  # noqa: N802
+        if self.path != "/":
+            return self._send_json(404, {"error": "not_found"})
+
+        key = self.headers.get("x-api-key", "")
+        if not key or key not in ALLOWED:
+            return self._send_json(401, {"error": "unauthorized"})
+
+        length = int(self.headers.get("content-length", "0") or "0")
+        raw = self.rfile.read(length) if length else b"{}"
+
+        try:
+            upstream_req = request.Request(
+                UPSTREAM,
+                data=raw,
+                headers={"Content-Type": "application/json"},
+                method="POST",
+            )
+            with request.urlopen(upstream_req, timeout=90) as resp:
+                data = resp.read()
+                self.send_response(resp.status)
+                for k, v in resp.headers.items():
+                    if k.lower() in ("content-length", "connection", "server", "date"):
+                        continue
+                    self.send_header(k, v)
+                self.send_header("Content-Length", str(len(data)))
+                self.end_headers()
+                self.wfile.write(data)
+        except error.HTTPError as e:
+            data = e.read() if hasattr(e, "read") else b""
+            self.send_response(e.code)
+            self.send_header("Content-Type", "application/json")
+            self.send_header("Content-Length", str(len(data)))
+            self.end_headers()
+            self.wfile.write(data)
+        except Exception:
+            return self._send_json(502, {"error": "bad_gateway"})
+
+def main():
+    port = int(os.environ.get("PORT", "8080"))
+    httpd = HTTPServer(("0.0.0.0", port), Handler)
+    httpd.serve_forever()
+
+if __name__ == "__main__":
+    main()
--- a/services/bstein-dev-home/scripts/test_portal_onboarding_flow.py
+++ b/services/bstein-dev-home/scripts/test_portal_onboarding_flow.py
--- a/services/bstein-dev-home/scripts/vaultwarden_cred_sync.py
+++ b/services/bstein-dev-home/scripts/vaultwarden_cred_sync.py
--- a/services/comms/atlasbot-configmap.yaml
+++ b/services/comms/atlasbot-configmap.yaml
@ -1,629 +0,0 @@
-# services/comms/atlasbot-configmap.yaml
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: atlasbot
-data:
-  bot.py: |
-    import collections
-    import json
-    import os
-    import re
-    import ssl
-    import time
-    from typing import Any
-    from urllib import error, parse, request
-
-    BASE = os.environ.get("MATRIX_BASE", "http://othrys-synapse-matrix-synapse:8008")
-    AUTH_BASE = os.environ.get("AUTH_BASE", "http://matrix-authentication-service:8080")
-    USER = os.environ["BOT_USER"]
-    PASSWORD = os.environ["BOT_PASS"]
-    ROOM_ALIAS = "#othrys:live.bstein.dev"
-
-    OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/")
-    MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b-instruct-q4_0")
-    API_KEY = os.environ.get("CHAT_API_KEY", "")
-
-    KB_DIR = os.environ.get("KB_DIR", "")
-    VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428")
-
-    BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{USER},atlas")
-    SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev")
-
-    MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500"))
-    MAX_TOOL_CHARS = int(os.environ.get("ATLASBOT_MAX_TOOL_CHARS", "2500"))
-
-    TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9_.-]{1,}", re.IGNORECASE)
-    HOST_RE = re.compile(r"(?i)([a-z0-9-]+(?:\\.[a-z0-9-]+)+)")
-    STOPWORDS = {
-        "the",
-        "and",
-        "for",
-        "with",
-        "this",
-        "that",
-        "from",
-        "into",
-        "what",
-        "how",
-        "why",
-        "when",
-        "where",
-        "which",
-        "who",
-        "can",
-        "could",
-        "should",
-        "would",
-        "please",
-        "help",
-        "atlas",
-        "othrys",
-    }
-
-    METRIC_HINT_WORDS = {
-        "health",
-        "status",
-        "down",
-        "slow",
-        "error",
-        "unknown_error",
-        "timeout",
-        "crash",
-        "crashloop",
-        "restart",
-        "restarts",
-        "pending",
-        "unreachable",
-        "latency",
-    }
-
-    def _tokens(text: str) -> list[str]:
-        toks = [t.lower() for t in TOKEN_RE.findall(text or "")]
-        return [t for t in toks if t not in STOPWORDS and len(t) >= 2]
-
-
-    # Mention detection (Matrix rich mentions + plain @atlas).
-    MENTION_TOKENS = [m.strip() for m in BOT_MENTIONS.split(",") if m.strip()]
-    MENTION_LOCALPARTS = [m.lstrip("@").split(":", 1)[0] for m in MENTION_TOKENS]
-    MENTION_RE = re.compile(
-        r"(?<!\\w)@(?:" + "|".join(re.escape(m) for m in MENTION_LOCALPARTS) + r")(?:\\:[^\\s]+)?(?!\\w)",
-        re.IGNORECASE,
-    )
-
-    def normalize_user_id(token: str) -> str:
-        t = token.strip()
-        if not t:
-            return ""
-        if t.startswith("@") and ":" in t:
-            return t
-        t = t.lstrip("@")
-        if ":" in t:
-            return f"@{t}"
-        return f"@{t}:{SERVER_NAME}"
-
-    MENTION_USER_IDS = {normalize_user_id(t).lower() for t in MENTION_TOKENS if normalize_user_id(t)}
-
-    def is_mentioned(content: dict, body: str) -> bool:
-        if MENTION_RE.search(body or "") is not None:
-            return True
-        mentions = content.get("m.mentions", {})
-        user_ids = mentions.get("user_ids", [])
-        if not isinstance(user_ids, list):
-            return False
-        return any(isinstance(uid, str) and uid.lower() in MENTION_USER_IDS for uid in user_ids)
-
-
-    # Matrix HTTP helper.
-    def req(method: str, path: str, token: str | None = None, body=None, timeout=60, base: str | None = None):
-        url = (base or BASE) + path
-        data = None
-        headers = {}
-        if body is not None:
-            data = json.dumps(body).encode()
-            headers["Content-Type"] = "application/json"
-        if token:
-            headers["Authorization"] = f"Bearer {token}"
-        r = request.Request(url, data=data, headers=headers, method=method)
-        with request.urlopen(r, timeout=timeout) as resp:
-            raw = resp.read()
-            return json.loads(raw.decode()) if raw else {}
-
-    def login() -> str:
-        login_user = normalize_user_id(USER)
-        payload = {
-            "type": "m.login.password",
-            "identifier": {"type": "m.id.user", "user": login_user},
-            "password": PASSWORD,
-        }
-        res = req("POST", "/_matrix/client/v3/login", body=payload, base=AUTH_BASE)
-        return res["access_token"]
-
-    def resolve_alias(token: str, alias: str) -> str:
-        enc = parse.quote(alias)
-        res = req("GET", f"/_matrix/client/v3/directory/room/{enc}", token)
-        return res["room_id"]
-
-    def join_room(token: str, room: str):
-        req("POST", f"/_matrix/client/v3/rooms/{parse.quote(room)}/join", token, body={})
-
-    def send_msg(token: str, room: str, text: str):
-        path = f"/_matrix/client/v3/rooms/{parse.quote(room)}/send/m.room.message"
-        req("POST", path, token, body={"msgtype": "m.text", "body": text})
-
-
-    # Atlas KB loader (no external deps; files are pre-rendered JSON via scripts/knowledge_render_atlas.py).
-    KB = {"catalog": {}, "runbooks": []}
-    _HOST_INDEX: dict[str, list[dict]] = {}
-    _NAME_INDEX: set[str] = set()
-
-    def _load_json_file(path: str) -> Any | None:
-        try:
-            with open(path, "rb") as f:
-                return json.loads(f.read().decode("utf-8"))
-        except Exception:
-            return None
-
-    def load_kb():
-        global KB, _HOST_INDEX, _NAME_INDEX
-        if not KB_DIR:
-            return
-        catalog = _load_json_file(os.path.join(KB_DIR, "catalog", "atlas.json")) or {}
-        runbooks = _load_json_file(os.path.join(KB_DIR, "catalog", "runbooks.json")) or []
-        KB = {"catalog": catalog, "runbooks": runbooks}
-
-        host_index: dict[str, list[dict]] = collections.defaultdict(list)
-        for ep in catalog.get("http_endpoints", []) if isinstance(catalog, dict) else []:
-            host = (ep.get("host") or "").lower()
-            if host:
-                host_index[host].append(ep)
-        _HOST_INDEX = {k: host_index[k] for k in sorted(host_index.keys())}
-
-        names: set[str] = set()
-        for s in catalog.get("services", []) if isinstance(catalog, dict) else []:
-            if isinstance(s, dict) and s.get("name"):
-                names.add(str(s["name"]).lower())
-        for w in catalog.get("workloads", []) if isinstance(catalog, dict) else []:
-            if isinstance(w, dict) and w.get("name"):
-                names.add(str(w["name"]).lower())
-        _NAME_INDEX = names
-
-    def kb_retrieve(query: str, *, limit: int = 3) -> str:
-        q = (query or "").strip()
-        if not q or not KB.get("runbooks"):
-            return ""
-        ql = q.lower()
-        q_tokens = _tokens(q)
-        if not q_tokens:
-            return ""
-
-        scored: list[tuple[int, dict]] = []
-        for doc in KB.get("runbooks", []):
-            if not isinstance(doc, dict):
-                continue
-            title = str(doc.get("title") or "")
-            body = str(doc.get("body") or "")
-            tags = doc.get("tags") or []
-            entrypoints = doc.get("entrypoints") or []
-            hay = (title + "\n" + " ".join(tags) + "\n" + " ".join(entrypoints) + "\n" + body).lower()
-            score = 0
-            for t in set(q_tokens):
-                if t in hay:
-                    score += 3 if t in title.lower() else 1
-            for h in entrypoints:
-                if isinstance(h, str) and h.lower() in ql:
-                    score += 4
-            if score:
-                scored.append((score, doc))
-
-        scored.sort(key=lambda x: x[0], reverse=True)
-        picked = [d for _, d in scored[:limit]]
-        if not picked:
-            return ""
-
-        parts: list[str] = ["Atlas KB (retrieved):"]
-        used = 0
-        for d in picked:
-            path = d.get("path") or ""
-            title = d.get("title") or path
-            body = (d.get("body") or "").strip()
-            snippet = body[:900].strip()
-            chunk = f"- {title} ({path})\n{snippet}"
-            if used + len(chunk) > MAX_KB_CHARS:
-                break
-            parts.append(chunk)
-            used += len(chunk)
-        return "\n".join(parts).strip()
-
-    def catalog_hints(query: str) -> tuple[str, list[tuple[str, str]]]:
-        q = (query or "").strip()
-        if not q or not KB.get("catalog"):
-            return "", []
-        ql = q.lower()
-        hosts = {m.group(1).lower() for m in HOST_RE.finditer(ql) if m.group(1).lower().endswith("bstein.dev")}
-
-        # Also match by known workload/service names.
-        for t in _tokens(ql):
-            if t in _NAME_INDEX:
-                hosts |= {ep["host"].lower() for ep in KB["catalog"].get("http_endpoints", []) if isinstance(ep, dict) and ep.get("backend", {}).get("service") == t}
-
-        edges: list[tuple[str, str]] = []
-        lines: list[str] = []
-        for host in sorted(hosts):
-            for ep in _HOST_INDEX.get(host, []):
-                backend = ep.get("backend") or {}
-                ns = backend.get("namespace") or ""
-                svc = backend.get("service") or ""
-                path = ep.get("path") or "/"
-                if not svc:
-                    continue
-                wk = backend.get("workloads") or []
-                wk_str = ", ".join(f"{w.get('kind')}:{w.get('name')}" for w in wk if isinstance(w, dict) and w.get("name")) or "unknown"
-                lines.append(f"- {host}{path} → {ns}/{svc} → {wk_str}")
-                for w in wk:
-                    if isinstance(w, dict) and w.get("name"):
-                        edges.append((ns, str(w["name"])))
-        if not lines:
-            return "", []
-        return "Atlas endpoints (from GitOps):\n" + "\n".join(lines[:20]), edges
-
-
-    # Kubernetes API (read-only). RBAC is provided via ServiceAccount atlasbot.
-    _K8S_TOKEN: str | None = None
-    _K8S_CTX: ssl.SSLContext | None = None
-
-    def _k8s_context() -> ssl.SSLContext:
-        global _K8S_CTX
-        if _K8S_CTX is not None:
-            return _K8S_CTX
-        ca_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
-        ctx = ssl.create_default_context(cafile=ca_path)
-        _K8S_CTX = ctx
-        return ctx
-
-    def _k8s_token() -> str:
-        global _K8S_TOKEN
-        if _K8S_TOKEN:
-            return _K8S_TOKEN
-        token_path = "/var/run/secrets/kubernetes.io/serviceaccount/token"
-        with open(token_path, "r", encoding="utf-8") as f:
-            _K8S_TOKEN = f.read().strip()
-        return _K8S_TOKEN
-
-    def k8s_get(path: str, timeout: int = 8) -> dict:
-        host = os.environ.get("KUBERNETES_SERVICE_HOST")
-        port = os.environ.get("KUBERNETES_SERVICE_PORT_HTTPS") or os.environ.get("KUBERNETES_SERVICE_PORT") or "443"
-        if not host:
-            raise RuntimeError("k8s host missing")
-        url = f"https://{host}:{port}{path}"
-        headers = {"Authorization": f"Bearer {_k8s_token()}"}
-        r = request.Request(url, headers=headers, method="GET")
-        with request.urlopen(r, timeout=timeout, context=_k8s_context()) as resp:
-            raw = resp.read()
-            return json.loads(raw.decode()) if raw else {}
-
-    def k8s_pods(namespace: str) -> list[dict]:
-        data = k8s_get(f"/api/v1/namespaces/{parse.quote(namespace)}/pods?limit=500")
-        items = data.get("items") or []
-        return items if isinstance(items, list) else []
-
-    def summarize_pods(namespace: str, prefixes: set[str] | None = None) -> str:
-        try:
-            pods = k8s_pods(namespace)
-        except Exception:
-            return ""
-        out: list[str] = []
-        for p in pods:
-            md = p.get("metadata") or {}
-            st = p.get("status") or {}
-            name = md.get("name") or ""
-            if prefixes and not any(name.startswith(pref + "-") or name == pref or name.startswith(pref) for pref in prefixes):
-                continue
-            phase = st.get("phase") or "?"
-            cs = st.get("containerStatuses") or []
-            restarts = 0
-            ready = 0
-            total = 0
-            reason = st.get("reason") or ""
-            for c in cs if isinstance(cs, list) else []:
-                if not isinstance(c, dict):
-                    continue
-                total += 1
-                restarts += int(c.get("restartCount") or 0)
-                if c.get("ready"):
-                    ready += 1
-                state = c.get("state") or {}
-                if not reason and isinstance(state, dict):
-                    waiting = state.get("waiting") or {}
-                    if isinstance(waiting, dict) and waiting.get("reason"):
-                        reason = waiting.get("reason")
-            extra = f" ({reason})" if reason else ""
-            out.append(f"- {namespace}/{name}: {phase} {ready}/{total} restarts={restarts}{extra}")
-        return "\n".join(out[:20])
-
-    def flux_not_ready() -> str:
-        try:
-            data = k8s_get(
-                "/apis/kustomize.toolkit.fluxcd.io/v1/namespaces/flux-system/kustomizations?limit=200"
-            )
-        except Exception:
-            return ""
-        items = data.get("items") or []
-        bad: list[str] = []
-        for it in items if isinstance(items, list) else []:
-            md = it.get("metadata") or {}
-            st = it.get("status") or {}
-            name = md.get("name") or ""
-            conds = st.get("conditions") or []
-            ready = None
-            msg = ""
-            for c in conds if isinstance(conds, list) else []:
-                if isinstance(c, dict) and c.get("type") == "Ready":
-                    ready = c.get("status")
-                    msg = c.get("message") or ""
-            if ready not in ("True", True):
-                bad.append(f"- flux kustomization/{name}: Ready={ready} {msg}".strip())
-        return "\n".join(bad[:10])
-
-
-    # VictoriaMetrics (PromQL) helpers.
-    def vm_query(query: str, timeout: int = 8) -> dict | None:
-        try:
-            url = VM_URL.rstrip("/") + "/api/v1/query?" + parse.urlencode({"query": query})
-            with request.urlopen(url, timeout=timeout) as resp:
-                return json.loads(resp.read().decode())
-        except Exception:
-            return None
-
-    def _vm_value_series(res: dict) -> list[dict]:
-        if not res or (res.get("status") != "success"):
-            return []
-        data = res.get("data") or {}
-        result = data.get("result") or []
-        return result if isinstance(result, list) else []
-
-    def vm_render_result(res: dict | None, limit: int = 12) -> str:
-        if not res:
-            return ""
-        series = _vm_value_series(res)
-        if not series:
-            return ""
-        out: list[str] = []
-        for r in series[:limit]:
-            if not isinstance(r, dict):
-                continue
-            metric = r.get("metric") or {}
-            value = r.get("value") or []
-            val = value[1] if isinstance(value, list) and len(value) > 1 else ""
-            # Prefer common labels if present.
-            label_parts = []
-            for k in ("namespace", "pod", "container", "node", "instance", "job", "phase"):
-                if isinstance(metric, dict) and metric.get(k):
-                    label_parts.append(f"{k}={metric.get(k)}")
-            if not label_parts and isinstance(metric, dict):
-                for k in sorted(metric.keys()):
-                    if k.startswith("__"):
-                        continue
-                    label_parts.append(f"{k}={metric.get(k)}")
-                    if len(label_parts) >= 4:
-                        break
-            labels = ", ".join(label_parts) if label_parts else "series"
-            out.append(f"- {labels}: {val}")
-        return "\n".join(out)
-
-    def vm_top_restarts(hours: int = 1) -> str:
-        q = f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{hours}h])))"
-        res = vm_query(q)
-        if not res or (res.get("status") != "success"):
-            return ""
-        out: list[str] = []
-        for r in (res.get("data") or {}).get("result") or []:
-            if not isinstance(r, dict):
-                continue
-            m = r.get("metric") or {}
-            v = r.get("value") or []
-            ns = (m.get("namespace") or "").strip()
-            pod = (m.get("pod") or "").strip()
-            val = v[1] if isinstance(v, list) and len(v) > 1 else ""
-            if pod:
-                out.append(f"- restarts({hours}h): {ns}/{pod} = {val}")
-        return "\n".join(out)
-
-    def vm_cluster_snapshot() -> str:
-        parts: list[str] = []
-        # Node readiness (kube-state-metrics).
-        ready = vm_query('sum(kube_node_status_condition{condition="Ready",status="true"})')
-        not_ready = vm_query('sum(kube_node_status_condition{condition="Ready",status="false"})')
-        if ready and not_ready:
-            try:
-                r = _vm_value_series(ready)[0]["value"][1]
-                nr = _vm_value_series(not_ready)[0]["value"][1]
-                parts.append(f"- nodes ready: {r} (not ready: {nr})")
-            except Exception:
-                pass
-
-        phases = vm_query("sum by (phase) (kube_pod_status_phase)")
-        pr = vm_render_result(phases, limit=8)
-        if pr:
-            parts.append("Pod phases:")
-            parts.append(pr)
-        return "\n".join(parts).strip()
-
-
-    # Conversation state.
-    history = collections.defaultdict(list)  # (room_id, sender|None) -> list[str] (short transcript)
-
-    def key_for(room_id: str, sender: str, is_dm: bool):
-        return (room_id, None) if is_dm else (room_id, sender)
-
-    def build_context(prompt: str, *, allow_tools: bool, targets: list[tuple[str, str]]) -> str:
-        parts: list[str] = []
-
-        kb = kb_retrieve(prompt)
-        if kb:
-            parts.append(kb)
-
-        endpoints, edges = catalog_hints(prompt)
-        if endpoints:
-            parts.append(endpoints)
-
-        if allow_tools:
-            # Scope pod summaries to relevant namespaces/workloads when possible.
-            prefixes_by_ns: dict[str, set[str]] = collections.defaultdict(set)
-            for ns, name in (targets or []) + (edges or []):
-                if ns and name:
-                    prefixes_by_ns[ns].add(name)
-            pod_lines: list[str] = []
-            for ns in sorted(prefixes_by_ns.keys()):
-                summary = summarize_pods(ns, prefixes_by_ns[ns])
-                if summary:
-                    pod_lines.append(f"Pods (live):\n{summary}")
-            if pod_lines:
-                parts.append("\n".join(pod_lines)[:MAX_TOOL_CHARS])
-
-            flux_bad = flux_not_ready()
-            if flux_bad:
-                parts.append("Flux (not ready):\n" + flux_bad)
-
-            p_l = (prompt or "").lower()
-            if any(w in p_l for w in METRIC_HINT_WORDS):
-                restarts = vm_top_restarts(1)
-                if restarts:
-                    parts.append("VictoriaMetrics (top restarts 1h):\n" + restarts)
-                snap = vm_cluster_snapshot()
-                if snap:
-                    parts.append("VictoriaMetrics (cluster snapshot):\n" + snap)
-
-        return "\n\n".join([p for p in parts if p]).strip()
-
-    def ollama_reply(hist_key, prompt: str, *, context: str) -> str:
-        try:
-            system = (
-                "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. "
-                "Be helpful, direct, and concise. "
-                "Prefer answering with exact repo paths and Kubernetes resource names. "
-                "Never include or request secret values."
-            )
-            transcript_parts = [system]
-            if context:
-                transcript_parts.append("Context (grounded):\n" + context[:MAX_KB_CHARS])
-            transcript_parts.extend(history[hist_key][-24:])
-            transcript_parts.append(f"User: {prompt}")
-            transcript = "\n".join(transcript_parts)
-
-            payload = {"model": MODEL, "message": transcript}
-            headers = {"Content-Type": "application/json"}
-            if API_KEY:
-                headers["x-api-key"] = API_KEY
-            r = request.Request(OLLAMA_URL, data=json.dumps(payload).encode(), headers=headers)
-            with request.urlopen(r, timeout=20) as resp:
-                data = json.loads(resp.read().decode())
-                reply = data.get("message") or data.get("response") or data.get("reply") or "I'm here to help."
-                history[hist_key].append(f"Atlas: {reply}")
-                return reply
-        except Exception:
-            return "I’m here — but I couldn’t reach the model backend."
-
-    def sync_loop(token: str, room_id: str):
-        since = None
-        try:
-            res = req("GET", "/_matrix/client/v3/sync?timeout=0", token, timeout=10)
-            since = res.get("next_batch")
-        except Exception:
-            pass
-
-        while True:
-            params = {"timeout": 30000}
-            if since:
-                params["since"] = since
-            query = parse.urlencode(params)
-            try:
-                res = req("GET", f"/_matrix/client/v3/sync?{query}", token, timeout=35)
-            except Exception:
-                time.sleep(5)
-                continue
-            since = res.get("next_batch", since)
-
-            # invites
-            for rid, data in res.get("rooms", {}).get("invite", {}).items():
-                try:
-                    join_room(token, rid)
-                except Exception:
-                    pass
-
-            # messages
-            for rid, data in res.get("rooms", {}).get("join", {}).items():
-                timeline = data.get("timeline", {}).get("events", [])
-                joined_count = data.get("summary", {}).get("m.joined_member_count")
-                is_dm = joined_count is not None and joined_count <= 2
-
-                for ev in timeline:
-                    if ev.get("type") != "m.room.message":
-                        continue
-                    content = ev.get("content", {})
-                    body = (content.get("body", "") or "").strip()
-                    if not body:
-                        continue
-                    sender = ev.get("sender", "")
-                    if sender == f"@{USER}:live.bstein.dev":
-                        continue
-
-                    mentioned = is_mentioned(content, body)
-                    hist_key = key_for(rid, sender, is_dm)
-                    history[hist_key].append(f"{sender}: {body}")
-                    history[hist_key] = history[hist_key][-80:]
-
-                    if not (is_dm or mentioned):
-                        continue
-
-                    # Only do live cluster/metrics introspection in DMs.
-                    allow_tools = is_dm
-
-                    promql = ""
-                    if allow_tools:
-                        m = re.match(r"(?is)^\\s*promql\\s*(?:\\:|\\s)\\s*(.+?)\\s*$", body)
-                        if m:
-                            promql = m.group(1).strip()
-
-                    # Attempt to scope tools to the most likely workloads when hostnames are mentioned.
-                    targets: list[tuple[str, str]] = []
-                    for m in HOST_RE.finditer(body.lower()):
-                        host = m.group(1).lower()
-                        for ep in _HOST_INDEX.get(host, []):
-                            backend = ep.get("backend") or {}
-                            ns = backend.get("namespace") or ""
-                            for w in backend.get("workloads") or []:
-                                if isinstance(w, dict) and w.get("name"):
-                                    targets.append((ns, str(w["name"])))
-
-                    context = build_context(body, allow_tools=allow_tools, targets=targets)
-                    if allow_tools and promql:
-                        res = vm_query(promql, timeout=20)
-                        rendered = vm_render_result(res, limit=15) or "(no results)"
-                        extra = "VictoriaMetrics (PromQL result):\n" + rendered
-                        context = (context + "\n\n" + extra).strip() if context else extra
-                    reply = ollama_reply(hist_key, body, context=context)
-                    send_msg(token, rid, reply)
-
-    def login_with_retry():
-        last_err = None
-        for attempt in range(10):
-            try:
-                return login()
-            except Exception as exc:  # noqa: BLE001
-                last_err = exc
-                time.sleep(min(30, 2 ** attempt))
-        raise last_err
-
-    def main():
-        load_kb()
-        token = login_with_retry()
-        try:
-            room_id = resolve_alias(token, ROOM_ALIAS)
-            join_room(token, room_id)
-        except Exception:
-            room_id = None
-        sync_loop(token, room_id)
-
-    if __name__ == "__main__":
-        main()
--- a/services/comms/guest-register-configmap.yaml
+++ b/services/comms/guest-register-configmap.yaml
@ -1,271 +0,0 @@
-# services/comms/guest-register-configmap.yaml
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: matrix-guest-register
-data:
-  server.py: |
-    import base64
-    import json
-    import os
-    import random
-    import secrets
-    from http.server import BaseHTTPRequestHandler, HTTPServer
-    from urllib import error, parse, request
-
-    MAS_BASE = os.environ.get("MAS_BASE", "http://matrix-authentication-service:8080").rstrip("/")
-    MAS_ADMIN_API_BASE = os.environ.get("MAS_ADMIN_API_BASE", "http://matrix-authentication-service:8081/api/admin/v1").rstrip("/")
-    SYNAPSE_BASE = os.environ.get("SYNAPSE_BASE", "http://othrys-synapse-matrix-synapse:8008").rstrip("/")
-    SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev")
-
-    MAS_ADMIN_CLIENT_ID = os.environ["MAS_ADMIN_CLIENT_ID"]
-    MAS_ADMIN_CLIENT_SECRET_FILE = os.environ.get("MAS_ADMIN_CLIENT_SECRET_FILE", "/etc/mas/admin-client/client_secret")
-    MAS_ADMIN_SCOPE = os.environ.get("MAS_ADMIN_SCOPE", "urn:mas:admin")
-    RATE_WINDOW_SEC = int(os.environ.get("RATE_WINDOW_SEC", "60"))
-    RATE_MAX = int(os.environ.get("RATE_MAX", "30"))
-    _rate = {}  # ip -> [window_start, count]
-
-    ADJ = [
-        "brisk","calm","eager","gentle","merry","nifty","rapid","sunny","witty","zesty",
-        "amber","bold","bright","crisp","daring","frosty","glad","jolly","lively","mellow",
-        "quiet","ripe","serene","spry","tidy","vivid","warm","wild","clever","kind",
-    ]
-    NOUN = [
-        "otter","falcon","comet","ember","grove","harbor","meadow","raven","river","summit",
-        "breeze","cedar","cinder","cove","delta","forest","glade","lark","marsh","peak",
-        "pine","quartz","reef","ridge","sable","sage","shore","thunder","vale","zephyr",
-    ]
-
-    def _json(method, url, *, headers=None, body=None, timeout=20):
-        hdrs = {"Content-Type": "application/json"}
-        if headers:
-            hdrs.update(headers)
-        data = None
-        if body is not None:
-            data = json.dumps(body).encode()
-        req = request.Request(url, data=data, headers=hdrs, method=method)
-        try:
-            with request.urlopen(req, timeout=timeout) as resp:
-                raw = resp.read()
-                payload = json.loads(raw.decode()) if raw else {}
-                return resp.status, payload
-        except error.HTTPError as e:
-            raw = e.read()
-            try:
-                payload = json.loads(raw.decode()) if raw else {}
-            except Exception:
-                payload = {}
-            return e.code, payload
-
-    def _form(method, url, *, headers=None, fields=None, timeout=20):
-        hdrs = {"Content-Type": "application/x-www-form-urlencoded"}
-        if headers:
-            hdrs.update(headers)
-        data = parse.urlencode(fields or {}).encode()
-        req = request.Request(url, data=data, headers=hdrs, method=method)
-        try:
-            with request.urlopen(req, timeout=timeout) as resp:
-                raw = resp.read()
-                payload = json.loads(raw.decode()) if raw else {}
-                return resp.status, payload
-        except error.HTTPError as e:
-            raw = e.read()
-            try:
-                payload = json.loads(raw.decode()) if raw else {}
-            except Exception:
-                payload = {}
-            return e.code, payload
-
-    _admin_token = None
-    _admin_token_at = 0.0
-
-    def _mas_admin_access_token(now):
-        global _admin_token, _admin_token_at
-        if _admin_token and (now - _admin_token_at) < 300:
-            return _admin_token
-
-        with open(MAS_ADMIN_CLIENT_SECRET_FILE, encoding="utf-8") as fh:
-            client_secret = fh.read().strip()
-        basic = base64.b64encode(f"{MAS_ADMIN_CLIENT_ID}:{client_secret}".encode()).decode()
-
-        status, payload = _form(
-            "POST",
-            f"{MAS_BASE}/oauth2/token",
-            headers={"Authorization": f"Basic {basic}"},
-            fields={"grant_type": "client_credentials", "scope": MAS_ADMIN_SCOPE},
-            timeout=20,
-        )
-        if status != 200 or "access_token" not in payload:
-            raise RuntimeError("mas_admin_token_failed")
-
-        _admin_token = payload["access_token"]
-        _admin_token_at = now
-        return _admin_token
-
-    def _generate_localpart():
-        return "guest-" + secrets.token_hex(6)
-
-    def _generate_displayname():
-        return f"{random.choice(ADJ)}-{random.choice(NOUN)}"
-
-    def _admin_api(admin_token, method, path, body=None):
-        return _json(
-            method,
-            f"{MAS_ADMIN_API_BASE}{path}",
-            headers={"Authorization": f"Bearer {admin_token}"},
-            body=body,
-            timeout=20,
-        )
-
-    def _create_user(admin_token, username):
-        status, payload = _admin_api(admin_token, "POST", "/users", {"username": username})
-        if status != 201:
-            return status, None
-        user = payload.get("data") or {}
-        return status, user.get("id")
-
-    def _set_password(admin_token, user_id, password):
-        status, _payload = _admin_api(
-            admin_token,
-            "POST",
-            f"/users/{parse.quote(user_id)}/set-password",
-            {"password": password},
-        )
-        return status in (200, 204)
-
-    def _login_password(username, password):
-        payload = {
-            "type": "m.login.password",
-            "identifier": {"type": "m.id.user", "user": f"@{username}:{SERVER_NAME}"},
-            "password": password,
-        }
-        status, data = _json(
-            "POST",
-            f"{MAS_BASE}/_matrix/client/v3/login",
-            body=payload,
-            timeout=20,
-        )
-        if status != 200:
-            return None, None
-        return data.get("access_token"), data.get("device_id")
-
-    def _set_display_name(access_token, user_id, displayname):
-        _json(
-            "PUT",
-            f"{SYNAPSE_BASE}/_matrix/client/v3/profile/{parse.quote(user_id, safe='')}/displayname",
-            headers={"Authorization": f"Bearer {access_token}"},
-            body={"displayname": displayname},
-            timeout=20,
-        )
-
-    def _rate_check(ip, now):
-        win, cnt = _rate.get(ip, (now, 0))
-        if now - win > RATE_WINDOW_SEC:
-            _rate[ip] = (now, 1)
-            return True
-        if cnt >= RATE_MAX:
-            return False
-        _rate[ip] = (win, cnt + 1)
-        return True
-
-    class Handler(BaseHTTPRequestHandler):
-        server_version = "matrix-guest-register"
-
-        def _send_json(self, code, payload):
-            body = json.dumps(payload).encode()
-            self.send_response(code)
-            self.send_header("Content-Type", "application/json")
-            self.send_header("Access-Control-Allow-Origin", "*")
-            self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
-            self.send_header("Access-Control-Allow-Headers", "Content-Type, Authorization, X-Requested-With")
-            self.send_header("Content-Length", str(len(body)))
-            self.end_headers()
-            self.wfile.write(body)
-
-        def do_OPTIONS(self):  # noqa: N802
-            self.send_response(204)
-            self.send_header("Access-Control-Allow-Origin", "*")
-            self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
-            self.send_header("Access-Control-Allow-Headers", "Content-Type, Authorization, X-Requested-With")
-            self.end_headers()
-
-        def do_GET(self):  # noqa: N802
-            parsed = parse.urlparse(self.path)
-            if parsed.path in ("/healthz", "/"):
-                return self._send_json(200, {"ok": True})
-            if parsed.path in ("/_matrix/client/v3/register", "/_matrix/client/r0/register"):
-                return self._send_json(200, {"flows": [{"stages": []}]})
-            return self._send_json(404, {"errcode": "M_NOT_FOUND", "error": "not_found"})
-
-        def do_POST(self):  # noqa: N802
-            parsed = parse.urlparse(self.path)
-            if parsed.path not in ("/_matrix/client/v3/register", "/_matrix/client/r0/register"):
-                return self._send_json(404, {"errcode": "M_NOT_FOUND", "error": "not_found"})
-
-            qs = parse.parse_qs(parsed.query)
-            kind = (qs.get("kind") or ["user"])[0]
-            if kind != "guest":
-                return self._send_json(
-                    403,
-                    {
-                        "errcode": "M_FORBIDDEN",
-                        "error": "Registration is disabled; use https://bstein.dev/request-access for accounts.",
-                    },
-                )
-
-            xfwd = self.headers.get("x-forwarded-for", "")
-            ip = (xfwd.split(",")[0].strip() if xfwd else "") or self.client_address[0]
-            now = __import__("time").time()
-            if not _rate_check(ip, now):
-                return self._send_json(429, {"errcode": "M_LIMIT_EXCEEDED", "error": "rate_limited"})
-
-            length = int(self.headers.get("content-length", "0") or "0")
-            raw = self.rfile.read(length) if length else b"{}"
-            try:
-                body = json.loads(raw.decode()) if raw else {}
-                if not isinstance(body, dict):
-                    body = {}
-            except Exception:
-                body = {}
-            try:
-                admin_token = _mas_admin_access_token(now)
-                displayname = _generate_displayname()
-
-                localpart = None
-                mas_user_id = None
-                for _ in range(5):
-                    localpart = _generate_localpart()
-                    status, mas_user_id = _create_user(admin_token, localpart)
-                    if status == 201 and mas_user_id:
-                        break
-                    mas_user_id = None
-                if not mas_user_id or not localpart:
-                    raise RuntimeError("add_user_failed")
-
-                password = secrets.token_urlsafe(18)
-                if not _set_password(admin_token, mas_user_id, password):
-                    raise RuntimeError("set_password_failed")
-                access_token, device_id = _login_password(localpart, password)
-                if not access_token:
-                    raise RuntimeError("login_failed")
-                try:
-                    _set_display_name(access_token, f"@{localpart}:{SERVER_NAME}", displayname)
-                except Exception:
-                    pass
-            except Exception:
-                return self._send_json(502, {"errcode": "M_UNKNOWN", "error": "guest_provision_failed"})
-
-            resp = {
-                "user_id": f"@{localpart}:{SERVER_NAME}",
-                "access_token": access_token,
-                "device_id": device_id or "guest_device",
-                "home_server": SERVER_NAME,
-            }
-            return self._send_json(200, resp)
-
-    def main():
-        port = int(os.environ.get("PORT", "8080"))
-        HTTPServer(("0.0.0.0", port), Handler).serve_forever()
-
-    if __name__ == "__main__":
-        main()
--- a/services/comms/kustomization.yaml
+++ b/services/comms/kustomization.yaml
@ -9,10 +9,8 @@ resources:
  - livekit-config.yaml
  - element-call-config.yaml
  - element-call-deployment.yaml
-  - guest-register-configmap.yaml
  - guest-register-deployment.yaml
  - guest-register-service.yaml
-  - atlasbot-configmap.yaml
  - atlasbot-deployment.yaml
  - wellknown.yaml
  - atlasbot-rbac.yaml
@ -45,6 +43,36 @@ patches:
  - path: synapse-deployment-strategy-patch.yaml

 configMapGenerator:
+  - name: matrix-guest-register
+    files:
+      - server.py=scripts/guest-register/server.py
+    options:
+      disableNameSuffixHash: true
+  - name: atlasbot
+    files:
+      - bot.py=scripts/atlasbot/bot.py
+    options:
+      disableNameSuffixHash: true
+  - name: othrys-synapse-redis-health
+    files:
+      - ping_readiness_local.sh=scripts/synapse/redis/ping_readiness_local.sh
+      - ping_liveness_local.sh=scripts/synapse/redis/ping_liveness_local.sh
+      - ping_readiness_master.sh=scripts/synapse/redis/ping_readiness_master.sh
+      - ping_liveness_master.sh=scripts/synapse/redis/ping_liveness_master.sh
+      - ping_readiness_local_and_master.sh=scripts/synapse/redis/ping_readiness_local_and_master.sh
+      - ping_liveness_local_and_master.sh=scripts/synapse/redis/ping_liveness_local_and_master.sh
+    options:
+      disableNameSuffixHash: true
+  - name: othrys-synapse-redis-scripts
+    files:
+      - start-master.sh=scripts/synapse/redis/start-master.sh
+    options:
+      disableNameSuffixHash: true
+  - name: othrys-synapse-matrix-synapse-scripts
+    files:
+      - signing-key.sh=scripts/synapse/signing-key.sh
+    options:
+      disableNameSuffixHash: true
  - name: atlas-kb
    files:
      - INDEX.md=knowledge/INDEX.md
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@ -0,0 +1,622 @@
+import collections
+import json
+import os
+import re
+import ssl
+import time
+from typing import Any
+from urllib import error, parse, request
+
+BASE = os.environ.get("MATRIX_BASE", "http://othrys-synapse-matrix-synapse:8008")
+AUTH_BASE = os.environ.get("AUTH_BASE", "http://matrix-authentication-service:8080")
+USER = os.environ["BOT_USER"]
+PASSWORD = os.environ["BOT_PASS"]
+ROOM_ALIAS = "#othrys:live.bstein.dev"
+
+OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/")
+MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b-instruct-q4_0")
+API_KEY = os.environ.get("CHAT_API_KEY", "")
+
+KB_DIR = os.environ.get("KB_DIR", "")
+VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428")
+
+BOT_MENTIONS = os.environ.get("BOT_MENTIONS", f"{USER},atlas")
+SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev")
+
+MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500"))
+MAX_TOOL_CHARS = int(os.environ.get("ATLASBOT_MAX_TOOL_CHARS", "2500"))
+
+TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9_.-]{1,}", re.IGNORECASE)
+HOST_RE = re.compile(r"(?i)([a-z0-9-]+(?:\\.[a-z0-9-]+)+)")
+STOPWORDS = {
+    "the",
+    "and",
+    "for",
+    "with",
+    "this",
+    "that",
+    "from",
+    "into",
+    "what",
+    "how",
+    "why",
+    "when",
+    "where",
+    "which",
+    "who",
+    "can",
+    "could",
+    "should",
+    "would",
+    "please",
+    "help",
+    "atlas",
+    "othrys",
+}
+
+METRIC_HINT_WORDS = {
+    "health",
+    "status",
+    "down",
+    "slow",
+    "error",
+    "unknown_error",
+    "timeout",
+    "crash",
+    "crashloop",
+    "restart",
+    "restarts",
+    "pending",
+    "unreachable",
+    "latency",
+}
+
+def _tokens(text: str) -> list[str]:
+    toks = [t.lower() for t in TOKEN_RE.findall(text or "")]
+    return [t for t in toks if t not in STOPWORDS and len(t) >= 2]
+
+
+# Mention detection (Matrix rich mentions + plain @atlas).
+MENTION_TOKENS = [m.strip() for m in BOT_MENTIONS.split(",") if m.strip()]
+MENTION_LOCALPARTS = [m.lstrip("@").split(":", 1)[0] for m in MENTION_TOKENS]
+MENTION_RE = re.compile(
+    r"(?<!\\w)@(?:" + "|".join(re.escape(m) for m in MENTION_LOCALPARTS) + r")(?:\\:[^\\s]+)?(?!\\w)",
+    re.IGNORECASE,
+)
+
+def normalize_user_id(token: str) -> str:
+    t = token.strip()
+    if not t:
+        return ""
+    if t.startswith("@") and ":" in t:
+        return t
+    t = t.lstrip("@")
+    if ":" in t:
+        return f"@{t}"
+    return f"@{t}:{SERVER_NAME}"
+
+MENTION_USER_IDS = {normalize_user_id(t).lower() for t in MENTION_TOKENS if normalize_user_id(t)}
+
+def is_mentioned(content: dict, body: str) -> bool:
+    if MENTION_RE.search(body or "") is not None:
+        return True
+    mentions = content.get("m.mentions", {})
+    user_ids = mentions.get("user_ids", [])
+    if not isinstance(user_ids, list):
+        return False
+    return any(isinstance(uid, str) and uid.lower() in MENTION_USER_IDS for uid in user_ids)
+
+
+# Matrix HTTP helper.
+def req(method: str, path: str, token: str | None = None, body=None, timeout=60, base: str | None = None):
+    url = (base or BASE) + path
+    data = None
+    headers = {}
+    if body is not None:
+        data = json.dumps(body).encode()
+        headers["Content-Type"] = "application/json"
+    if token:
+        headers["Authorization"] = f"Bearer {token}"
+    r = request.Request(url, data=data, headers=headers, method=method)
+    with request.urlopen(r, timeout=timeout) as resp:
+        raw = resp.read()
+        return json.loads(raw.decode()) if raw else {}
+
+def login() -> str:
+    login_user = normalize_user_id(USER)
+    payload = {
+        "type": "m.login.password",
+        "identifier": {"type": "m.id.user", "user": login_user},
+        "password": PASSWORD,
+    }
+    res = req("POST", "/_matrix/client/v3/login", body=payload, base=AUTH_BASE)
+    return res["access_token"]
+
+def resolve_alias(token: str, alias: str) -> str:
+    enc = parse.quote(alias)
+    res = req("GET", f"/_matrix/client/v3/directory/room/{enc}", token)
+    return res["room_id"]
+
+def join_room(token: str, room: str):
+    req("POST", f"/_matrix/client/v3/rooms/{parse.quote(room)}/join", token, body={})
+
+def send_msg(token: str, room: str, text: str):
+    path = f"/_matrix/client/v3/rooms/{parse.quote(room)}/send/m.room.message"
+    req("POST", path, token, body={"msgtype": "m.text", "body": text})
+
+
+# Atlas KB loader (no external deps; files are pre-rendered JSON via scripts/knowledge_render_atlas.py).
+KB = {"catalog": {}, "runbooks": []}
+_HOST_INDEX: dict[str, list[dict]] = {}
+_NAME_INDEX: set[str] = set()
+
+def _load_json_file(path: str) -> Any | None:
+    try:
+        with open(path, "rb") as f:
+            return json.loads(f.read().decode("utf-8"))
+    except Exception:
+        return None
+
+def load_kb():
+    global KB, _HOST_INDEX, _NAME_INDEX
+    if not KB_DIR:
+        return
+    catalog = _load_json_file(os.path.join(KB_DIR, "catalog", "atlas.json")) or {}
+    runbooks = _load_json_file(os.path.join(KB_DIR, "catalog", "runbooks.json")) or []
+    KB = {"catalog": catalog, "runbooks": runbooks}
+
+    host_index: dict[str, list[dict]] = collections.defaultdict(list)
+    for ep in catalog.get("http_endpoints", []) if isinstance(catalog, dict) else []:
+        host = (ep.get("host") or "").lower()
+        if host:
+            host_index[host].append(ep)
+    _HOST_INDEX = {k: host_index[k] for k in sorted(host_index.keys())}
+
+    names: set[str] = set()
+    for s in catalog.get("services", []) if isinstance(catalog, dict) else []:
+        if isinstance(s, dict) and s.get("name"):
+            names.add(str(s["name"]).lower())
+    for w in catalog.get("workloads", []) if isinstance(catalog, dict) else []:
+        if isinstance(w, dict) and w.get("name"):
+            names.add(str(w["name"]).lower())
+    _NAME_INDEX = names
+
+def kb_retrieve(query: str, *, limit: int = 3) -> str:
+    q = (query or "").strip()
+    if not q or not KB.get("runbooks"):
+        return ""
+    ql = q.lower()
+    q_tokens = _tokens(q)
+    if not q_tokens:
+        return ""
+
+    scored: list[tuple[int, dict]] = []
+    for doc in KB.get("runbooks", []):
+        if not isinstance(doc, dict):
+            continue
+        title = str(doc.get("title") or "")
+        body = str(doc.get("body") or "")
+        tags = doc.get("tags") or []
+        entrypoints = doc.get("entrypoints") or []
+        hay = (title + "\n" + " ".join(tags) + "\n" + " ".join(entrypoints) + "\n" + body).lower()
+        score = 0
+        for t in set(q_tokens):
+            if t in hay:
+                score += 3 if t in title.lower() else 1
+        for h in entrypoints:
+            if isinstance(h, str) and h.lower() in ql:
+                score += 4
+        if score:
+            scored.append((score, doc))
+
+    scored.sort(key=lambda x: x[0], reverse=True)
+    picked = [d for _, d in scored[:limit]]
+    if not picked:
+        return ""
+
+    parts: list[str] = ["Atlas KB (retrieved):"]
+    used = 0
+    for d in picked:
+        path = d.get("path") or ""
+        title = d.get("title") or path
+        body = (d.get("body") or "").strip()
+        snippet = body[:900].strip()
+        chunk = f"- {title} ({path})\n{snippet}"
+        if used + len(chunk) > MAX_KB_CHARS:
+            break
+        parts.append(chunk)
+        used += len(chunk)
+    return "\n".join(parts).strip()
+
+def catalog_hints(query: str) -> tuple[str, list[tuple[str, str]]]:
+    q = (query or "").strip()
+    if not q or not KB.get("catalog"):
+        return "", []
+    ql = q.lower()
+    hosts = {m.group(1).lower() for m in HOST_RE.finditer(ql) if m.group(1).lower().endswith("bstein.dev")}
+
+    # Also match by known workload/service names.
+    for t in _tokens(ql):
+        if t in _NAME_INDEX:
+            hosts |= {ep["host"].lower() for ep in KB["catalog"].get("http_endpoints", []) if isinstance(ep, dict) and ep.get("backend", {}).get("service") == t}
+
+    edges: list[tuple[str, str]] = []
+    lines: list[str] = []
+    for host in sorted(hosts):
+        for ep in _HOST_INDEX.get(host, []):
+            backend = ep.get("backend") or {}
+            ns = backend.get("namespace") or ""
+            svc = backend.get("service") or ""
+            path = ep.get("path") or "/"
+            if not svc:
+                continue
+            wk = backend.get("workloads") or []
+            wk_str = ", ".join(f"{w.get('kind')}:{w.get('name')}" for w in wk if isinstance(w, dict) and w.get("name")) or "unknown"
+            lines.append(f"- {host}{path} → {ns}/{svc} → {wk_str}")
+            for w in wk:
+                if isinstance(w, dict) and w.get("name"):
+                    edges.append((ns, str(w["name"])))
+    if not lines:
+        return "", []
+    return "Atlas endpoints (from GitOps):\n" + "\n".join(lines[:20]), edges
+
+
+# Kubernetes API (read-only). RBAC is provided via ServiceAccount atlasbot.
+_K8S_TOKEN: str | None = None
+_K8S_CTX: ssl.SSLContext | None = None
+
+def _k8s_context() -> ssl.SSLContext:
+    global _K8S_CTX
+    if _K8S_CTX is not None:
+        return _K8S_CTX
+    ca_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
+    ctx = ssl.create_default_context(cafile=ca_path)
+    _K8S_CTX = ctx
+    return ctx
+
+def _k8s_token() -> str:
+    global _K8S_TOKEN
+    if _K8S_TOKEN:
+        return _K8S_TOKEN
+    token_path = "/var/run/secrets/kubernetes.io/serviceaccount/token"
+    with open(token_path, "r", encoding="utf-8") as f:
+        _K8S_TOKEN = f.read().strip()
+    return _K8S_TOKEN
+
+def k8s_get(path: str, timeout: int = 8) -> dict:
+    host = os.environ.get("KUBERNETES_SERVICE_HOST")
+    port = os.environ.get("KUBERNETES_SERVICE_PORT_HTTPS") or os.environ.get("KUBERNETES_SERVICE_PORT") or "443"
+    if not host:
+        raise RuntimeError("k8s host missing")
+    url = f"https://{host}:{port}{path}"
+    headers = {"Authorization": f"Bearer {_k8s_token()}"}
+    r = request.Request(url, headers=headers, method="GET")
+    with request.urlopen(r, timeout=timeout, context=_k8s_context()) as resp:
+        raw = resp.read()
+        return json.loads(raw.decode()) if raw else {}
+
+def k8s_pods(namespace: str) -> list[dict]:
+    data = k8s_get(f"/api/v1/namespaces/{parse.quote(namespace)}/pods?limit=500")
+    items = data.get("items") or []
+    return items if isinstance(items, list) else []
+
+def summarize_pods(namespace: str, prefixes: set[str] | None = None) -> str:
+    try:
+        pods = k8s_pods(namespace)
+    except Exception:
+        return ""
+    out: list[str] = []
+    for p in pods:
+        md = p.get("metadata") or {}
+        st = p.get("status") or {}
+        name = md.get("name") or ""
+        if prefixes and not any(name.startswith(pref + "-") or name == pref or name.startswith(pref) for pref in prefixes):
+            continue
+        phase = st.get("phase") or "?"
+        cs = st.get("containerStatuses") or []
+        restarts = 0
+        ready = 0
+        total = 0
+        reason = st.get("reason") or ""
+        for c in cs if isinstance(cs, list) else []:
+            if not isinstance(c, dict):
+                continue
+            total += 1
+            restarts += int(c.get("restartCount") or 0)
+            if c.get("ready"):
+                ready += 1
+            state = c.get("state") or {}
+            if not reason and isinstance(state, dict):
+                waiting = state.get("waiting") or {}
+                if isinstance(waiting, dict) and waiting.get("reason"):
+                    reason = waiting.get("reason")
+        extra = f" ({reason})" if reason else ""
+        out.append(f"- {namespace}/{name}: {phase} {ready}/{total} restarts={restarts}{extra}")
+    return "\n".join(out[:20])
+
+def flux_not_ready() -> str:
+    try:
+        data = k8s_get(
+            "/apis/kustomize.toolkit.fluxcd.io/v1/namespaces/flux-system/kustomizations?limit=200"
+        )
+    except Exception:
+        return ""
+    items = data.get("items") or []
+    bad: list[str] = []
+    for it in items if isinstance(items, list) else []:
+        md = it.get("metadata") or {}
+        st = it.get("status") or {}
+        name = md.get("name") or ""
+        conds = st.get("conditions") or []
+        ready = None
+        msg = ""
+        for c in conds if isinstance(conds, list) else []:
+            if isinstance(c, dict) and c.get("type") == "Ready":
+                ready = c.get("status")
+                msg = c.get("message") or ""
+        if ready not in ("True", True):
+            bad.append(f"- flux kustomization/{name}: Ready={ready} {msg}".strip())
+    return "\n".join(bad[:10])
+
+
+# VictoriaMetrics (PromQL) helpers.
+def vm_query(query: str, timeout: int = 8) -> dict | None:
+    try:
+        url = VM_URL.rstrip("/") + "/api/v1/query?" + parse.urlencode({"query": query})
+        with request.urlopen(url, timeout=timeout) as resp:
+            return json.loads(resp.read().decode())
+    except Exception:
+        return None
+
+def _vm_value_series(res: dict) -> list[dict]:
+    if not res or (res.get("status") != "success"):
+        return []
+    data = res.get("data") or {}
+    result = data.get("result") or []
+    return result if isinstance(result, list) else []
+
+def vm_render_result(res: dict | None, limit: int = 12) -> str:
+    if not res:
+        return ""
+    series = _vm_value_series(res)
+    if not series:
+        return ""
+    out: list[str] = []
+    for r in series[:limit]:
+        if not isinstance(r, dict):
+            continue
+        metric = r.get("metric") or {}
+        value = r.get("value") or []
+        val = value[1] if isinstance(value, list) and len(value) > 1 else ""
+        # Prefer common labels if present.
+        label_parts = []
+        for k in ("namespace", "pod", "container", "node", "instance", "job", "phase"):
+            if isinstance(metric, dict) and metric.get(k):
+                label_parts.append(f"{k}={metric.get(k)}")
+        if not label_parts and isinstance(metric, dict):
+            for k in sorted(metric.keys()):
+                if k.startswith("__"):
+                    continue
+                label_parts.append(f"{k}={metric.get(k)}")
+                if len(label_parts) >= 4:
+                    break
+        labels = ", ".join(label_parts) if label_parts else "series"
+        out.append(f"- {labels}: {val}")
+    return "\n".join(out)
+
+def vm_top_restarts(hours: int = 1) -> str:
+    q = f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{hours}h])))"
+    res = vm_query(q)
+    if not res or (res.get("status") != "success"):
+        return ""
+    out: list[str] = []
+    for r in (res.get("data") or {}).get("result") or []:
+        if not isinstance(r, dict):
+            continue
+        m = r.get("metric") or {}
+        v = r.get("value") or []
+        ns = (m.get("namespace") or "").strip()
+        pod = (m.get("pod") or "").strip()
+        val = v[1] if isinstance(v, list) and len(v) > 1 else ""
+        if pod:
+            out.append(f"- restarts({hours}h): {ns}/{pod} = {val}")
+    return "\n".join(out)
+
+def vm_cluster_snapshot() -> str:
+    parts: list[str] = []
+    # Node readiness (kube-state-metrics).
+    ready = vm_query('sum(kube_node_status_condition{condition="Ready",status="true"})')
+    not_ready = vm_query('sum(kube_node_status_condition{condition="Ready",status="false"})')
+    if ready and not_ready:
+        try:
+            r = _vm_value_series(ready)[0]["value"][1]
+            nr = _vm_value_series(not_ready)[0]["value"][1]
+            parts.append(f"- nodes ready: {r} (not ready: {nr})")
+        except Exception:
+            pass
+
+    phases = vm_query("sum by (phase) (kube_pod_status_phase)")
+    pr = vm_render_result(phases, limit=8)
+    if pr:
+        parts.append("Pod phases:")
+        parts.append(pr)
+    return "\n".join(parts).strip()
+
+
+# Conversation state.
+history = collections.defaultdict(list)  # (room_id, sender|None) -> list[str] (short transcript)
+
+def key_for(room_id: str, sender: str, is_dm: bool):
+    return (room_id, None) if is_dm else (room_id, sender)
+
+def build_context(prompt: str, *, allow_tools: bool, targets: list[tuple[str, str]]) -> str:
+    parts: list[str] = []
+
+    kb = kb_retrieve(prompt)
+    if kb:
+        parts.append(kb)
+
+    endpoints, edges = catalog_hints(prompt)
+    if endpoints:
+        parts.append(endpoints)
+
+    if allow_tools:
+        # Scope pod summaries to relevant namespaces/workloads when possible.
+        prefixes_by_ns: dict[str, set[str]] = collections.defaultdict(set)
+        for ns, name in (targets or []) + (edges or []):
+            if ns and name:
+                prefixes_by_ns[ns].add(name)
+        pod_lines: list[str] = []
+        for ns in sorted(prefixes_by_ns.keys()):
+            summary = summarize_pods(ns, prefixes_by_ns[ns])
+            if summary:
+                pod_lines.append(f"Pods (live):\n{summary}")
+        if pod_lines:
+            parts.append("\n".join(pod_lines)[:MAX_TOOL_CHARS])
+
+        flux_bad = flux_not_ready()
+        if flux_bad:
+            parts.append("Flux (not ready):\n" + flux_bad)
+
+        p_l = (prompt or "").lower()
+        if any(w in p_l for w in METRIC_HINT_WORDS):
+            restarts = vm_top_restarts(1)
+            if restarts:
+                parts.append("VictoriaMetrics (top restarts 1h):\n" + restarts)
+            snap = vm_cluster_snapshot()
+            if snap:
+                parts.append("VictoriaMetrics (cluster snapshot):\n" + snap)
+
+    return "\n\n".join([p for p in parts if p]).strip()
+
+def ollama_reply(hist_key, prompt: str, *, context: str) -> str:
+    try:
+        system = (
+            "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. "
+            "Be helpful, direct, and concise. "
+            "Prefer answering with exact repo paths and Kubernetes resource names. "
+            "Never include or request secret values."
+        )
+        transcript_parts = [system]
+        if context:
+            transcript_parts.append("Context (grounded):\n" + context[:MAX_KB_CHARS])
+        transcript_parts.extend(history[hist_key][-24:])
+        transcript_parts.append(f"User: {prompt}")
+        transcript = "\n".join(transcript_parts)
+
+        payload = {"model": MODEL, "message": transcript}
+        headers = {"Content-Type": "application/json"}
+        if API_KEY:
+            headers["x-api-key"] = API_KEY
+        r = request.Request(OLLAMA_URL, data=json.dumps(payload).encode(), headers=headers)
+        with request.urlopen(r, timeout=20) as resp:
+            data = json.loads(resp.read().decode())
+            reply = data.get("message") or data.get("response") or data.get("reply") or "I'm here to help."
+            history[hist_key].append(f"Atlas: {reply}")
+            return reply
+    except Exception:
+        return "I’m here — but I couldn’t reach the model backend."
+
+def sync_loop(token: str, room_id: str):
+    since = None
+    try:
+        res = req("GET", "/_matrix/client/v3/sync?timeout=0", token, timeout=10)
+        since = res.get("next_batch")
+    except Exception:
+        pass
+
+    while True:
+        params = {"timeout": 30000}
+        if since:
+            params["since"] = since
+        query = parse.urlencode(params)
+        try:
+            res = req("GET", f"/_matrix/client/v3/sync?{query}", token, timeout=35)
+        except Exception:
+            time.sleep(5)
+            continue
+        since = res.get("next_batch", since)
+
+        # invites
+        for rid, data in res.get("rooms", {}).get("invite", {}).items():
+            try:
+                join_room(token, rid)
+            except Exception:
+                pass
+
+        # messages
+        for rid, data in res.get("rooms", {}).get("join", {}).items():
+            timeline = data.get("timeline", {}).get("events", [])
+            joined_count = data.get("summary", {}).get("m.joined_member_count")
+            is_dm = joined_count is not None and joined_count <= 2
+
+            for ev in timeline:
+                if ev.get("type") != "m.room.message":
+                    continue
+                content = ev.get("content", {})
+                body = (content.get("body", "") or "").strip()
+                if not body:
+                    continue
+                sender = ev.get("sender", "")
+                if sender == f"@{USER}:live.bstein.dev":
+                    continue
+
+                mentioned = is_mentioned(content, body)
+                hist_key = key_for(rid, sender, is_dm)
+                history[hist_key].append(f"{sender}: {body}")
+                history[hist_key] = history[hist_key][-80:]
+
+                if not (is_dm or mentioned):
+                    continue
+
+                # Only do live cluster/metrics introspection in DMs.
+                allow_tools = is_dm
+
+                promql = ""
+                if allow_tools:
+                    m = re.match(r"(?is)^\\s*promql\\s*(?:\\:|\\s)\\s*(.+?)\\s*$", body)
+                    if m:
+                        promql = m.group(1).strip()
+
+                # Attempt to scope tools to the most likely workloads when hostnames are mentioned.
+                targets: list[tuple[str, str]] = []
+                for m in HOST_RE.finditer(body.lower()):
+                    host = m.group(1).lower()
+                    for ep in _HOST_INDEX.get(host, []):
+                        backend = ep.get("backend") or {}
+                        ns = backend.get("namespace") or ""
+                        for w in backend.get("workloads") or []:
+                            if isinstance(w, dict) and w.get("name"):
+                                targets.append((ns, str(w["name"])))
+
+                context = build_context(body, allow_tools=allow_tools, targets=targets)
+                if allow_tools and promql:
+                    res = vm_query(promql, timeout=20)
+                    rendered = vm_render_result(res, limit=15) or "(no results)"
+                    extra = "VictoriaMetrics (PromQL result):\n" + rendered
+                    context = (context + "\n\n" + extra).strip() if context else extra
+                reply = ollama_reply(hist_key, body, context=context)
+                send_msg(token, rid, reply)
+
+def login_with_retry():
+    last_err = None
+    for attempt in range(10):
+        try:
+            return login()
+        except Exception as exc:  # noqa: BLE001
+            last_err = exc
+            time.sleep(min(30, 2 ** attempt))
+    raise last_err
+
+def main():
+    load_kb()
+    token = login_with_retry()
+    try:
+        room_id = resolve_alias(token, ROOM_ALIAS)
+        join_room(token, room_id)
+    except Exception:
+        room_id = None
+    sync_loop(token, room_id)
+
+if __name__ == "__main__":
+    main()
--- a/services/comms/scripts/guest-register/server.py
+++ b/services/comms/scripts/guest-register/server.py
@ -0,0 +1,264 @@
+import base64
+import json
+import os
+import random
+import secrets
+from http.server import BaseHTTPRequestHandler, HTTPServer
+from urllib import error, parse, request
+
+MAS_BASE = os.environ.get("MAS_BASE", "http://matrix-authentication-service:8080").rstrip("/")
+MAS_ADMIN_API_BASE = os.environ.get("MAS_ADMIN_API_BASE", "http://matrix-authentication-service:8081/api/admin/v1").rstrip("/")
+SYNAPSE_BASE = os.environ.get("SYNAPSE_BASE", "http://othrys-synapse-matrix-synapse:8008").rstrip("/")
+SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev")
+
+MAS_ADMIN_CLIENT_ID = os.environ["MAS_ADMIN_CLIENT_ID"]
+MAS_ADMIN_CLIENT_SECRET_FILE = os.environ.get("MAS_ADMIN_CLIENT_SECRET_FILE", "/etc/mas/admin-client/client_secret")
+MAS_ADMIN_SCOPE = os.environ.get("MAS_ADMIN_SCOPE", "urn:mas:admin")
+RATE_WINDOW_SEC = int(os.environ.get("RATE_WINDOW_SEC", "60"))
+RATE_MAX = int(os.environ.get("RATE_MAX", "30"))
+_rate = {}  # ip -> [window_start, count]
+
+ADJ = [
+    "brisk","calm","eager","gentle","merry","nifty","rapid","sunny","witty","zesty",
+    "amber","bold","bright","crisp","daring","frosty","glad","jolly","lively","mellow",
+    "quiet","ripe","serene","spry","tidy","vivid","warm","wild","clever","kind",
+]
+NOUN = [
+    "otter","falcon","comet","ember","grove","harbor","meadow","raven","river","summit",
+    "breeze","cedar","cinder","cove","delta","forest","glade","lark","marsh","peak",
+    "pine","quartz","reef","ridge","sable","sage","shore","thunder","vale","zephyr",
+]
+
+def _json(method, url, *, headers=None, body=None, timeout=20):
+    hdrs = {"Content-Type": "application/json"}
+    if headers:
+        hdrs.update(headers)
+    data = None
+    if body is not None:
+        data = json.dumps(body).encode()
+    req = request.Request(url, data=data, headers=hdrs, method=method)
+    try:
+        with request.urlopen(req, timeout=timeout) as resp:
+            raw = resp.read()
+            payload = json.loads(raw.decode()) if raw else {}
+            return resp.status, payload
+    except error.HTTPError as e:
+        raw = e.read()
+        try:
+            payload = json.loads(raw.decode()) if raw else {}
+        except Exception:
+            payload = {}
+        return e.code, payload
+
+def _form(method, url, *, headers=None, fields=None, timeout=20):
+    hdrs = {"Content-Type": "application/x-www-form-urlencoded"}
+    if headers:
+        hdrs.update(headers)
+    data = parse.urlencode(fields or {}).encode()
+    req = request.Request(url, data=data, headers=hdrs, method=method)
+    try:
+        with request.urlopen(req, timeout=timeout) as resp:
+            raw = resp.read()
+            payload = json.loads(raw.decode()) if raw else {}
+            return resp.status, payload
+    except error.HTTPError as e:
+        raw = e.read()
+        try:
+            payload = json.loads(raw.decode()) if raw else {}
+        except Exception:
+            payload = {}
+        return e.code, payload
+
+_admin_token = None
+_admin_token_at = 0.0
+
+def _mas_admin_access_token(now):
+    global _admin_token, _admin_token_at
+    if _admin_token and (now - _admin_token_at) < 300:
+        return _admin_token
+
+    with open(MAS_ADMIN_CLIENT_SECRET_FILE, encoding="utf-8") as fh:
+        client_secret = fh.read().strip()
+    basic = base64.b64encode(f"{MAS_ADMIN_CLIENT_ID}:{client_secret}".encode()).decode()
+
+    status, payload = _form(
+        "POST",
+        f"{MAS_BASE}/oauth2/token",
+        headers={"Authorization": f"Basic {basic}"},
+        fields={"grant_type": "client_credentials", "scope": MAS_ADMIN_SCOPE},
+        timeout=20,
+    )
+    if status != 200 or "access_token" not in payload:
+        raise RuntimeError("mas_admin_token_failed")
+
+    _admin_token = payload["access_token"]
+    _admin_token_at = now
+    return _admin_token
+
+def _generate_localpart():
+    return "guest-" + secrets.token_hex(6)
+
+def _generate_displayname():
+    return f"{random.choice(ADJ)}-{random.choice(NOUN)}"
+
+def _admin_api(admin_token, method, path, body=None):
+    return _json(
+        method,
+        f"{MAS_ADMIN_API_BASE}{path}",
+        headers={"Authorization": f"Bearer {admin_token}"},
+        body=body,
+        timeout=20,
+    )
+
+def _create_user(admin_token, username):
+    status, payload = _admin_api(admin_token, "POST", "/users", {"username": username})
+    if status != 201:
+        return status, None
+    user = payload.get("data") or {}
+    return status, user.get("id")
+
+def _set_password(admin_token, user_id, password):
+    status, _payload = _admin_api(
+        admin_token,
+        "POST",
+        f"/users/{parse.quote(user_id)}/set-password",
+        {"password": password},
+    )
+    return status in (200, 204)
+
+def _login_password(username, password):
+    payload = {
+        "type": "m.login.password",
+        "identifier": {"type": "m.id.user", "user": f"@{username}:{SERVER_NAME}"},
+        "password": password,
+    }
+    status, data = _json(
+        "POST",
+        f"{MAS_BASE}/_matrix/client/v3/login",
+        body=payload,
+        timeout=20,
+    )
+    if status != 200:
+        return None, None
+    return data.get("access_token"), data.get("device_id")
+
+def _set_display_name(access_token, user_id, displayname):
+    _json(
+        "PUT",
+        f"{SYNAPSE_BASE}/_matrix/client/v3/profile/{parse.quote(user_id, safe='')}/displayname",
+        headers={"Authorization": f"Bearer {access_token}"},
+        body={"displayname": displayname},
+        timeout=20,
+    )
+
+def _rate_check(ip, now):
+    win, cnt = _rate.get(ip, (now, 0))
+    if now - win > RATE_WINDOW_SEC:
+        _rate[ip] = (now, 1)
+        return True
+    if cnt >= RATE_MAX:
+        return False
+    _rate[ip] = (win, cnt + 1)
+    return True
+
+class Handler(BaseHTTPRequestHandler):
+    server_version = "matrix-guest-register"
+
+    def _send_json(self, code, payload):
+        body = json.dumps(payload).encode()
+        self.send_response(code)
+        self.send_header("Content-Type", "application/json")
+        self.send_header("Access-Control-Allow-Origin", "*")
+        self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
+        self.send_header("Access-Control-Allow-Headers", "Content-Type, Authorization, X-Requested-With")
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        self.wfile.write(body)
+
+    def do_OPTIONS(self):  # noqa: N802
+        self.send_response(204)
+        self.send_header("Access-Control-Allow-Origin", "*")
+        self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
+        self.send_header("Access-Control-Allow-Headers", "Content-Type, Authorization, X-Requested-With")
+        self.end_headers()
+
+    def do_GET(self):  # noqa: N802
+        parsed = parse.urlparse(self.path)
+        if parsed.path in ("/healthz", "/"):
+            return self._send_json(200, {"ok": True})
+        if parsed.path in ("/_matrix/client/v3/register", "/_matrix/client/r0/register"):
+            return self._send_json(200, {"flows": [{"stages": []}]})
+        return self._send_json(404, {"errcode": "M_NOT_FOUND", "error": "not_found"})
+
+    def do_POST(self):  # noqa: N802
+        parsed = parse.urlparse(self.path)
+        if parsed.path not in ("/_matrix/client/v3/register", "/_matrix/client/r0/register"):
+            return self._send_json(404, {"errcode": "M_NOT_FOUND", "error": "not_found"})
+
+        qs = parse.parse_qs(parsed.query)
+        kind = (qs.get("kind") or ["user"])[0]
+        if kind != "guest":
+            return self._send_json(
+                403,
+                {
+                    "errcode": "M_FORBIDDEN",
+                    "error": "Registration is disabled; use https://bstein.dev/request-access for accounts.",
+                },
+            )
+
+        xfwd = self.headers.get("x-forwarded-for", "")
+        ip = (xfwd.split(",")[0].strip() if xfwd else "") or self.client_address[0]
+        now = __import__("time").time()
+        if not _rate_check(ip, now):
+            return self._send_json(429, {"errcode": "M_LIMIT_EXCEEDED", "error": "rate_limited"})
+
+        length = int(self.headers.get("content-length", "0") or "0")
+        raw = self.rfile.read(length) if length else b"{}"
+        try:
+            body = json.loads(raw.decode()) if raw else {}
+            if not isinstance(body, dict):
+                body = {}
+        except Exception:
+            body = {}
+        try:
+            admin_token = _mas_admin_access_token(now)
+            displayname = _generate_displayname()
+
+            localpart = None
+            mas_user_id = None
+            for _ in range(5):
+                localpart = _generate_localpart()
+                status, mas_user_id = _create_user(admin_token, localpart)
+                if status == 201 and mas_user_id:
+                    break
+                mas_user_id = None
+            if not mas_user_id or not localpart:
+                raise RuntimeError("add_user_failed")
+
+            password = secrets.token_urlsafe(18)
+            if not _set_password(admin_token, mas_user_id, password):
+                raise RuntimeError("set_password_failed")
+            access_token, device_id = _login_password(localpart, password)
+            if not access_token:
+                raise RuntimeError("login_failed")
+            try:
+                _set_display_name(access_token, f"@{localpart}:{SERVER_NAME}", displayname)
+            except Exception:
+                pass
+        except Exception:
+            return self._send_json(502, {"errcode": "M_UNKNOWN", "error": "guest_provision_failed"})
+
+        resp = {
+            "user_id": f"@{localpart}:{SERVER_NAME}",
+            "access_token": access_token,
+            "device_id": device_id or "guest_device",
+            "home_server": SERVER_NAME,
+        }
+        return self._send_json(200, resp)
+
+def main():
+    port = int(os.environ.get("PORT", "8080"))
+    HTTPServer(("0.0.0.0", port), Handler).serve_forever()
+
+if __name__ == "__main__":
+    main()
--- a/services/comms/scripts/synapse/redis/ping_liveness_local.sh
+++ b/services/comms/scripts/synapse/redis/ping_liveness_local.sh
@ -0,0 +1,20 @@
+#!/bin/bash
+
+[[ -f $REDIS_PASSWORD_FILE ]] && export REDIS_PASSWORD="$(< "${REDIS_PASSWORD_FILE}")"
+[[ -n "$REDIS_PASSWORD" ]] && export REDISCLI_AUTH="$REDIS_PASSWORD"
+response=$(
+  timeout -s 15 $1 \
+  redis-cli \
+    -h localhost \
+    -p $REDIS_PORT \
+    ping
+)
+if [ "$?" -eq "124" ]; then
+  echo "Timed out"
+  exit 1
+fi
+responseFirstWord=$(echo $response | head -n1 | awk '{print $1;}')
+if [ "$response" != "PONG" ] && [ "$responseFirstWord" != "LOADING" ] && [ "$responseFirstWord" != "MASTERDOWN" ]; then
+  echo "$response"
+  exit 1
+fi
--- a/services/comms/scripts/synapse/redis/ping_liveness_local_and_master.sh
+++ b/services/comms/scripts/synapse/redis/ping_liveness_local_and_master.sh
@ -0,0 +1,5 @@
+script_dir="$(dirname "$0")"
+exit_status=0
+"$script_dir/ping_liveness_local.sh" $1 || exit_status=$?
+"$script_dir/ping_liveness_master.sh" $1 || exit_status=$?
+exit $exit_status
--- a/services/comms/scripts/synapse/redis/ping_liveness_master.sh
+++ b/services/comms/scripts/synapse/redis/ping_liveness_master.sh
@ -0,0 +1,20 @@
+#!/bin/bash
+
+[[ -f $REDIS_MASTER_PASSWORD_FILE ]] && export REDIS_MASTER_PASSWORD="$(< "${REDIS_MASTER_PASSWORD_FILE}")"
+[[ -n "$REDIS_MASTER_PASSWORD" ]] && export REDISCLI_AUTH="$REDIS_MASTER_PASSWORD"
+response=$(
+  timeout -s 15 $1 \
+  redis-cli \
+    -h $REDIS_MASTER_HOST \
+    -p $REDIS_MASTER_PORT_NUMBER \
+    ping
+)
+if [ "$?" -eq "124" ]; then
+  echo "Timed out"
+  exit 1
+fi
+responseFirstWord=$(echo $response | head -n1 | awk '{print $1;}')
+if [ "$response" != "PONG" ] && [ "$responseFirstWord" != "LOADING" ]; then
+  echo "$response"
+  exit 1
+fi
--- a/services/comms/scripts/synapse/redis/ping_readiness_local.sh
+++ b/services/comms/scripts/synapse/redis/ping_readiness_local.sh
@ -0,0 +1,19 @@
+#!/bin/bash
+
+[[ -f $REDIS_PASSWORD_FILE ]] && export REDIS_PASSWORD="$(< "${REDIS_PASSWORD_FILE}")"
+[[ -n "$REDIS_PASSWORD" ]] && export REDISCLI_AUTH="$REDIS_PASSWORD"
+response=$(
+  timeout -s 15 $1 \
+  redis-cli \
+    -h localhost \
+    -p $REDIS_PORT \
+    ping
+)
+if [ "$?" -eq "124" ]; then
+  echo "Timed out"
+  exit 1
+fi
+if [ "$response" != "PONG" ]; then
+  echo "$response"
+  exit 1
+fi
--- a/services/comms/scripts/synapse/redis/ping_readiness_local_and_master.sh
+++ b/services/comms/scripts/synapse/redis/ping_readiness_local_and_master.sh
@ -0,0 +1,5 @@
+script_dir="$(dirname "$0")"
+exit_status=0
+"$script_dir/ping_readiness_local.sh" $1 || exit_status=$?
+"$script_dir/ping_readiness_master.sh" $1 || exit_status=$?
+exit $exit_status
--- a/services/comms/scripts/synapse/redis/ping_readiness_master.sh
+++ b/services/comms/scripts/synapse/redis/ping_readiness_master.sh
@ -0,0 +1,19 @@
+#!/bin/bash
+
+[[ -f $REDIS_MASTER_PASSWORD_FILE ]] && export REDIS_MASTER_PASSWORD="$(< "${REDIS_MASTER_PASSWORD_FILE}")"
+[[ -n "$REDIS_MASTER_PASSWORD" ]] && export REDISCLI_AUTH="$REDIS_MASTER_PASSWORD"
+response=$(
+  timeout -s 15 $1 \
+  redis-cli \
+    -h $REDIS_MASTER_HOST \
+    -p $REDIS_MASTER_PORT_NUMBER \
+    ping
+)
+if [ "$?" -eq "124" ]; then
+  echo "Timed out"
+  exit 1
+fi
+if [ "$response" != "PONG" ]; then
+  echo "$response"
+  exit 1
+fi
--- a/services/comms/scripts/synapse/redis/start-master.sh
+++ b/services/comms/scripts/synapse/redis/start-master.sh
@ -0,0 +1,15 @@
+#!/bin/bash
+
+[[ -f $REDIS_PASSWORD_FILE ]] && export REDIS_PASSWORD="$(< "${REDIS_PASSWORD_FILE}")"
+if [[ -f /opt/bitnami/redis/mounted-etc/master.conf ]];then
+    cp /opt/bitnami/redis/mounted-etc/master.conf /opt/bitnami/redis/etc/master.conf
+fi
+if [[ -f /opt/bitnami/redis/mounted-etc/redis.conf ]];then
+    cp /opt/bitnami/redis/mounted-etc/redis.conf /opt/bitnami/redis/etc/redis.conf
+fi
+ARGS=("--port" "${REDIS_PORT}")
+ARGS+=("--requirepass" "${REDIS_PASSWORD}")
+ARGS+=("--masterauth" "${REDIS_PASSWORD}")
+ARGS+=("--include" "/opt/bitnami/redis/etc/redis.conf")
+ARGS+=("--include" "/opt/bitnami/redis/etc/master.conf")
+exec redis-server "${ARGS[@]}"
--- a/services/comms/scripts/synapse/signing-key.sh
+++ b/services/comms/scripts/synapse/signing-key.sh
@ -0,0 +1,41 @@
+#!/bin/sh
+
+set -eu
+
+check_key() {
+  set +e
+
+  echo "Checking for existing signing key..."
+  key="$(kubectl get secret "$SECRET_NAME" -o jsonpath="{.data['signing\.key']}" 2> /dev/null)"
+  [ $? -ne 0 ] && return 1
+  [ -z "$key" ] && return 2
+  return 0
+}
+
+create_key() {
+  echo "Waiting for new signing key to be generated..."
+  begin=$(date +%s)
+  end=$((begin + 300)) # 5 minutes
+  while true; do
+    [ -f /synapse/keys/signing.key ] && return 0
+    [ "$(date +%s)" -gt $end ] && return 1
+    sleep 5
+  done
+}
+
+store_key() {
+  echo "Storing signing key in Kubernetes secret..."
+  kubectl patch secret "$SECRET_NAME" -p "{\"data\":{\"signing.key\":\"$(base64 /synapse/keys/signing.key | tr -d '\n')\"}}"
+}
+
+if check_key; then
+  echo "Key already in place, exiting."
+  exit
+fi
+
+if ! create_key; then
+  echo "Timed out waiting for a signing key to appear."
+  exit 1
+fi
+
+store_key
--- a/services/comms/synapse-rendered.yaml
+++ b/services/comms/synapse-rendered.yaml
@ -82,140 +82,6 @@ data:
    rename-command FLUSHALL ""
    # End of replica configuration
 ---
-# Source: matrix-synapse/charts/redis/templates/health-configmap.yaml
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: othrys-synapse-redis-health
-  labels:
-    app.kubernetes.io/instance: othrys-synapse
-    app.kubernetes.io/managed-by: Helm
-    app.kubernetes.io/name: redis
-    helm.sh/chart: redis-17.17.1
-data:
-  ping_readiness_local.sh: |-
-    #!/bin/bash
-
-    [[ -f $REDIS_PASSWORD_FILE ]] && export REDIS_PASSWORD="$(< "${REDIS_PASSWORD_FILE}")"
-    [[ -n "$REDIS_PASSWORD" ]] && export REDISCLI_AUTH="$REDIS_PASSWORD"
-    response=$(
-      timeout -s 15 $1 \
-      redis-cli \
-        -h localhost \
-        -p $REDIS_PORT \
-        ping
-    )
-    if [ "$?" -eq "124" ]; then
-      echo "Timed out"
-      exit 1
-    fi
-    if [ "$response" != "PONG" ]; then
-      echo "$response"
-      exit 1
-    fi
-  ping_liveness_local.sh: |-
-    #!/bin/bash
-
-    [[ -f $REDIS_PASSWORD_FILE ]] && export REDIS_PASSWORD="$(< "${REDIS_PASSWORD_FILE}")"
-    [[ -n "$REDIS_PASSWORD" ]] && export REDISCLI_AUTH="$REDIS_PASSWORD"
-    response=$(
-      timeout -s 15 $1 \
-      redis-cli \
-        -h localhost \
-        -p $REDIS_PORT \
-        ping
-    )
-    if [ "$?" -eq "124" ]; then
-      echo "Timed out"
-      exit 1
-    fi
-    responseFirstWord=$(echo $response | head -n1 | awk '{print $1;}')
-    if [ "$response" != "PONG" ] && [ "$responseFirstWord" != "LOADING" ] && [ "$responseFirstWord" != "MASTERDOWN" ]; then
-      echo "$response"
-      exit 1
-    fi
-  ping_readiness_master.sh: |-
-    #!/bin/bash
-
-    [[ -f $REDIS_MASTER_PASSWORD_FILE ]] && export REDIS_MASTER_PASSWORD="$(< "${REDIS_MASTER_PASSWORD_FILE}")"
-    [[ -n "$REDIS_MASTER_PASSWORD" ]] && export REDISCLI_AUTH="$REDIS_MASTER_PASSWORD"
-    response=$(
-      timeout -s 15 $1 \
-      redis-cli \
-        -h $REDIS_MASTER_HOST \
-        -p $REDIS_MASTER_PORT_NUMBER \
-        ping
-    )
-    if [ "$?" -eq "124" ]; then
-      echo "Timed out"
-      exit 1
-    fi
-    if [ "$response" != "PONG" ]; then
-      echo "$response"
-      exit 1
-    fi
-  ping_liveness_master.sh: |-
-    #!/bin/bash
-
-    [[ -f $REDIS_MASTER_PASSWORD_FILE ]] && export REDIS_MASTER_PASSWORD="$(< "${REDIS_MASTER_PASSWORD_FILE}")"
-    [[ -n "$REDIS_MASTER_PASSWORD" ]] && export REDISCLI_AUTH="$REDIS_MASTER_PASSWORD"
-    response=$(
-      timeout -s 15 $1 \
-      redis-cli \
-        -h $REDIS_MASTER_HOST \
-        -p $REDIS_MASTER_PORT_NUMBER \
-        ping
-    )
-    if [ "$?" -eq "124" ]; then
-      echo "Timed out"
-      exit 1
-    fi
-    responseFirstWord=$(echo $response | head -n1 | awk '{print $1;}')
-    if [ "$response" != "PONG" ] && [ "$responseFirstWord" != "LOADING" ]; then
-      echo "$response"
-      exit 1
-    fi
-  ping_readiness_local_and_master.sh: |-
-    script_dir="$(dirname "$0")"
-    exit_status=0
-    "$script_dir/ping_readiness_local.sh" $1 || exit_status=$?
-    "$script_dir/ping_readiness_master.sh" $1 || exit_status=$?
-    exit $exit_status
-  ping_liveness_local_and_master.sh: |-
-    script_dir="$(dirname "$0")"
-    exit_status=0
-    "$script_dir/ping_liveness_local.sh" $1 || exit_status=$?
-    "$script_dir/ping_liveness_master.sh" $1 || exit_status=$?
-    exit $exit_status
---
-# Source: matrix-synapse/charts/redis/templates/scripts-configmap.yaml
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: othrys-synapse-redis-scripts
-  labels:
-    app.kubernetes.io/instance: othrys-synapse
-    app.kubernetes.io/managed-by: Helm
-    app.kubernetes.io/name: redis
-    helm.sh/chart: redis-17.17.1
-data:
-  start-master.sh: |
-    #!/bin/bash
-
-    [[ -f $REDIS_PASSWORD_FILE ]] && export REDIS_PASSWORD="$(< "${REDIS_PASSWORD_FILE}")"
-    if [[ -f /opt/bitnami/redis/mounted-etc/master.conf ]];then
-        cp /opt/bitnami/redis/mounted-etc/master.conf /opt/bitnami/redis/etc/master.conf
-    fi
-    if [[ -f /opt/bitnami/redis/mounted-etc/redis.conf ]];then
-        cp /opt/bitnami/redis/mounted-etc/redis.conf /opt/bitnami/redis/etc/redis.conf
-    fi
-    ARGS=("--port" "${REDIS_PORT}")
-    ARGS+=("--requirepass" "${REDIS_PASSWORD}")
-    ARGS+=("--masterauth" "${REDIS_PASSWORD}")
-    ARGS+=("--include" "/opt/bitnami/redis/etc/redis.conf")
-    ARGS+=("--include" "/opt/bitnami/redis/etc/master.conf")
-    exec redis-server "${ARGS[@]}"
---
 # Source: matrix-synapse/templates/configuration.yaml
 apiVersion: v1
 kind: ConfigMap
@ -870,64 +736,6 @@ metadata:
    app.kubernetes.io/component: signingkey-job
 ---
 # Source: matrix-synapse/templates/signing-key-job.yaml
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: othrys-synapse-matrix-synapse-scripts
-  labels:
-    helm.sh/chart: matrix-synapse-3.12.17
-    app.kubernetes.io/name: matrix-synapse
-    app.kubernetes.io/instance: othrys-synapse
-    app.kubernetes.io/version: "1.144.0"
-    app.kubernetes.io/managed-by: Helm
-  annotations:
-    helm.sh/hook: pre-install
-    helm.sh/hook-delete-policy: hook-succeeded
-data:
-  signing-key.sh: |
-    #!/bin/sh
-  
-    set -eu
-  
-    check_key() {
-      set +e
-  
-      echo "Checking for existing signing key..."
-      key="$(kubectl get secret "$SECRET_NAME" -o jsonpath="{.data['signing\.key']}" 2> /dev/null)"
-      [ $? -ne 0 ] && return 1
-      [ -z "$key" ] && return 2
-      return 0
-    }
-  
-    create_key() {
-      echo "Waiting for new signing key to be generated..."
-      begin=$(date +%s)
-      end=$((begin + 300)) # 5 minutes
-      while true; do
-        [ -f /synapse/keys/signing.key ] && return 0
-        [ "$(date +%s)" -gt $end ] && return 1
-        sleep 5
-      done
-    }
-  
-    store_key() {
-      echo "Storing signing key in Kubernetes secret..."
-      kubectl patch secret "$SECRET_NAME" -p "{\"data\":{\"signing.key\":\"$(base64 /synapse/keys/signing.key | tr -d '\n')\"}}"
-    }
-  
-    if check_key; then
-      echo "Key already in place, exiting."
-      exit
-    fi
-  
-    if ! create_key; then
-      echo "Timed out waiting for a signing key to appear."
-      exit 1
-    fi
-  
-    store_key
---
-# Source: matrix-synapse/templates/signing-key-job.yaml
 apiVersion: rbac.authorization.k8s.io/v1
 kind: Role
 metadata:
--- a/services/jenkins/configmap-init-scripts.yaml
+++ b/services/jenkins/configmap-init-scripts.yaml
@ -1,24 +0,0 @@
-# services/jenkins/configmap-init-scripts.yaml
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: jenkins-init-scripts
-  namespace: jenkins
-data:
-  theme.groovy: |
-    import jenkins.model.Jenkins
-    import org.codefirst.SimpleThemeDecorator
-
-    def instance = Jenkins.get()
-    def decorators = instance.getExtensionList(SimpleThemeDecorator.class)
-
-    if (decorators?.size() > 0) {
-      def theme = decorators[0]
-      theme.setCssUrl("https://jenkins-contrib-themes.github.io/jenkins-material-theme/dist/material-ocean.css")
-      theme.setJsUrl("")
-      theme.setTheme("")
-      instance.save()
-      println("Applied simple-theme-plugin dark theme")
-    } else {
-      println("simple-theme-plugin not installed; skipping theme configuration")
-    }
--- a/services/jenkins/kustomization.yaml
+++ b/services/jenkins/kustomization.yaml
@ -7,8 +7,15 @@ resources:
  - serviceaccount.yaml
  - pvc.yaml
  - configmap-jcasc.yaml
-  - configmap-init-scripts.yaml
  - configmap-plugins.yaml
  - deployment.yaml
  - service.yaml
  - ingress.yaml
+
+configMapGenerator:
+  - name: jenkins-init-scripts
+    namespace: jenkins
+    files:
+      - theme.groovy=scripts/theme.groovy
+    options:
+      disableNameSuffixHash: true
--- a/services/jenkins/scripts/theme.groovy
+++ b/services/jenkins/scripts/theme.groovy
@ -0,0 +1,16 @@
+import jenkins.model.Jenkins
+import org.codefirst.SimpleThemeDecorator
+
+def instance = Jenkins.get()
+def decorators = instance.getExtensionList(SimpleThemeDecorator.class)
+
+if (decorators?.size() > 0) {
+  def theme = decorators[0]
+  theme.setCssUrl("https://jenkins-contrib-themes.github.io/jenkins-material-theme/dist/material-ocean.css")
+  theme.setJsUrl("")
+  theme.setTheme("")
+  instance.save()
+  println("Applied simple-theme-plugin dark theme")
+} else {
+  println("simple-theme-plugin not installed; skipping theme configuration")
+}
--- a/services/logging/kustomization.yaml
+++ b/services/logging/kustomization.yaml
@ -6,11 +6,8 @@ resources:
  - opensearch-dashboards-objects.yaml
  - opensearch-observability-objects.yaml
  - node-log-rotation-serviceaccount.yaml
-  - node-log-rotation-script.yaml
  - node-image-gc-rpi4-serviceaccount.yaml
-  - node-image-gc-rpi4-script.yaml
  - node-image-prune-rpi5-serviceaccount.yaml
-  - node-image-prune-rpi5-script.yaml
  - opensearch-pvc.yaml
  - opensearch-helmrelease.yaml
  - opensearch-dashboards-helmrelease.yaml
@ -26,3 +23,35 @@ resources:
  - node-image-prune-rpi5-daemonset.yaml
  - oauth2-proxy.yaml
  - ingress.yaml
+
+configMapGenerator:
+  - name: node-log-rotation-script
+    namespace: logging
+    files:
+      - node_log_rotation.sh=scripts/node_log_rotation.sh
+    options:
+      disableNameSuffixHash: true
+  - name: node-image-gc-rpi4-script
+    namespace: logging
+    files:
+      - node_image_gc_rpi4.sh=scripts/node_image_gc_rpi4.sh
+    options:
+      disableNameSuffixHash: true
+  - name: node-image-prune-rpi5-script
+    namespace: logging
+    files:
+      - node_image_prune_rpi5.sh=scripts/node_image_prune_rpi5.sh
+    options:
+      disableNameSuffixHash: true
+  - name: opensearch-prune-script
+    namespace: logging
+    files:
+      - prune.py=scripts/opensearch_prune.py
+    options:
+      disableNameSuffixHash: true
+  - name: opensearch-observability-script
+    namespace: logging
+    files:
+      - seed.py=scripts/opensearch_observability_seed.py
+    options:
+      disableNameSuffixHash: true
--- a/services/logging/node-image-gc-rpi4-script.yaml
+++ b/services/logging/node-image-gc-rpi4-script.yaml
@ -1,44 +0,0 @@
-# services/logging/node-image-gc-rpi4-script.yaml
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: node-image-gc-rpi4-script
-  namespace: logging
-data:
-  node_image_gc_rpi4.sh: |
-    #!/usr/bin/env bash
-    set -euo pipefail
-
-    changed=0
-    k3s_changed=0
-    k3s_agent_changed=0
-
-    k3s_dropin="/host/etc/systemd/system/k3s.service.d/98-image-gc.conf"
-    k3s_agent_dropin="/host/etc/systemd/system/k3s-agent.service.d/98-image-gc.conf"
-
-    if [ -f "/host/etc/systemd/system/k3s.service" ] && [ ! -f "${k3s_dropin}" ]; then
-      mkdir -p "$(dirname "${k3s_dropin}")"
-      printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=image-gc-high-threshold=70\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-low-threshold=60\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-minimum-available=5Gi\"\n" > "${k3s_dropin}"
-      changed=1
-      k3s_changed=1
-    fi
-
-    if [ -f "/host/etc/systemd/system/k3s-agent.service" ] && [ ! -f "${k3s_agent_dropin}" ]; then
-      mkdir -p "$(dirname "${k3s_agent_dropin}")"
-      printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=image-gc-high-threshold=70\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-low-threshold=60\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-minimum-available=5Gi\"\n" > "${k3s_agent_dropin}"
-      changed=1
-      k3s_agent_changed=1
-    fi
-
-    if [ "${changed}" -eq 1 ]; then
-      sleep "$(( (RANDOM % 300) + 10 ))"
-      chroot /host /bin/systemctl daemon-reload
-      if [ "${k3s_changed}" -eq 1 ]; then
-        chroot /host /bin/systemctl restart k3s
-      fi
-      if [ "${k3s_agent_changed}" -eq 1 ]; then
-        chroot /host /bin/systemctl restart k3s-agent
-      fi
-    fi
-
-    sleep infinity
--- a/services/logging/node-image-prune-rpi5-script.yaml
+++ b/services/logging/node-image-prune-rpi5-script.yaml
@ -1,34 +0,0 @@
-# services/logging/node-image-prune-rpi5-script.yaml
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: node-image-prune-rpi5-script
-  namespace: logging
-data:
-  node_image_prune_rpi5.sh: |
-    #!/usr/bin/env bash
-    set -euo pipefail
-
-    threshold=70
-
-    sleep "$(( (RANDOM % 300) + 10 ))"
-
-    while true; do
-      usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}')
-      if [ -z "${usage}" ]; then
-        sleep 1800
-        continue
-      fi
-
-      if [ "${usage}" -ge "${threshold}" ]; then
-        chroot /host /bin/sh -c '
-          if command -v crictl >/dev/null 2>&1; then
-            crictl --runtime-endpoint=unix:///run/k3s/containerd/containerd.sock rmi --prune || true
-          elif [ -x /usr/local/bin/crictl ]; then
-            /usr/local/bin/crictl --runtime-endpoint=unix:///run/k3s/containerd/containerd.sock rmi --prune || true
-          fi
-        '
-      fi
-
-      sleep 21600
-    done
--- a/services/logging/node-log-rotation-script.yaml
+++ b/services/logging/node-log-rotation-script.yaml
@ -1,72 +0,0 @@
-# services/logging/node-log-rotation-script.yaml
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: node-log-rotation-script
-  namespace: logging
-data:
-  node_log_rotation.sh: |
-    #!/usr/bin/env bash
-    set -euo pipefail
-
-    changed=0
-    journald_changed=0
-    k3s_changed=0
-    k3s_agent_changed=0
-
-    journald_dropin="/host/etc/systemd/journald.conf.d/99-logging.conf"
-    k3s_dropin="/host/etc/systemd/system/k3s.service.d/99-logging.conf"
-    k3s_agent_dropin="/host/etc/systemd/system/k3s-agent.service.d/99-logging.conf"
-    k3s_image_gc_dropin="/host/etc/systemd/system/k3s.service.d/98-image-gc.conf"
-    k3s_agent_image_gc_dropin="/host/etc/systemd/system/k3s-agent.service.d/98-image-gc.conf"
-
-    if [ ! -f "${journald_dropin}" ]; then
-      mkdir -p "$(dirname "${journald_dropin}")"
-      printf "[Journal]\nStorage=volatile\nRuntimeMaxUse=200M\nRuntimeKeepFree=512M\nMaxFileSec=1h\n" > "${journald_dropin}"
-      changed=1
-      journald_changed=1
-    fi
-
-    if [ -f "/host/etc/systemd/system/k3s.service" ] && [ ! -f "${k3s_dropin}" ]; then
-      mkdir -p "$(dirname "${k3s_dropin}")"
-      printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-files=2\"\n" > "${k3s_dropin}"
-      changed=1
-      k3s_changed=1
-    fi
-
-    if [ -f "/host/etc/systemd/system/k3s.service" ] && [ ! -f "${k3s_image_gc_dropin}" ]; then
-      mkdir -p "$(dirname "${k3s_image_gc_dropin}")"
-      printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=image-gc-high-threshold=70\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-low-threshold=60\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-minimum-available=5Gi\"\n" > "${k3s_image_gc_dropin}"
-      changed=1
-      k3s_changed=1
-    fi
-
-    if [ -f "/host/etc/systemd/system/k3s-agent.service" ] && [ ! -f "${k3s_agent_dropin}" ]; then
-      mkdir -p "$(dirname "${k3s_agent_dropin}")"
-      printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-files=2\"\n" > "${k3s_agent_dropin}"
-      changed=1
-      k3s_agent_changed=1
-    fi
-
-    if [ -f "/host/etc/systemd/system/k3s-agent.service" ] && [ ! -f "${k3s_agent_image_gc_dropin}" ]; then
-      mkdir -p "$(dirname "${k3s_agent_image_gc_dropin}")"
-      printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=image-gc-high-threshold=70\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-low-threshold=60\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-minimum-available=5Gi\"\n" > "${k3s_agent_image_gc_dropin}"
-      changed=1
-      k3s_agent_changed=1
-    fi
-
-    if [ "${changed}" -eq 1 ]; then
-      sleep "$(( (RANDOM % 300) + 10 ))"
-      chroot /host /bin/systemctl daemon-reload
-      if [ "${journald_changed}" -eq 1 ]; then
-        chroot /host /bin/systemctl restart systemd-journald
-      fi
-      if [ "${k3s_changed}" -eq 1 ]; then
-        chroot /host /bin/systemctl restart k3s
-      fi
-      if [ "${k3s_agent_changed}" -eq 1 ]; then
-        chroot /host /bin/systemctl restart k3s-agent
-      fi
-    fi
-
-    sleep infinity
--- a/services/logging/opensearch-observability-setup-job.yaml
+++ b/services/logging/opensearch-observability-setup-job.yaml
@ -1,152 +1,4 @@
 # services/logging/opensearch-observability-setup-job.yaml
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: opensearch-observability-script
-  namespace: logging
-data:
-  seed.py: |
-    import json
-    import os
-    import time
-    import urllib.error
-    import urllib.request
-
-    OSD_URL = os.environ.get(
-        "OSD_URL",
-        "http://opensearch-dashboards.logging.svc.cluster.local:5601",
-    ).rstrip("/")
-    OBJECT_DIR = "/config"
-
-    def request_json(method, path, payload=None):
-        url = f"{OSD_URL}{path}"
-        data = None
-        headers = {"osd-xsrf": "true"}
-        if payload is not None:
-            data = json.dumps(payload).encode("utf-8")
-            headers["Content-Type"] = "application/json"
-
-        req = urllib.request.Request(url, data=data, method=method)
-        for key, value in headers.items():
-            req.add_header(key, value)
-
-        try:
-            with urllib.request.urlopen(req, timeout=30) as response:
-                body = response.read().decode("utf-8")
-        except urllib.error.HTTPError as exc:
-            detail = exc.read().decode("utf-8")
-            raise SystemExit(f"{method} {path} failed: {exc.code} {detail}")
-
-        if not body:
-            return {}
-        return json.loads(body)
-
-
-    def wait_ready():
-        for _ in range(60):
-            try:
-                request_json("GET", "/api/status")
-                return
-            except Exception:
-                time.sleep(5)
-        raise SystemExit("OpenSearch Dashboards did not become ready in time")
-
-
-    def load_payload(name):
-        path = os.path.join(OBJECT_DIR, name)
-        with open(path, "r", encoding="utf-8") as handle:
-            return json.load(handle)
-
-
-    def index_by_name(items, key):
-        lookup = {}
-        for item in items:
-            obj = item.get(key, {})
-            name = obj.get("name")
-            if not name:
-                continue
-            lookup.setdefault(name, item)
-        return lookup
-
-
-    def ensure_applications(apps):
-        existing = request_json("GET", "/api/observability/application/").get("data", [])
-        existing_by_name = {app.get("name"): app for app in existing if app.get("name")}
-
-        for app in apps:
-            name = app.get("name")
-            if not name:
-                continue
-            current = existing_by_name.get(name)
-            if not current:
-                request_json("POST", "/api/observability/application/", app)
-                print(f"created application: {name}")
-                continue
-
-            if app.get("baseQuery") != current.get("baseQuery"):
-                print(f"baseQuery differs for {name}; skipping update")
-
-            update_body = {}
-            for key in ("description", "servicesEntities", "traceGroups"):
-                if app.get(key, "") != current.get(key, ""):
-                    update_body[key] = app.get(key, "")
-
-            if update_body:
-                request_json(
-                    "PUT",
-                    "/api/observability/application/",
-                    {"appId": current["id"], "updateBody": update_body},
-                )
-                print(f"updated application: {name}")
-
-
-    def ensure_saved_objects(objects, object_type, endpoint):
-        existing = request_json(
-            "GET",
-            f"/api/observability/event_analytics/saved_objects?objectType={object_type}",
-        ).get("observabilityObjectList", [])
-        key = "savedQuery" if object_type == "savedQuery" else "savedVisualization"
-        existing_by_name = index_by_name(existing, key)
-
-        for obj in objects:
-            name = obj.get("name")
-            if not name:
-                continue
-            current = existing_by_name.get(name)
-            if not current:
-                request_json("POST", endpoint, {"object": obj})
-                print(f"created {object_type}: {name}")
-                continue
-
-            current_body = current.get(key, {})
-            if current_body != obj:
-                request_json(
-                    "PUT",
-                    endpoint,
-                    {"object_id": current["objectId"], "object": obj},
-                )
-                print(f"updated {object_type}: {name}")
-
-
-    def main():
-        wait_ready()
-
-        applications = load_payload("applications.json")
-        queries = load_payload("saved_queries.json")
-        visualizations = load_payload("saved_visualizations.json")
-
-        ensure_applications(applications)
-        ensure_saved_objects(queries, "savedQuery", "/api/observability/event_analytics/saved_objects/query")
-        ensure_saved_objects(
-            visualizations,
-            "savedVisualization",
-            "/api/observability/event_analytics/saved_objects/vis",
-        )
-
-
-    if __name__ == "__main__":
-        main()
---
 apiVersion: batch/v1
 kind: Job
 metadata:
--- a/services/logging/opensearch-prune-cronjob.yaml
+++ b/services/logging/opensearch-prune-cronjob.yaml
@ -1,89 +1,4 @@
 # services/logging/opensearch-prune-cronjob.yaml
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: opensearch-prune-script
-  namespace: logging
-data:
-  prune.py: |
-    import json
-    import os
-    import re
-    import sys
-    import urllib.error
-    import urllib.request
-
-    os_url = os.environ.get("OPENSEARCH_URL", "http://opensearch-master.logging.svc.cluster.local:9200").rstrip("/")
-    limit_bytes = int(os.environ.get("LOG_LIMIT_BYTES", str(1024**4)))
-    patterns = [p.strip() for p in os.environ.get("LOG_INDEX_PATTERNS", "kube-*,journald-*").split(",") if p.strip()]
-
-    UNITS = {
-        "b": 1,
-        "kb": 1024,
-        "mb": 1024**2,
-        "gb": 1024**3,
-        "tb": 1024**4,
-    }
-
-    def parse_size(value: str) -> int:
-        if not value:
-            return 0
-        text = value.strip().lower()
-        if text in ("-", "0"):
-            return 0
-        match = re.match(r"^([0-9.]+)([a-z]+)$", text)
-        if not match:
-            return 0
-        number = float(match.group(1))
-        unit = match.group(2)
-        if unit not in UNITS:
-            return 0
-        return int(number * UNITS[unit])
-
-    def request_json(path: str):
-        url = f"{os_url}{path}"
-        with urllib.request.urlopen(url, timeout=30) as response:
-            payload = response.read().decode("utf-8")
-        return json.loads(payload)
-
-    def delete_index(index: str) -> None:
-        url = f"{os_url}/{index}"
-        req = urllib.request.Request(url, method="DELETE")
-        with urllib.request.urlopen(req, timeout=30) as response:
-            _ = response.read()
-        print(f"deleted {index}")
-
-    indices = []
-    for pattern in patterns:
-        try:
-            data = request_json(f"/_cat/indices/{pattern}?format=json&h=index,store.size,creation.date")
-        except urllib.error.HTTPError as exc:
-            if exc.code == 404:
-                continue
-            raise
-        for item in data:
-            index = item.get("index")
-            if not index or index.startswith("."):
-                continue
-            size = parse_size(item.get("store.size", ""))
-            created = int(item.get("creation.date", "0") or 0)
-            indices.append({"index": index, "size": size, "created": created})
-
-    total = sum(item["size"] for item in indices)
-    print(f"total_log_bytes={total}")
-    if total <= limit_bytes:
-        print("within limit")
-        sys.exit(0)
-
-    indices.sort(key=lambda item: item["created"])
-    for item in indices:
-        if total <= limit_bytes:
-            break
-        delete_index(item["index"])
-        total -= item["size"]
-
-    print(f"remaining_log_bytes={total}")
---
 apiVersion: batch/v1
 kind: CronJob
 metadata:
--- a/services/logging/scripts/node_image_gc_rpi4.sh
+++ b/services/logging/scripts/node_image_gc_rpi4.sh
@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+changed=0
+k3s_changed=0
+k3s_agent_changed=0
+
+k3s_dropin="/host/etc/systemd/system/k3s.service.d/98-image-gc.conf"
+k3s_agent_dropin="/host/etc/systemd/system/k3s-agent.service.d/98-image-gc.conf"
+
+if [ -f "/host/etc/systemd/system/k3s.service" ] && [ ! -f "${k3s_dropin}" ]; then
+  mkdir -p "$(dirname "${k3s_dropin}")"
+  printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=image-gc-high-threshold=70\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-low-threshold=60\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-minimum-available=5Gi\"\n" > "${k3s_dropin}"
+  changed=1
+  k3s_changed=1
+fi
+
+if [ -f "/host/etc/systemd/system/k3s-agent.service" ] && [ ! -f "${k3s_agent_dropin}" ]; then
+  mkdir -p "$(dirname "${k3s_agent_dropin}")"
+  printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=image-gc-high-threshold=70\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-low-threshold=60\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-minimum-available=5Gi\"\n" > "${k3s_agent_dropin}"
+  changed=1
+  k3s_agent_changed=1
+fi
+
+if [ "${changed}" -eq 1 ]; then
+  sleep "$(( (RANDOM % 300) + 10 ))"
+  chroot /host /bin/systemctl daemon-reload
+  if [ "${k3s_changed}" -eq 1 ]; then
+    chroot /host /bin/systemctl restart k3s
+  fi
+  if [ "${k3s_agent_changed}" -eq 1 ]; then
+    chroot /host /bin/systemctl restart k3s-agent
+  fi
+fi
+
+sleep infinity
--- a/services/logging/scripts/node_image_prune_rpi5.sh
+++ b/services/logging/scripts/node_image_prune_rpi5.sh
@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+threshold=70
+
+sleep "$(( (RANDOM % 300) + 10 ))"
+
+while true; do
+  usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}')
+  if [ -z "${usage}" ]; then
+    sleep 1800
+    continue
+  fi
+
+  if [ "${usage}" -ge "${threshold}" ]; then
+    chroot /host /bin/sh -c '
+      if command -v crictl >/dev/null 2>&1; then
+        crictl --runtime-endpoint=unix:///run/k3s/containerd/containerd.sock rmi --prune || true
+      elif [ -x /usr/local/bin/crictl ]; then
+        /usr/local/bin/crictl --runtime-endpoint=unix:///run/k3s/containerd/containerd.sock rmi --prune || true
+      fi
+    '
+  fi
+
+  sleep 21600
+done
--- a/services/logging/scripts/node_log_rotation.sh
+++ b/services/logging/scripts/node_log_rotation.sh
@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+changed=0
+journald_changed=0
+k3s_changed=0
+k3s_agent_changed=0
+
+journald_dropin="/host/etc/systemd/journald.conf.d/99-logging.conf"
+k3s_dropin="/host/etc/systemd/system/k3s.service.d/99-logging.conf"
+k3s_agent_dropin="/host/etc/systemd/system/k3s-agent.service.d/99-logging.conf"
+k3s_image_gc_dropin="/host/etc/systemd/system/k3s.service.d/98-image-gc.conf"
+k3s_agent_image_gc_dropin="/host/etc/systemd/system/k3s-agent.service.d/98-image-gc.conf"
+
+if [ ! -f "${journald_dropin}" ]; then
+  mkdir -p "$(dirname "${journald_dropin}")"
+  printf "[Journal]\nStorage=volatile\nRuntimeMaxUse=200M\nRuntimeKeepFree=512M\nMaxFileSec=1h\n" > "${journald_dropin}"
+  changed=1
+  journald_changed=1
+fi
+
+if [ -f "/host/etc/systemd/system/k3s.service" ] && [ ! -f "${k3s_dropin}" ]; then
+  mkdir -p "$(dirname "${k3s_dropin}")"
+  printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-files=2\"\n" > "${k3s_dropin}"
+  changed=1
+  k3s_changed=1
+fi
+
+if [ -f "/host/etc/systemd/system/k3s.service" ] && [ ! -f "${k3s_image_gc_dropin}" ]; then
+  mkdir -p "$(dirname "${k3s_image_gc_dropin}")"
+  printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=image-gc-high-threshold=70\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-low-threshold=60\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-minimum-available=5Gi\"\n" > "${k3s_image_gc_dropin}"
+  changed=1
+  k3s_changed=1
+fi
+
+if [ -f "/host/etc/systemd/system/k3s-agent.service" ] && [ ! -f "${k3s_agent_dropin}" ]; then
+  mkdir -p "$(dirname "${k3s_agent_dropin}")"
+  printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-files=2\"\n" > "${k3s_agent_dropin}"
+  changed=1
+  k3s_agent_changed=1
+fi
+
+if [ -f "/host/etc/systemd/system/k3s-agent.service" ] && [ ! -f "${k3s_agent_image_gc_dropin}" ]; then
+  mkdir -p "$(dirname "${k3s_agent_image_gc_dropin}")"
+  printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=image-gc-high-threshold=70\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-low-threshold=60\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-minimum-available=5Gi\"\n" > "${k3s_agent_image_gc_dropin}"
+  changed=1
+  k3s_agent_changed=1
+fi
+
+if [ "${changed}" -eq 1 ]; then
+  sleep "$(( (RANDOM % 300) + 10 ))"
+  chroot /host /bin/systemctl daemon-reload
+  if [ "${journald_changed}" -eq 1 ]; then
+    chroot /host /bin/systemctl restart systemd-journald
+  fi
+  if [ "${k3s_changed}" -eq 1 ]; then
+    chroot /host /bin/systemctl restart k3s
+  fi
+  if [ "${k3s_agent_changed}" -eq 1 ]; then
+    chroot /host /bin/systemctl restart k3s-agent
+  fi
+fi
+
+sleep infinity
--- a/services/logging/scripts/opensearch_observability_seed.py
+++ b/services/logging/scripts/opensearch_observability_seed.py
@ -0,0 +1,140 @@
+import json
+import os
+import time
+import urllib.error
+import urllib.request
+
+OSD_URL = os.environ.get(
+    "OSD_URL",
+    "http://opensearch-dashboards.logging.svc.cluster.local:5601",
+).rstrip("/")
+OBJECT_DIR = "/config"
+
+def request_json(method, path, payload=None):
+    url = f"{OSD_URL}{path}"
+    data = None
+    headers = {"osd-xsrf": "true"}
+    if payload is not None:
+        data = json.dumps(payload).encode("utf-8")
+        headers["Content-Type"] = "application/json"
+
+    req = urllib.request.Request(url, data=data, method=method)
+    for key, value in headers.items():
+        req.add_header(key, value)
+
+    try:
+        with urllib.request.urlopen(req, timeout=30) as response:
+            body = response.read().decode("utf-8")
+    except urllib.error.HTTPError as exc:
+        detail = exc.read().decode("utf-8")
+        raise SystemExit(f"{method} {path} failed: {exc.code} {detail}")
+
+    if not body:
+        return {}
+    return json.loads(body)
+
+
+def wait_ready():
+    for _ in range(60):
+        try:
+            request_json("GET", "/api/status")
+            return
+        except Exception:
+            time.sleep(5)
+    raise SystemExit("OpenSearch Dashboards did not become ready in time")
+
+
+def load_payload(name):
+    path = os.path.join(OBJECT_DIR, name)
+    with open(path, "r", encoding="utf-8") as handle:
+        return json.load(handle)
+
+
+def index_by_name(items, key):
+    lookup = {}
+    for item in items:
+        obj = item.get(key, {})
+        name = obj.get("name")
+        if not name:
+            continue
+        lookup.setdefault(name, item)
+    return lookup
+
+
+def ensure_applications(apps):
+    existing = request_json("GET", "/api/observability/application/").get("data", [])
+    existing_by_name = {app.get("name"): app for app in existing if app.get("name")}
+
+    for app in apps:
+        name = app.get("name")
+        if not name:
+            continue
+        current = existing_by_name.get(name)
+        if not current:
+            request_json("POST", "/api/observability/application/", app)
+            print(f"created application: {name}")
+            continue
+
+        if app.get("baseQuery") != current.get("baseQuery"):
+            print(f"baseQuery differs for {name}; skipping update")
+
+        update_body = {}
+        for key in ("description", "servicesEntities", "traceGroups"):
+            if app.get(key, "") != current.get(key, ""):
+                update_body[key] = app.get(key, "")
+
+        if update_body:
+            request_json(
+                "PUT",
+                "/api/observability/application/",
+                {"appId": current["id"], "updateBody": update_body},
+            )
+            print(f"updated application: {name}")
+
+
+def ensure_saved_objects(objects, object_type, endpoint):
+    existing = request_json(
+        "GET",
+        f"/api/observability/event_analytics/saved_objects?objectType={object_type}",
+    ).get("observabilityObjectList", [])
+    key = "savedQuery" if object_type == "savedQuery" else "savedVisualization"
+    existing_by_name = index_by_name(existing, key)
+
+    for obj in objects:
+        name = obj.get("name")
+        if not name:
+            continue
+        current = existing_by_name.get(name)
+        if not current:
+            request_json("POST", endpoint, {"object": obj})
+            print(f"created {object_type}: {name}")
+            continue
+
+        current_body = current.get(key, {})
+        if current_body != obj:
+            request_json(
+                "PUT",
+                endpoint,
+                {"object_id": current["objectId"], "object": obj},
+            )
+            print(f"updated {object_type}: {name}")
+
+
+def main():
+    wait_ready()
+
+    applications = load_payload("applications.json")
+    queries = load_payload("saved_queries.json")
+    visualizations = load_payload("saved_visualizations.json")
+
+    ensure_applications(applications)
+    ensure_saved_objects(queries, "savedQuery", "/api/observability/event_analytics/saved_objects/query")
+    ensure_saved_objects(
+        visualizations,
+        "savedVisualization",
+        "/api/observability/event_analytics/saved_objects/vis",
+    )
+
+
+if __name__ == "__main__":
+    main()
--- a/services/logging/scripts/opensearch_prune.py
+++ b/services/logging/scripts/opensearch_prune.py
@ -0,0 +1,77 @@
+import json
+import os
+import re
+import sys
+import urllib.error
+import urllib.request
+
+os_url = os.environ.get("OPENSEARCH_URL", "http://opensearch-master.logging.svc.cluster.local:9200").rstrip("/")
+limit_bytes = int(os.environ.get("LOG_LIMIT_BYTES", str(1024**4)))
+patterns = [p.strip() for p in os.environ.get("LOG_INDEX_PATTERNS", "kube-*,journald-*").split(",") if p.strip()]
+
+UNITS = {
+    "b": 1,
+    "kb": 1024,
+    "mb": 1024**2,
+    "gb": 1024**3,
+    "tb": 1024**4,
+}
+
+def parse_size(value: str) -> int:
+    if not value:
+        return 0
+    text = value.strip().lower()
+    if text in ("-", "0"):
+        return 0
+    match = re.match(r"^([0-9.]+)([a-z]+)$", text)
+    if not match:
+        return 0
+    number = float(match.group(1))
+    unit = match.group(2)
+    if unit not in UNITS:
+        return 0
+    return int(number * UNITS[unit])
+
+def request_json(path: str):
+    url = f"{os_url}{path}"
+    with urllib.request.urlopen(url, timeout=30) as response:
+        payload = response.read().decode("utf-8")
+    return json.loads(payload)
+
+def delete_index(index: str) -> None:
+    url = f"{os_url}/{index}"
+    req = urllib.request.Request(url, method="DELETE")
+    with urllib.request.urlopen(req, timeout=30) as response:
+        _ = response.read()
+    print(f"deleted {index}")
+
+indices = []
+for pattern in patterns:
+    try:
+        data = request_json(f"/_cat/indices/{pattern}?format=json&h=index,store.size,creation.date")
+    except urllib.error.HTTPError as exc:
+        if exc.code == 404:
+            continue
+        raise
+    for item in data:
+        index = item.get("index")
+        if not index or index.startswith("."):
+            continue
+        size = parse_size(item.get("store.size", ""))
+        created = int(item.get("creation.date", "0") or 0)
+        indices.append({"index": index, "size": size, "created": created})
+
+total = sum(item["size"] for item in indices)
+print(f"total_log_bytes={total}")
+if total <= limit_bytes:
+    print("within limit")
+    sys.exit(0)
+
+indices.sort(key=lambda item: item["created"])
+for item in indices:
+    if total <= limit_bytes:
+        break
+    delete_index(item["index"])
+    total -= item["size"]
+
+print(f"remaining_log_bytes={total}")
--- a/services/maintenance/kustomization.yaml
+++ b/services/maintenance/kustomization.yaml
@ -5,11 +5,28 @@ resources:
  - namespace.yaml
  - node-nofile-serviceaccount.yaml
  - pod-cleaner-rbac.yaml
-  - node-nofile-script.yaml
-  - pod-cleaner-script.yaml
  - node-nofile-daemonset.yaml
  - pod-cleaner-cronjob.yaml
  - node-image-sweeper-serviceaccount.yaml
-  - node-image-sweeper-script.yaml
  - node-image-sweeper-daemonset.yaml
  - image-sweeper-cronjob.yaml
+
+configMapGenerator:
+  - name: node-nofile-script
+    namespace: maintenance
+    files:
+      - node_nofile.sh=scripts/node_nofile.sh
+    options:
+      disableNameSuffixHash: true
+  - name: pod-cleaner-script
+    namespace: maintenance
+    files:
+      - pod_cleaner.sh=scripts/pod_cleaner.sh
+    options:
+      disableNameSuffixHash: true
+  - name: node-image-sweeper-script
+    namespace: maintenance
+    files:
+      - node_image_sweeper.sh=scripts/node_image_sweeper.sh
+    options:
+      disableNameSuffixHash: true
--- a/services/maintenance/node-image-sweeper-script.yaml
+++ b/services/maintenance/node-image-sweeper-script.yaml
@ -1,100 +0,0 @@
-# services/maintenance/node-image-sweeper-script.yaml
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: node-image-sweeper-script
-  namespace: maintenance
-data:
-  node_image_sweeper.sh: |
-    #!/bin/sh
-    set -eu
-
-    ONE_SHOT=${ONE_SHOT:-false}
-    THRESHOLD_DAYS=14
-
-    usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
-    if [ -n "${usage}" ] && [ "${usage}" -ge 70 ]; then
-      THRESHOLD_DAYS=3
-    fi
-
-    cutoff=$(python3 - <<'PY'
-    import time, os
-    print(int(time.time()) - int(os.environ.get("THRESHOLD_DAYS", "14")) * 86400)
-    PY
-    )
-
-    RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
-    IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
-
-    SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
-
-    prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
-    import json, os, sys, time
-
-    try:
-        data = json.load(sys.stdin)
-    except Exception:
-        print("", end="")
-        sys.exit(0)
-
-    cutoff = int(os.environ.get("CUTOFF", "0"))
-    running = set(os.environ.get("RUNNING", "").split())
-    skip = os.environ.get("SKIP", "").split()
-    now = int(time.time())
-    prune = []
-
-
-    def is_skip(tags):
-        if not tags:
-            return False
-        for t in tags:
-            for prefix in skip:
-                if prefix and t.startswith(prefix):
-                    return True
-        return False
-
-
-    for img in data.get("images", []):
-        image_id = img.get("id", "")
-        if not image_id:
-            continue
-        if image_id in running:
-            continue
-        tags = img.get("repoTags") or []
-        if is_skip(tags):
-            continue
-        created = img.get("createdAt") or 0
-        try:
-            created = int(str(created)) // 1000000000
-        except Exception:
-            created = 0
-        if created and created > now:
-            created = now
-        if cutoff and created and created < cutoff:
-            prune.append(image_id)
-
-    seen = set()
-    for p in prune:
-        if p in seen:
-            continue
-        seen.add(p)
-        print(p)
-    PY
-    )
-
-    if [ -n "${prune_list}" ]; then
-      printf "%s" "${prune_list}" | while read -r image_id; do
-        if [ -n "${image_id}" ]; then
-          chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true
-        fi
-      done
-    fi
-
-    find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
-    find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true
-
-    if [ "${ONE_SHOT}" = "true" ]; then
-      exit 0
-    fi
-
-    sleep infinity
--- a/services/maintenance/node-nofile-script.yaml
+++ b/services/maintenance/node-nofile-script.yaml
@ -1,38 +0,0 @@
-# services/maintenance/node-nofile-script.yaml
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: node-nofile-script
-  namespace: maintenance
-data:
-  node_nofile.sh: |
-    #!/usr/bin/env bash
-    set -euo pipefail
-
-    limit_line="LimitNOFILE=1048576"
-    changed=0
-
-    for unit in k3s k3s-agent; do
-      unit_file="/host/etc/systemd/system/${unit}.service"
-      if [ -f "${unit_file}" ]; then
-        dropin_dir="/host/etc/systemd/system/${unit}.service.d"
-        dropin_file="${dropin_dir}/99-nofile.conf"
-        if [ ! -f "${dropin_file}" ] || ! grep -q "${limit_line}" "${dropin_file}"; then
-          mkdir -p "${dropin_dir}"
-          printf "[Service]\n%s\n" "${limit_line}" > "${dropin_file}"
-          changed=1
-        fi
-      fi
-    done
-
-    if [ "${changed}" -eq 1 ]; then
-      sleep "$(( (RANDOM % 300) + 10 ))"
-      chroot /host /bin/systemctl daemon-reload
-      for unit in k3s k3s-agent; do
-        if [ -f "/host/etc/systemd/system/${unit}.service" ]; then
-          chroot /host /bin/systemctl restart "${unit}"
-        fi
-      done
-    fi
-
-    sleep infinity
--- a/services/maintenance/pod-cleaner-script.yaml
+++ b/services/maintenance/pod-cleaner-script.yaml
@ -1,20 +0,0 @@
-# services/maintenance/pod-cleaner-script.yaml
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: pod-cleaner-script
-  namespace: maintenance
-data:
-  pod_cleaner.sh: |
-    #!/usr/bin/env bash
-    set -euo pipefail
-
-    for phase in Succeeded Failed; do
-      kubectl get pods -A --field-selector="status.phase=${phase}" \
-        -o jsonpath='{range .items[*]}{.metadata.namespace}{" "}{.metadata.name}{"\n"}{end}' \
-        | while read -r namespace name; do
-            if [ -n "${namespace}" ] && [ -n "${name}" ]; then
-              kubectl delete pod -n "${namespace}" "${name}" --ignore-not-found --grace-period=0 --wait=false
-            fi
-          done
-    done
--- a/services/maintenance/scripts/node_image_sweeper.sh
+++ b/services/maintenance/scripts/node_image_sweeper.sh
@ -0,0 +1,92 @@
+#!/bin/sh
+set -eu
+
+ONE_SHOT=${ONE_SHOT:-false}
+THRESHOLD_DAYS=14
+
+usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
+if [ -n "${usage}" ] && [ "${usage}" -ge 70 ]; then
+  THRESHOLD_DAYS=3
+fi
+
+cutoff=$(python3 - <<'PY'
+import time, os
+print(int(time.time()) - int(os.environ.get("THRESHOLD_DAYS", "14")) * 86400)
+PY
+)
+
+RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
+IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
+
+SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
+
+prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
+import json, os, sys, time
+
+try:
+    data = json.load(sys.stdin)
+except Exception:
+    print("", end="")
+    sys.exit(0)
+
+cutoff = int(os.environ.get("CUTOFF", "0"))
+running = set(os.environ.get("RUNNING", "").split())
+skip = os.environ.get("SKIP", "").split()
+now = int(time.time())
+prune = []
+
+
+def is_skip(tags):
+    if not tags:
+        return False
+    for t in tags:
+        for prefix in skip:
+            if prefix and t.startswith(prefix):
+                return True
+    return False
+
+
+for img in data.get("images", []):
+    image_id = img.get("id", "")
+    if not image_id:
+        continue
+    if image_id in running:
+        continue
+    tags = img.get("repoTags") or []
+    if is_skip(tags):
+        continue
+    created = img.get("createdAt") or 0
+    try:
+        created = int(str(created)) // 1000000000
+    except Exception:
+        created = 0
+    if created and created > now:
+        created = now
+    if cutoff and created and created < cutoff:
+        prune.append(image_id)
+
+seen = set()
+for p in prune:
+    if p in seen:
+        continue
+    seen.add(p)
+    print(p)
+PY
+)
+
+if [ -n "${prune_list}" ]; then
+  printf "%s" "${prune_list}" | while read -r image_id; do
+    if [ -n "${image_id}" ]; then
+      chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true
+    fi
+  done
+fi
+
+find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
+find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true
+
+if [ "${ONE_SHOT}" = "true" ]; then
+  exit 0
+fi
+
+sleep infinity
--- a/services/maintenance/scripts/node_nofile.sh
+++ b/services/maintenance/scripts/node_nofile.sh
@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+limit_line="LimitNOFILE=1048576"
+changed=0
+
+for unit in k3s k3s-agent; do
+  unit_file="/host/etc/systemd/system/${unit}.service"
+  if [ -f "${unit_file}" ]; then
+    dropin_dir="/host/etc/systemd/system/${unit}.service.d"
+    dropin_file="${dropin_dir}/99-nofile.conf"
+    if [ ! -f "${dropin_file}" ] || ! grep -q "${limit_line}" "${dropin_file}"; then
+      mkdir -p "${dropin_dir}"
+      printf "[Service]\n%s\n" "${limit_line}" > "${dropin_file}"
+      changed=1
+    fi
+  fi
+done
+
+if [ "${changed}" -eq 1 ]; then
+  sleep "$(( (RANDOM % 300) + 10 ))"
+  chroot /host /bin/systemctl daemon-reload
+  for unit in k3s k3s-agent; do
+    if [ -f "/host/etc/systemd/system/${unit}.service" ]; then
+      chroot /host /bin/systemctl restart "${unit}"
+    fi
+  done
+fi
+
+sleep infinity
--- a/services/maintenance/scripts/pod_cleaner.sh
+++ b/services/maintenance/scripts/pod_cleaner.sh
@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+for phase in Succeeded Failed; do
+  kubectl get pods -A --field-selector="status.phase=${phase}" \
+    -o jsonpath='{range .items[*]}{.metadata.namespace}{" "}{.metadata.name}{"\n"}{end}' \
+    | while read -r namespace name; do
+        if [ -n "${namespace}" ] && [ -n "${name}" ]; then
+          kubectl delete pod -n "${namespace}" "${name}" --ignore-not-found --grace-period=0 --wait=false
+        fi
+      done
+done
--- a/services/monitoring/grafana-smtp-sync-script.yaml
+++ b/services/monitoring/grafana-smtp-sync-script.yaml
@ -1,39 +0,0 @@
-# services/monitoring/grafana-smtp-sync-script.yaml
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: grafana-smtp-sync-script
-  namespace: monitoring
-data:
-  sync.sh: |
-    #!/bin/sh
-    set -euo pipefail
-
-    SOURCE_NS=${SOURCE_NS:-mailu-mailserver}
-    SOURCE_SECRET=${SOURCE_SECRET:-mailu-postmark-relay}
-    TARGET_NS=${TARGET_NS:-monitoring}
-    TARGET_SECRET=${TARGET_SECRET:-grafana-smtp}
-
-    tmp=$(mktemp)
-    cleanup() { rm -f "$tmp"; }
-    trap cleanup EXIT
-
-    kubectl -n "$SOURCE_NS" get secret "$SOURCE_SECRET" -o json > "$tmp"
-
-    pass=$(jq -r '.data["relay-password"]' "$tmp")
-    user=$pass
-
-    if [ -z "$user" ] || [ -z "$pass" ] || [ "$user" = "null" ] || [ "$pass" = "null" ]; then
-      echo "missing credentials from $SOURCE_NS/$SOURCE_SECRET" >&2
-      exit 1
-    fi
-
-    cat <<SECRET | kubectl -n "$TARGET_NS" apply -f -
-    apiVersion: v1
-    kind: Secret
-    metadata:
-      name: $TARGET_SECRET
-    stringData:
-      username: $(echo "$user" | base64 -d)
-      password: $(echo "$pass" | base64 -d)
-    SECRET
--- a/services/monitoring/jetson-tegrastats-exporter.yaml
+++ b/services/monitoring/jetson-tegrastats-exporter.yaml
@ -78,91 +78,3 @@ spec:
    - name: metrics
      port: 9100
      targetPort: metrics
---
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: jetson-tegrastats-exporter-script
-  namespace: monitoring
-data:
-  exporter.py: |
-    import http.server
-    import os
-    import re
-    import socketserver
-    import subprocess
-    import threading
-    from time import time
-
-    PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100"))
-    METRICS = {
-        "gr3d_freq_percent": 0.0,
-        "gpu_temp_c": 0.0,
-        "cpu_temp_c": 0.0,
-        "ram_used_mb": 0.0,
-        "ram_total_mb": 0.0,
-        "power_5v_in_mw": 0.0,
-        "last_scrape_ts": 0.0,
-    }
-    LOCK = threading.Lock()
-
-    def parse_line(line: str):
-        updates = {}
-        m = re.search(r"GR3D_FREQ\\s+(\\d+)%", line)
-        if m:
-            updates["gr3d_freq_percent"] = float(m.group(1))
-        m = re.search(r"GPU@(\\d+(?:\\.\\d+)?)C", line)
-        if m:
-            updates["gpu_temp_c"] = float(m.group(1))
-        m = re.search(r"CPU@(\\d+(?:\\.\\d+)?)C", line)
-        if m:
-            updates["cpu_temp_c"] = float(m.group(1))
-        m = re.search(r"RAM\\s+(\\d+)/(\\d+)MB", line)
-        if m:
-            updates["ram_used_mb"] = float(m.group(1))
-            updates["ram_total_mb"] = float(m.group(2))
-        m = re.search(r"POM_5V_IN\\s+(\\d+)/(\\d+)", line)
-        if m:
-            updates["power_5v_in_mw"] = float(m.group(1))
-        with LOCK:
-            METRICS.update(updates)
-            METRICS["last_scrape_ts"] = time()
-
-    def run_tegrastats():
-        proc = subprocess.Popen(
-            ["/host/usr/bin/tegrastats", "--interval", "1000"],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.STDOUT,
-            text=True,
-            bufsize=1,
-        )
-        for line in proc.stdout:
-            parse_line(line)
-
-    class Handler(http.server.BaseHTTPRequestHandler):
-        def do_GET(self):
-            if self.path != "/metrics":
-                self.send_response(404)
-                self.end_headers()
-                return
-            with LOCK:
-                metrics = METRICS.copy()
-            out = []
-            for k, v in metrics.items():
-                out.append(f"# TYPE jetson_{k} gauge")
-                out.append(f"jetson_{k} {v}")
-            body = "\\n".join(out) + "\\n"
-            self.send_response(200)
-            self.send_header("Content-Type", "text/plain; version=0.0.4")
-            self.send_header("Content-Length", str(len(body)))
-            self.end_headers()
-            self.wfile.write(body.encode("utf-8"))
-
-        def log_message(self, fmt, *args):
-            return
-
-    if __name__ == "__main__":
-        t = threading.Thread(target=run_tegrastats, daemon=True)
-        t.start()
-        with socketserver.TCPServer(("", PORT), Handler) as httpd:
-            httpd.serve_forever()
--- a/services/monitoring/kustomization.yaml
+++ b/services/monitoring/kustomization.yaml
@ -5,7 +5,6 @@ namespace: monitoring
 resources:
  - namespace.yaml
  - rbac.yaml
-  - postmark-exporter-script.yaml
  - grafana-dashboard-overview.yaml
  - grafana-dashboard-pods.yaml
  - grafana-dashboard-nodes.yaml
@ -20,8 +19,27 @@ resources:
  - grafana-alerting-config.yaml
  - grafana-smtp-sync-serviceaccount.yaml
  - grafana-smtp-sync-rbac.yaml
-  - grafana-smtp-sync-script.yaml
  - grafana-smtp-sync-cronjob.yaml
  - grafana-folders.yaml
  - helmrelease.yaml
  - grafana-org-bootstrap.yaml
+
+configMapGenerator:
+  - name: postmark-exporter-script
+    namespace: monitoring
+    files:
+      - monitoring_postmark_exporter.py=scripts/postmark_exporter.py
+    options:
+      disableNameSuffixHash: true
+  - name: grafana-smtp-sync-script
+    namespace: monitoring
+    files:
+      - sync.sh=scripts/grafana_smtp_sync.sh
+    options:
+      disableNameSuffixHash: true
+  - name: jetson-tegrastats-exporter-script
+    namespace: monitoring
+    files:
+      - exporter.py=scripts/jetson_tegrastats_exporter.py
+    options:
+      disableNameSuffixHash: true
--- a/services/monitoring/postmark-exporter-script.yaml
+++ b/services/monitoring/postmark-exporter-script.yaml
@ -1,156 +0,0 @@
-# services/monitoring/postmark-exporter-script.yaml
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: postmark-exporter-script
-data:
-  monitoring_postmark_exporter.py: |
-    #!/usr/bin/env python3
-
-    import datetime as dt
-    import os
-    import time
-    from dataclasses import dataclass
-
-    import requests
-    from prometheus_client import Gauge, Info, start_http_server
-
-
-    @dataclass(frozen=True)
-    class Window:
-        label: str
-        days: int
-
-
-    WINDOWS = [
-        Window("today", 0),
-        Window("1d", 1),
-        Window("7d", 7),
-        Window("30d", 30),
-    ]
-
-    API_BASE = os.environ.get("POSTMARK_API_BASE", "https://api.postmarkapp.com").rstrip("/")
-    POLL_INTERVAL_SECONDS = int(os.environ.get("POLL_INTERVAL_SECONDS", "60"))
-    LISTEN_ADDRESS = os.environ.get("LISTEN_ADDRESS", "0.0.0.0")
-    LISTEN_PORT = int(os.environ.get("LISTEN_PORT", "8000"))
-
-    PRIMARY_TOKEN = os.environ.get("POSTMARK_SERVER_TOKEN", "").strip()
-    FALLBACK_TOKEN = os.environ.get("POSTMARK_SERVER_TOKEN_FALLBACK", "").strip()
-    LIMIT_WINDOW = os.environ.get("POSTMARK_SENDING_LIMIT_WINDOW", "30d").strip()
-    LIMIT_RAW = os.environ.get("POSTMARK_SENDING_LIMIT", "").strip()
-    try:
-        SENDING_LIMIT = float(LIMIT_RAW) if LIMIT_RAW else 0.0
-    except ValueError:
-        SENDING_LIMIT = 0.0
-
-    EXPORTER_INFO = Info("postmark_exporter", "Exporter build info")
-    EXPORTER_INFO.info(
-        {
-            "api_base": API_BASE,
-            "windows": ",".join(window.label for window in WINDOWS),
-        }
-    )
-
-    POSTMARK_API_UP = Gauge("postmark_api_up", "Whether Postmark API is reachable (1) or not (0)")
-    POSTMARK_LAST_SUCCESS = Gauge(
-        "postmark_last_success_timestamp_seconds",
-        "Unix timestamp of the last successful Postmark stats refresh",
-    )
-    POSTMARK_REQUEST_ERRORS = Gauge(
-        "postmark_request_errors_total",
-        "Total Postmark stats request errors since exporter start",
-    )
-
-    POSTMARK_OUTBOUND_SENT = Gauge(
-        "postmark_outbound_sent",
-        "Outbound emails sent within the selected window",
-        labelnames=("window",),
-    )
-    POSTMARK_OUTBOUND_BOUNCED = Gauge(
-        "postmark_outbound_bounced",
-        "Outbound emails bounced within the selected window",
-        labelnames=("window",),
-    )
-    POSTMARK_OUTBOUND_BOUNCE_RATE = Gauge(
-        "postmark_outbound_bounce_rate",
-        "Outbound bounce rate percentage within the selected window",
-        labelnames=("window",),
-    )
-    POSTMARK_SENDING_LIMIT_GAUGE = Gauge(
-        "postmark_sending_limit",
-        "Configured Postmark sending limit for the active account",
-    )
-    POSTMARK_SENDING_LIMIT_USED = Gauge(
-        "postmark_sending_limit_used",
-        "Messages sent within the configured send limit window",
-    )
-    POSTMARK_SENDING_LIMIT_USED_PERCENT = Gauge(
-        "postmark_sending_limit_used_percent",
-        "Percent of the configured send limit used within the limit window",
-    )
-
-
-    def fetch_outbound_stats(token: str, window: Window) -> dict:
-        today = dt.date.today()
-        fromdate = today - dt.timedelta(days=window.days)
-        params = {"fromdate": fromdate.isoformat(), "todate": today.isoformat()}
-        headers = {
-            "Accept": "application/json",
-            "X-Postmark-Server-Token": token,
-        }
-        response = requests.get(
-            f"{API_BASE}/stats/outbound",
-            headers=headers,
-            params=params,
-            timeout=15,
-        )
-        response.raise_for_status()
-        return response.json()
-
-
-    def update_metrics(token: str) -> None:
-        sent_by_window = {}
-        for window in WINDOWS:
-            data = fetch_outbound_stats(token, window)
-            sent = int(data.get("Sent", 0) or 0)
-            bounced = int(data.get("Bounced", 0) or 0)
-            rate = (bounced / sent * 100.0) if sent else 0.0
-            sent_by_window[window.label] = sent
-            POSTMARK_OUTBOUND_SENT.labels(window=window.label).set(sent)
-            POSTMARK_OUTBOUND_BOUNCED.labels(window=window.label).set(bounced)
-            POSTMARK_OUTBOUND_BOUNCE_RATE.labels(window=window.label).set(rate)
-
-        POSTMARK_SENDING_LIMIT_GAUGE.set(SENDING_LIMIT)
-        limit_window_sent = sent_by_window.get(LIMIT_WINDOW, 0)
-        POSTMARK_SENDING_LIMIT_USED.set(limit_window_sent)
-        if SENDING_LIMIT:
-            POSTMARK_SENDING_LIMIT_USED_PERCENT.set(limit_window_sent / SENDING_LIMIT * 100.0)
-        else:
-            POSTMARK_SENDING_LIMIT_USED_PERCENT.set(0.0)
-
-
-    def main() -> None:
-        if not PRIMARY_TOKEN and not FALLBACK_TOKEN:
-            raise SystemExit("POSTMARK_SERVER_TOKEN or POSTMARK_SERVER_TOKEN_FALLBACK is required")
-
-        start_http_server(LISTEN_PORT, addr=LISTEN_ADDRESS)
-
-        tokens = [token for token in (PRIMARY_TOKEN, FALLBACK_TOKEN) if token]
-        token_index = 0
-
-        while True:
-            token = tokens[token_index % len(tokens)]
-            token_index += 1
-            try:
-                update_metrics(token)
-                POSTMARK_API_UP.set(1)
-                POSTMARK_LAST_SUCCESS.set(time.time())
-            except Exception as exc:  # noqa: BLE001
-                POSTMARK_API_UP.set(0)
-                POSTMARK_REQUEST_ERRORS.inc()
-                print(f"postmark_exporter: refresh failed: {exc}", flush=True)
-            time.sleep(POLL_INTERVAL_SECONDS)
-
-
-    if __name__ == "__main__":
-        main()
--- a/services/monitoring/scripts/grafana_smtp_sync.sh
+++ b/services/monitoring/scripts/grafana_smtp_sync.sh
@ -0,0 +1,31 @@
+#!/bin/sh
+set -euo pipefail
+
+SOURCE_NS=${SOURCE_NS:-mailu-mailserver}
+SOURCE_SECRET=${SOURCE_SECRET:-mailu-postmark-relay}
+TARGET_NS=${TARGET_NS:-monitoring}
+TARGET_SECRET=${TARGET_SECRET:-grafana-smtp}
+
+tmp=$(mktemp)
+cleanup() { rm -f "$tmp"; }
+trap cleanup EXIT
+
+kubectl -n "$SOURCE_NS" get secret "$SOURCE_SECRET" -o json > "$tmp"
+
+pass=$(jq -r '.data["relay-password"]' "$tmp")
+user=$pass
+
+if [ -z "$user" ] || [ -z "$pass" ] || [ "$user" = "null" ] || [ "$pass" = "null" ]; then
+  echo "missing credentials from $SOURCE_NS/$SOURCE_SECRET" >&2
+  exit 1
+fi
+
+cat <<SECRET | kubectl -n "$TARGET_NS" apply -f -
+apiVersion: v1
+kind: Secret
+metadata:
+  name: $TARGET_SECRET
+stringData:
+  username: $(echo "$user" | base64 -d)
+  password: $(echo "$pass" | base64 -d)
+SECRET
--- a/services/monitoring/scripts/jetson_tegrastats_exporter.py
+++ b/services/monitoring/scripts/jetson_tegrastats_exporter.py
@ -0,0 +1,80 @@
+import http.server
+import os
+import re
+import socketserver
+import subprocess
+import threading
+from time import time
+
+PORT = int(os.environ.get("JETSON_EXPORTER_PORT", "9100"))
+METRICS = {
+    "gr3d_freq_percent": 0.0,
+    "gpu_temp_c": 0.0,
+    "cpu_temp_c": 0.0,
+    "ram_used_mb": 0.0,
+    "ram_total_mb": 0.0,
+    "power_5v_in_mw": 0.0,
+    "last_scrape_ts": 0.0,
+}
+LOCK = threading.Lock()
+
+def parse_line(line: str):
+    updates = {}
+    m = re.search(r"GR3D_FREQ\\s+(\\d+)%", line)
+    if m:
+        updates["gr3d_freq_percent"] = float(m.group(1))
+    m = re.search(r"GPU@(\\d+(?:\\.\\d+)?)C", line)
+    if m:
+        updates["gpu_temp_c"] = float(m.group(1))
+    m = re.search(r"CPU@(\\d+(?:\\.\\d+)?)C", line)
+    if m:
+        updates["cpu_temp_c"] = float(m.group(1))
+    m = re.search(r"RAM\\s+(\\d+)/(\\d+)MB", line)
+    if m:
+        updates["ram_used_mb"] = float(m.group(1))
+        updates["ram_total_mb"] = float(m.group(2))
+    m = re.search(r"POM_5V_IN\\s+(\\d+)/(\\d+)", line)
+    if m:
+        updates["power_5v_in_mw"] = float(m.group(1))
+    with LOCK:
+        METRICS.update(updates)
+        METRICS["last_scrape_ts"] = time()
+
+def run_tegrastats():
+    proc = subprocess.Popen(
+        ["/host/usr/bin/tegrastats", "--interval", "1000"],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        bufsize=1,
+    )
+    for line in proc.stdout:
+        parse_line(line)
+
+class Handler(http.server.BaseHTTPRequestHandler):
+    def do_GET(self):
+        if self.path != "/metrics":
+            self.send_response(404)
+            self.end_headers()
+            return
+        with LOCK:
+            metrics = METRICS.copy()
+        out = []
+        for k, v in metrics.items():
+            out.append(f"# TYPE jetson_{k} gauge")
+            out.append(f"jetson_{k} {v}")
+        body = "\\n".join(out) + "\\n"
+        self.send_response(200)
+        self.send_header("Content-Type", "text/plain; version=0.0.4")
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        self.wfile.write(body.encode("utf-8"))
+
+    def log_message(self, fmt, *args):
+        return
+
+if __name__ == "__main__":
+    t = threading.Thread(target=run_tegrastats, daemon=True)
+    t.start()
+    with socketserver.TCPServer(("", PORT), Handler) as httpd:
+        httpd.serve_forever()
--- a/services/monitoring/scripts/postmark_exporter.py
+++ b/services/monitoring/scripts/postmark_exporter.py
@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+
+import datetime as dt
+import os
+import time
+from dataclasses import dataclass
+
+import requests
+from prometheus_client import Gauge, Info, start_http_server
+
+
+@dataclass(frozen=True)
+class Window:
+    label: str
+    days: int
+
+
+WINDOWS = [
+    Window("today", 0),
+    Window("1d", 1),
+    Window("7d", 7),
+    Window("30d", 30),
+]
+
+API_BASE = os.environ.get("POSTMARK_API_BASE", "https://api.postmarkapp.com").rstrip("/")
+POLL_INTERVAL_SECONDS = int(os.environ.get("POLL_INTERVAL_SECONDS", "60"))
+LISTEN_ADDRESS = os.environ.get("LISTEN_ADDRESS", "0.0.0.0")
+LISTEN_PORT = int(os.environ.get("LISTEN_PORT", "8000"))
+
+PRIMARY_TOKEN = os.environ.get("POSTMARK_SERVER_TOKEN", "").strip()
+FALLBACK_TOKEN = os.environ.get("POSTMARK_SERVER_TOKEN_FALLBACK", "").strip()
+LIMIT_WINDOW = os.environ.get("POSTMARK_SENDING_LIMIT_WINDOW", "30d").strip()
+LIMIT_RAW = os.environ.get("POSTMARK_SENDING_LIMIT", "").strip()
+try:
+    SENDING_LIMIT = float(LIMIT_RAW) if LIMIT_RAW else 0.0
+except ValueError:
+    SENDING_LIMIT = 0.0
+
+EXPORTER_INFO = Info("postmark_exporter", "Exporter build info")
+EXPORTER_INFO.info(
+    {
+        "api_base": API_BASE,
+        "windows": ",".join(window.label for window in WINDOWS),
+    }
+)
+
+POSTMARK_API_UP = Gauge("postmark_api_up", "Whether Postmark API is reachable (1) or not (0)")
+POSTMARK_LAST_SUCCESS = Gauge(
+    "postmark_last_success_timestamp_seconds",
+    "Unix timestamp of the last successful Postmark stats refresh",
+)
+POSTMARK_REQUEST_ERRORS = Gauge(
+    "postmark_request_errors_total",
+    "Total Postmark stats request errors since exporter start",
+)
+
+POSTMARK_OUTBOUND_SENT = Gauge(
+    "postmark_outbound_sent",
+    "Outbound emails sent within the selected window",
+    labelnames=("window",),
+)
+POSTMARK_OUTBOUND_BOUNCED = Gauge(
+    "postmark_outbound_bounced",
+    "Outbound emails bounced within the selected window",
+    labelnames=("window",),
+)
+POSTMARK_OUTBOUND_BOUNCE_RATE = Gauge(
+    "postmark_outbound_bounce_rate",
+    "Outbound bounce rate percentage within the selected window",
+    labelnames=("window",),
+)
+POSTMARK_SENDING_LIMIT_GAUGE = Gauge(
+    "postmark_sending_limit",
+    "Configured Postmark sending limit for the active account",
+)
+POSTMARK_SENDING_LIMIT_USED = Gauge(
+    "postmark_sending_limit_used",
+    "Messages sent within the configured send limit window",
+)
+POSTMARK_SENDING_LIMIT_USED_PERCENT = Gauge(
+    "postmark_sending_limit_used_percent",
+    "Percent of the configured send limit used within the limit window",
+)
+
+
+def fetch_outbound_stats(token: str, window: Window) -> dict:
+    today = dt.date.today()
+    fromdate = today - dt.timedelta(days=window.days)
+    params = {"fromdate": fromdate.isoformat(), "todate": today.isoformat()}
+    headers = {
+        "Accept": "application/json",
+        "X-Postmark-Server-Token": token,
+    }
+    response = requests.get(
+        f"{API_BASE}/stats/outbound",
+        headers=headers,
+        params=params,
+        timeout=15,
+    )
+    response.raise_for_status()
+    return response.json()
+
+
+def update_metrics(token: str) -> None:
+    sent_by_window = {}
+    for window in WINDOWS:
+        data = fetch_outbound_stats(token, window)
+        sent = int(data.get("Sent", 0) or 0)
+        bounced = int(data.get("Bounced", 0) or 0)
+        rate = (bounced / sent * 100.0) if sent else 0.0
+        sent_by_window[window.label] = sent
+        POSTMARK_OUTBOUND_SENT.labels(window=window.label).set(sent)
+        POSTMARK_OUTBOUND_BOUNCED.labels(window=window.label).set(bounced)
+        POSTMARK_OUTBOUND_BOUNCE_RATE.labels(window=window.label).set(rate)
+
+    POSTMARK_SENDING_LIMIT_GAUGE.set(SENDING_LIMIT)
+    limit_window_sent = sent_by_window.get(LIMIT_WINDOW, 0)
+    POSTMARK_SENDING_LIMIT_USED.set(limit_window_sent)
+    if SENDING_LIMIT:
+        POSTMARK_SENDING_LIMIT_USED_PERCENT.set(limit_window_sent / SENDING_LIMIT * 100.0)
+    else:
+        POSTMARK_SENDING_LIMIT_USED_PERCENT.set(0.0)
+
+
+def main() -> None:
+    if not PRIMARY_TOKEN and not FALLBACK_TOKEN:
+        raise SystemExit("POSTMARK_SERVER_TOKEN or POSTMARK_SERVER_TOKEN_FALLBACK is required")
+
+    start_http_server(LISTEN_PORT, addr=LISTEN_ADDRESS)
+
+    tokens = [token for token in (PRIMARY_TOKEN, FALLBACK_TOKEN) if token]
+    token_index = 0
+
+    while True:
+        token = tokens[token_index % len(tokens)]
+        token_index += 1
+        try:
+            update_metrics(token)
+            POSTMARK_API_UP.set(1)
+            POSTMARK_LAST_SUCCESS.set(time.time())
+        except Exception as exc:  # noqa: BLE001
+            POSTMARK_API_UP.set(0)
+            POSTMARK_REQUEST_ERRORS.inc()
+            print(f"postmark_exporter: refresh failed: {exc}", flush=True)
+        time.sleep(POLL_INTERVAL_SECONDS)
+
+
+if __name__ == "__main__":
+    main()