
from __future__ import annotations
import json
import threading
import time
from pathlib import Path
from typing import Any
from flask import jsonify, request
import httpx
from .. import settings
def register(app) -> None:
    """Register the Atlas AI chat and model-info endpoints on *app*."""

    @app.route("/api/chat", methods=["POST"])
    @app.route("/api/ai/chat", methods=["POST"])
    def ai_chat() -> Any:
        """Return an Atlasbot answer or a budget-aware fallback message."""
        body = request.get_json(silent=True) or {}
        question = (body.get("message") or "").strip()
        profile = (body.get("profile") or body.get("mode") or "atlas-quick").strip().lower()
        raw_conv = body.get("conversation_id")
        conversation_id = raw_conv if isinstance(raw_conv, str) else ""
        if not question:
            return jsonify({"error": "message required"}), 400
        started = time.time()
        # Map the requested profile onto one of the three Atlasbot modes;
        # anything unrecognised falls back to "quick".
        if profile in {"atlas-smart", "smart"}:
            mode = "smart"
        elif profile in {"atlas-genius", "genius"}:
            mode = "genius"
        else:
            mode = "quick"
        answer = _atlasbot_answer(question, mode, conversation_id)
        source = f"atlas-{mode}"
        elapsed_ms = int((time.time() - started) * 1000)
        if answer:
            return jsonify({"reply": answer, "latency_ms": elapsed_ms, "source": source})
        # Empty answer means Atlasbot soft-failed (most often a timeout):
        # explain the per-mode response budget instead of erroring out.
        if mode == "quick":
            budget = max(1, int(round(settings.AI_ATLASBOT_TIMEOUT_QUICK_SEC)))
            fallback = (
                f"Quick mode hit {budget}s response budget before finishing. "
                "Try atlas-smart for a deeper answer."
            )
        elif mode == "smart":
            budget = max(1, int(round(settings.AI_ATLASBOT_TIMEOUT_SMART_SEC)))
            fallback = (
                f"Smart mode hit {budget}s response budget before finishing. "
                "Try atlas-genius or ask a narrower follow-up."
            )
        else:
            fallback = "Atlas genius mode timed out before it could finish. Please retry with a narrower prompt."
        return jsonify(
            {
                "reply": fallback,
                "latency_ms": elapsed_ms,
                "source": source,
            }
        )

    @app.route("/api/chat/info", methods=["GET"])
    @app.route("/api/ai/info", methods=["GET"])
    def ai_info() -> Any:
        """Return model and placement metadata for the requested AI profile."""
        profile = (request.args.get("profile") or "atlas-quick").strip().lower()
        return jsonify(_discover_ai_meta(profile))

    # Kick off the optional background keep-warm loop once at registration.
    _start_keep_warm()
def _atlasbot_answer(message: str, mode: str, conversation_id: str) -> str:
    """Ask Atlasbot for one answer and return an empty string on soft failure.

    Soft failures — no endpoint configured, transport errors, non-200 status,
    unparsable JSON, or an unexpectedly shaped JSON body — all yield "" so the
    caller can emit its budget-aware fallback instead of raising.
    """
    endpoint = settings.AI_ATLASBOT_ENDPOINT
    if not endpoint:
        return ""
    headers: dict[str, str] = {}
    if settings.AI_ATLASBOT_TOKEN:
        # Internal auth header is only attached when a token is configured.
        headers["X-Internal-Token"] = settings.AI_ATLASBOT_TOKEN
    payload: dict[str, str] = {"prompt": message, "mode": mode}
    if conversation_id:
        payload["conversation_id"] = conversation_id
    try:
        with httpx.Client(timeout=_atlasbot_timeout_sec(mode)) as client:
            resp = client.post(endpoint, json=payload, headers=headers)
        if resp.status_code != 200:
            return ""
        data = resp.json()
    except (httpx.RequestError, ValueError):
        # Covers timeouts/connection errors and malformed JSON bodies.
        return ""
    if not isinstance(data, dict):
        # FIX: a JSON array or scalar body previously raised AttributeError
        # on .get(), escaping the soft-failure contract of this helper.
        return ""
    return (data.get("reply") or data.get("answer") or "").strip()
def _atlasbot_timeout_sec(mode: str) -> float:
    """Return the Atlasbot request timeout (seconds) for *mode*.

    Unknown modes fall back to the quick-mode budget.
    """
    if mode == "smart":
        return settings.AI_ATLASBOT_TIMEOUT_SMART_SEC
    if mode == "genius":
        return settings.AI_ATLASBOT_TIMEOUT_GENIUS_SEC
    return settings.AI_ATLASBOT_TIMEOUT_QUICK_SEC
def _discover_ai_meta(profile: str) -> dict[str, str]:
    """Discover AI model metadata from settings and the running Kubernetes pod.

    WHY: the frontend needs a human-readable model/GPU hint even when the model
    image or GPU placement changes outside the portal code.
    """
    meta = _profile_meta(profile)
    _enrich_meta_from_k8s(meta)
    return meta


def _profile_meta(profile: str) -> dict[str, str]:
    """Build the settings-derived baseline metadata dict for *profile*."""
    meta = {
        "node": settings.AI_NODE_NAME,
        "gpu": settings.AI_GPU_DESC,
        "model": settings.AI_CHAT_MODEL,
        "endpoint": settings.AI_PUBLIC_ENDPOINT or "/api/chat",
        "profile": profile,
    }
    # Per-profile model override; each override falls back to the default
    # chat model when its setting is empty.
    if profile in {"atlas-smart", "smart"}:
        meta["model"] = settings.AI_ATLASBOT_MODEL_SMART or settings.AI_CHAT_MODEL
        meta["endpoint"] = "/api/ai/chat"
    elif profile in {"atlas-genius", "genius"}:
        meta["model"] = settings.AI_ATLASBOT_MODEL_GENIUS or settings.AI_CHAT_MODEL
        meta["endpoint"] = "/api/ai/chat"
    elif profile in {"atlas-quick", "quick"}:
        meta["model"] = settings.AI_ATLASBOT_MODEL_FAST or settings.AI_CHAT_MODEL
        meta["endpoint"] = "/api/ai/chat"
    return meta


def _enrich_meta_from_k8s(meta: dict[str, str]) -> None:
    """Best-effort: overwrite node/gpu/model in *meta* from the live pod.

    Mutates *meta* in place. Returns silently when not running in-cluster
    (no mounted service account) or on any API/parse failure, so the
    settings-derived defaults survive.
    """
    sa_path = Path("/var/run/secrets/kubernetes.io/serviceaccount")
    token_path = sa_path / "token"
    ca_path = sa_path / "ca.crt"
    ns_path = sa_path / "namespace"
    if not token_path.exists() or not ca_path.exists() or not ns_path.exists():
        # No service-account mount -> not inside a Kubernetes pod.
        return
    try:
        token = token_path.read_text().strip()
        namespace = settings.AI_K8S_NAMESPACE
        base_url = "https://kubernetes.default.svc"
        pod_url = f"{base_url}/api/v1/namespaces/{namespace}/pods?labelSelector={settings.AI_K8S_LABEL}"
        with httpx.Client(
            verify=str(ca_path),
            timeout=settings.HTTP_CHECK_TIMEOUT_SEC,
            headers={"Authorization": f"Bearer {token}"},
        ) as client:
            resp = client.get(pod_url)
            resp.raise_for_status()
            data = resp.json()
        items = data.get("items") or []
        # Prefer Running pods, but fall back to any match (e.g. mid-rollout).
        running = [p for p in items if p.get("status", {}).get("phase") == "Running"] or items
        if not running:
            return
        pod = running[0]
        meta["node"] = pod.get("spec", {}).get("nodeName") or meta["node"]
        annotations = pod.get("metadata", {}).get("annotations") or {}
        gpu_hint = (
            annotations.get(settings.AI_GPU_ANNOTATION)
            or annotations.get("ai.gpu/description")
            or annotations.get("gpu/description")
        )
        if gpu_hint:
            meta["gpu"] = gpu_hint
        model_hint = annotations.get(settings.AI_MODEL_ANNOTATION)
        if not model_hint:
            # No annotation: fall back to the first container's image tag
            # as a rough model hint.
            containers = pod.get("spec", {}).get("containers") or []
            if containers:
                image = containers[0].get("image") or ""
                model_hint = image.split(":")[-1] if ":" in image else image
        if model_hint:
            meta["model"] = model_hint
    except Exception:
        # Best-effort enrichment: any failure keeps the settings defaults.
        pass
def _start_keep_warm() -> None:
    """Start the optional background keep-warm loop for the chat backend."""
    if not settings.AI_WARM_ENABLED or settings.AI_WARM_INTERVAL_SEC <= 0:
        # Feature disabled or non-positive interval: nothing to start.
        return

    def warm_worker() -> None:
        """Periodically send a tiny chat request so the backend stays warm."""
        while True:
            time.sleep(settings.AI_WARM_INTERVAL_SEC)
            try:
                payload = {
                    "model": settings.AI_CHAT_MODEL,
                    "messages": [{"role": "user", "content": "ping"}],
                    "stream": False,
                }
                # Cap the ping timeout so a slow backend can't stall the loop.
                ping_timeout = min(settings.AI_CHAT_TIMEOUT_SEC, 15)
                with httpx.Client(timeout=ping_timeout) as client:
                    client.post(f"{settings.AI_CHAT_API}/api/chat", json=payload)
            except Exception:
                # Best-effort warm ping: swallow every failure and keep looping.
                continue

    threading.Thread(target=warm_worker, daemon=True, name="ai-keep-warm").start()