atlasbot: shift to facts context and upgrade model
This commit is contained in:
parent 4fcecc4707
commit a2f4c51e1d
@@ -20,7 +20,7 @@ spec:
       labels:
         app: ollama
       annotations:
-        ai.bstein.dev/model: qwen2.5:7b-instruct-q4_0
+        ai.bstein.dev/model: qwen2.5:14b-instruct-q4_0
         ai.bstein.dev/gpu: GPU pool (titan-22/24)
         ai.bstein.dev/restartedAt: "2026-01-26T12:00:00Z"
     spec:
@@ -52,7 +52,7 @@ spec:
         - name: OLLAMA_MODELS
           value: /root/.ollama
         - name: OLLAMA_MODEL
-          value: qwen2.5:7b-instruct-q4_0
+          value: qwen2.5:14b-instruct-q4_0
         command:
         - /bin/sh
         - -c
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-29
+        checksum/atlasbot-configmap: manual-atlasbot-30
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
@@ -82,7 +82,7 @@ spec:
         - name: OLLAMA_URL
          value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/
        - name: OLLAMA_MODEL
-         value: qwen2.5:7b-instruct-q4_0
+         value: qwen2.5:14b-instruct-q4_0
        - name: OLLAMA_TIMEOUT_SEC
          value: "600"
        - name: ATLASBOT_THINKING_INTERVAL_SEC
@@ -33,7 +33,10 @@ SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev")

 MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500"))
 MAX_TOOL_CHARS = int(os.environ.get("ATLASBOT_MAX_TOOL_CHARS", "2500"))
+MAX_FACTS_CHARS = int(os.environ.get("ATLASBOT_MAX_FACTS_CHARS", "8000"))
 THINKING_INTERVAL_SEC = int(os.environ.get("ATLASBOT_THINKING_INTERVAL_SEC", "120"))
+OLLAMA_RETRIES = int(os.environ.get("ATLASBOT_OLLAMA_RETRIES", "2"))
+OLLAMA_SERIALIZE = os.environ.get("ATLASBOT_OLLAMA_SERIALIZE", "true").lower() != "false"

 TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9_.-]{1,}", re.IGNORECASE)
 HOST_RE = re.compile(r"(?i)([a-z0-9-]+(?:\.[a-z0-9-]+)+)")
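The three new knobs are plain environment overrides. A minimal sketch of the parsing semantics, with illustrative values rather than the deployment's; note that only the literal string "false" (case-insensitively) turns serialization off:

import os

# Illustrative values, not the deployment's.
os.environ["ATLASBOT_MAX_FACTS_CHARS"] = "4000"
os.environ["ATLASBOT_OLLAMA_SERIALIZE"] = "FALSE"

max_facts = int(os.environ.get("ATLASBOT_MAX_FACTS_CHARS", "8000"))
serialize = os.environ.get("ATLASBOT_OLLAMA_SERIALIZE", "true").lower() != "false"

print(max_facts)  # 4000
print(serialize)  # False; "0", "no", or "" would still leave serialization on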
@@ -113,6 +116,8 @@ METRIC_HINTS = {
     "connections": ("connections", "conn", "postgres", "database", "db"),
 }

+_OLLAMA_LOCK = threading.Lock()
+
 HARDWARE_HINTS = {
     "amd64": ("amd64", "x86", "x86_64", "x86-64"),
     "jetson": ("jetson",),
@@ -638,6 +643,105 @@ def node_inventory_for_prompt(prompt: str) -> list[dict[str, Any]]:
         return node_inventory()
     return []


+def _nodes_by_arch(inventory: list[dict[str, Any]]) -> dict[str, list[str]]:
+    grouped: dict[str, list[str]] = collections.defaultdict(list)
+    for node in inventory:
+        grouped[(node.get("arch") or "unknown")].append(node["name"])
+    return {k: sorted(v) for k, v in grouped.items()}
+
+
+def _node_usage_table(metrics: dict[str, Any]) -> list[dict[str, Any]]:
+    usage = metrics.get("node_usage") if isinstance(metrics.get("node_usage"), dict) else {}
+    per_node: dict[str, dict[str, Any]] = {}
+    for metric_name, entries in usage.items() if isinstance(usage, dict) else []:
+        if not isinstance(entries, list):
+            continue
+        for entry in entries:
+            if not isinstance(entry, dict):
+                continue
+            node = entry.get("node")
+            if not isinstance(node, str) or not node:
+                continue
+            per_node.setdefault(node, {})[metric_name] = entry.get("value")
+    return [{"node": node, **vals} for node, vals in sorted(per_node.items())]
+
+
+def _workloads_for_facts(workloads: list[dict[str, Any]], limit: int = 25) -> list[dict[str, Any]]:
+    cleaned: list[dict[str, Any]] = []
+    for entry in workloads:
+        if not isinstance(entry, dict):
+            continue
+        cleaned.append(
+            {
+                "namespace": entry.get("namespace"),
+                "workload": entry.get("workload"),
+                "pods_total": entry.get("pods_total"),
+                "pods_running": entry.get("pods_running"),
+                "primary_node": entry.get("primary_node"),
+                "nodes": entry.get("nodes"),
+            }
+        )
+    cleaned.sort(
+        key=lambda item: (
+            -(item.get("pods_total") or 0),
+            str(item.get("namespace") or ""),
+            str(item.get("workload") or ""),
+        )
+    )
+    return cleaned[:limit]
+
+
+def facts_context(
+    prompt: str,
+    *,
+    inventory: list[dict[str, Any]] | None,
+    snapshot: dict[str, Any] | None,
+    workloads: list[dict[str, Any]] | None,
+) -> str:
+    inv = inventory or []
+    metrics = _snapshot_metrics(snapshot)
+    nodes = snapshot.get("nodes") if isinstance(snapshot, dict) else {}
+    summary = snapshot.get("nodes_summary") if isinstance(snapshot, dict) else {}
+    expected_workers = expected_worker_nodes_from_metrics()
+    ready_workers, not_ready_workers = worker_nodes_status(inv) if inv else ([], [])
+
+    facts: dict[str, Any] = {
+        "generated_at": snapshot.get("generated_at") if isinstance(snapshot, dict) else None,
+        "nodes": {
+            "total": summary.get("total") if isinstance(summary, dict) and summary.get("total") is not None else nodes.get("total"),
+            "ready": summary.get("ready") if isinstance(summary, dict) and summary.get("ready") is not None else nodes.get("ready"),
+            "not_ready": summary.get("not_ready") if isinstance(summary, dict) and summary.get("not_ready") is not None else nodes.get("not_ready"),
+            "not_ready_names": summary.get("not_ready_names") if isinstance(summary, dict) else nodes.get("not_ready_names"),
+            "by_hardware": _group_nodes(inv) if inv else {},
+            "by_arch": _nodes_by_arch(inv) if inv else {},
+            "workers_ready": ready_workers,
+            "workers_not_ready": not_ready_workers,
+            "expected_workers": expected_workers,
+        },
+        "metrics": {
+            "hottest_nodes": metrics.get("hottest_nodes") if isinstance(metrics, dict) else {},
+            "postgres_connections": metrics.get("postgres_connections") if isinstance(metrics, dict) else {},
+            "node_usage": _node_usage_table(metrics) if isinstance(metrics, dict) else [],
+        },
+        "workloads": _workloads_for_facts(workloads or []),
+    }
+
+    rendered = json.dumps(facts, ensure_ascii=False)
+    if len(rendered) <= MAX_FACTS_CHARS:
+        return "Facts (live snapshot):\n" + rendered
+
+    trimmed = dict(facts)
+    trimmed.pop("workloads", None)
+    rendered = json.dumps(trimmed, ensure_ascii=False)
+    if len(rendered) <= MAX_FACTS_CHARS:
+        return "Facts (live snapshot):\n" + rendered
+
+    trimmed_metrics = dict(trimmed.get("metrics") or {})
+    trimmed_metrics.pop("node_usage", None)
+    trimmed["metrics"] = trimmed_metrics
+    rendered = json.dumps(trimmed, ensure_ascii=False)
+    if len(rendered) <= MAX_FACTS_CHARS:
+        return "Facts (live snapshot):\n" + rendered
+
+    return "Facts (live snapshot):\n" + rendered[:MAX_FACTS_CHARS]
+
+
 def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]:
     names = [node["name"] for node in inventory]
     ready = [node["name"] for node in inventory if node.get("ready") is True]
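facts_context stays inside MAX_FACTS_CHARS by dropping the largest optional sections in a fixed order before hard-truncating. A self-contained sketch of the same drop-until-it-fits pattern, with toy data and a toy budget standing in for the real snapshot:

import json

MAX_CHARS = 200  # toy budget standing in for MAX_FACTS_CHARS

facts = {
    "nodes": {"total": 24, "ready": 23},
    "metrics": {"hottest": {"cpu": "titan-22"},
                "node_usage": [{"node": f"titan-{i}", "cpu": 0.5} for i in range(20)]},
    "workloads": [{"workload": f"app-{i}", "pods_total": i} for i in range(20)],
}

rendered = json.dumps(facts, ensure_ascii=False)
if len(rendered) > MAX_CHARS:  # first trim: drop the workload table
    facts = {k: v for k, v in facts.items() if k != "workloads"}
    rendered = json.dumps(facts, ensure_ascii=False)
if len(rendered) > MAX_CHARS:  # second trim: drop per-node usage
    facts["metrics"] = {k: v for k, v in facts["metrics"].items() if k != "node_usage"}
    rendered = json.dumps(facts, ensure_ascii=False)
print("Facts (live snapshot):\n" + rendered[:MAX_CHARS])  # truncate as last resort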
@@ -1463,26 +1567,19 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
         snapshot = _snapshot_state()
         inventory = _snapshot_inventory(snapshot) or node_inventory_live()
         workloads = _snapshot_workloads(snapshot)
-        answer = structured_answer(
+        context = build_context(
             prompt,
+            allow_tools=False,
+            targets=[],
             inventory=inventory,
-            metrics_summary="",
             snapshot=snapshot,
             workloads=workloads,
         )
-        if not answer and _knowledge_intent(prompt):
-            answer = knowledge_summary(prompt, inventory)
-        if not answer:
-            kb = kb_retrieve_titles(prompt, limit=4)
-            context = build_context(
-                prompt,
-                allow_tools=False,
-                targets=[],
-                inventory=inventory,
-                snapshot=snapshot,
-            )
-            fallback = kb or "I don't have enough data to answer that."
-            answer = ollama_reply(("http", "internal"), prompt, context=context, fallback=fallback)
+        metrics_context, metrics_fallback = metrics_query_context(prompt, allow_tools=True)
+        if metrics_context:
+            context = (context + "\n\n" + metrics_context).strip() if context else metrics_context
+        fallback = metrics_fallback or _context_fallback(context) or "I don't have enough data to answer that."
+        answer = ollama_reply(("http", "internal"), prompt, context=context, fallback=fallback)
         self._write_json(200, {"answer": answer})
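The internal HTTP handler now takes a single path: build the full context, merge in any metrics context, and chain fallbacks with `or`, which works because empty strings are falsy. A tiny sketch of that chaining, with made-up values:

def pick_fallback(metrics_fallback: str, context_fallback: str) -> str:
    # Empty strings are falsy, so `or` skips blank candidates in order.
    return metrics_fallback or context_fallback or "I don't have enough data to answer that."

print(pick_fallback("", ""))             # the default message
print(pick_fallback("", "23/24 ready"))  # "23/24 ready"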
@@ -1505,10 +1602,13 @@ def build_context(
     targets: list[tuple[str, str]],
     inventory: list[dict[str, Any]] | None = None,
     snapshot: dict[str, Any] | None = None,
+    workloads: list[dict[str, Any]] | None = None,
 ) -> str:
     parts: list[str] = []

     kb = kb_retrieve(prompt)
+    if not kb and _knowledge_intent(prompt):
+        kb = kb_retrieve_titles(prompt, limit=4)
     if kb:
         parts.append(kb)
@@ -1516,13 +1616,9 @@ def build_context(
     if endpoints:
         parts.append(endpoints)

-    node_ctx = node_inventory_context(prompt, inventory)
-    if node_ctx:
-        parts.append(node_ctx)
-
-    snapshot_ctx = snapshot_context(prompt, snapshot)
-    if snapshot_ctx:
-        parts.append(snapshot_ctx)
+    facts = facts_context(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads)
+    if facts:
+        parts.append(facts)

     if allow_tools:
         # Scope pod summaries to relevant namespaces/workloads when possible.
@@ -1627,7 +1723,9 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str:
     system = (
         "System: You are Atlas, the Titan lab assistant for Atlas/Othrys. "
         "Be helpful, direct, and concise. "
-        "Prefer answering with exact repo paths and Kubernetes resource names. "
+        "Use the provided context and facts as your source of truth. "
+        "If you infer or synthesize, say 'Based on the snapshot' and keep it brief. "
+        "Prefer exact repo paths and Kubernetes resource names when relevant. "
         "Never include or request secret values. "
         "Do not suggest commands unless explicitly asked. "
         "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. "
@@ -1646,21 +1744,32 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str:
     if API_KEY:
         headers["x-api-key"] = API_KEY
     r = request.Request(OLLAMA_URL, data=json.dumps(payload).encode(), headers=headers)
-    with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp:
-        data = json.loads(resp.read().decode())
-    raw_reply = data.get("message") or data.get("response") or data.get("reply") or data
-    reply = _normalize_reply(raw_reply) or "I'm here to help."
-    history[hist_key].append(f"Atlas: {reply}")
-    return reply
+    lock = _OLLAMA_LOCK if OLLAMA_SERIALIZE else None
+    if lock:
+        lock.acquire()
+    try:
+        with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp:
+            data = json.loads(resp.read().decode())
+        raw_reply = data.get("message") or data.get("response") or data.get("reply") or data
+        reply = _normalize_reply(raw_reply) or "I'm here to help."
+        history[hist_key].append(f"Atlas: {reply}")
+        return reply
+    finally:
+        if lock:
+            lock.release()


 def ollama_reply(hist_key, prompt: str, *, context: str, fallback: str = "") -> str:
-    try:
-        return _ollama_call(hist_key, prompt, context=context)
-    except Exception:
-        if fallback:
-            history[hist_key].append(f"Atlas: {fallback}")
-            return fallback
-        return "Model backend is busy. Try again in a moment."
+    last_error = None
+    for attempt in range(max(1, OLLAMA_RETRIES + 1)):
+        try:
+            return _ollama_call(hist_key, prompt, context=context)
+        except Exception as exc:  # noqa: BLE001
+            last_error = exc
+            time.sleep(min(4, 2 ** attempt))
+    if fallback:
+        history[hist_key].append(f"Atlas: {fallback}")
+        return fallback
+    return "I don't have enough data to answer that."


 def ollama_reply_with_thinking(token: str, room: str, hist_key, prompt: str, *, context: str, fallback: str) -> str:
     result: dict[str, str] = {"reply": ""}
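ollama_reply now makes OLLAMA_RETRIES + 1 attempts with a capped exponential backoff (1s, 2s, then 4s), and _ollama_call serializes in-flight requests through _OLLAMA_LOCK. A self-contained sketch of the combined shape, with a stub in place of the HTTP call; the diff itself uses acquire/release in try/finally because its lock is conditional on OLLAMA_SERIALIZE:

import threading
import time

OLLAMA_RETRIES = 2
_LOCK = threading.Lock()

def flaky_call() -> str:
    raise TimeoutError("backend busy")  # stub for the real Ollama HTTP request

def reply_with_retries(fallback: str = "") -> str:
    for attempt in range(max(1, OLLAMA_RETRIES + 1)):
        try:
            with _LOCK:  # one in-flight model call at a time
                return flaky_call()
        except Exception:
            time.sleep(min(4, 2 ** attempt))  # 1s, 2s, then capped at 4s
    return fallback or "I don't have enough data to answer that."

print(reply_with_retries())  # prints the default message after ~7s of retries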
@@ -1774,6 +1883,7 @@ def sync_loop(token: str, room_id: str):
             targets=targets,
             inventory=inventory,
             snapshot=snapshot,
+            workloads=workloads,
         )
         if allow_tools and promql:
             res = vm_query(promql, timeout=20)
@@ -1784,26 +1894,7 @@ def sync_loop(token: str, room_id: str):
         if metrics_context:
             context = (context + "\n\n" + metrics_context).strip() if context else metrics_context

-        fallback = metrics_fallback or ""
-        if not fallback and context:
-            fallback = _context_fallback(context)
-
-        structured = structured_answer(
-            body,
-            inventory=inventory,
-            metrics_summary=metrics_fallback or "",
-            snapshot=snapshot,
-            workloads=workloads,
-        )
-        if structured:
-            send_msg(token, rid, structured)
-            continue
-
-        if _knowledge_intent(body):
-            summary = knowledge_summary(body, inventory)
-            if summary:
-                send_msg(token, rid, summary)
-                continue
+        fallback = metrics_fallback or _context_fallback(context) or "I don't have enough data to answer that."

         reply = ollama_reply_with_thinking(
             token,