atlasbot: strengthen facts context and replies
parent c0dd00c93d
commit a442ea6d5d
@@ -34,6 +34,7 @@ SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev")
MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500"))
MAX_TOOL_CHARS = int(os.environ.get("ATLASBOT_MAX_TOOL_CHARS", "2500"))
MAX_FACTS_CHARS = int(os.environ.get("ATLASBOT_MAX_FACTS_CHARS", "8000"))
MAX_CONTEXT_CHARS = int(os.environ.get("ATLASBOT_MAX_CONTEXT_CHARS", "12000"))
THINKING_INTERVAL_SEC = int(os.environ.get("ATLASBOT_THINKING_INTERVAL_SEC", "120"))
OLLAMA_RETRIES = int(os.environ.get("ATLASBOT_OLLAMA_RETRIES", "2"))
OLLAMA_SERIALIZE = os.environ.get("ATLASBOT_OLLAMA_SERIALIZE", "true").lower() != "false"
@@ -100,6 +101,7 @@ CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL)
TITAN_NODE_RE = re.compile(r"\btitan-[0-9a-z]{2}\b", re.IGNORECASE)
TITAN_RANGE_RE = re.compile(r"\btitan-([0-9a-z]{2})/([0-9a-z]{2})\b", re.IGNORECASE)
_DASH_CHARS = "\u2010\u2011\u2012\u2013\u2014\u2015\u2212\uFE63\uFF0D"
CONFIDENCE_RE = re.compile(r"confidence\s*:\s*(high|medium|low)\b", re.IGNORECASE)

OPERATION_HINTS = {
    "count": ("how many", "count", "number", "total"),
@@ -139,6 +141,20 @@ def _tokens(text: str) -> list[str]:
    return [t for t in toks if t not in STOPWORDS and len(t) >= 2]


def _ensure_confidence(text: str) -> str:
    if not text:
        return ""
    lines = text.strip().splitlines()
    for idx, line in enumerate(lines):
        match = CONFIDENCE_RE.search(line)
        if match:
            level = match.group(1).lower()
            lines[idx] = f"Confidence: {level}"
            return "\n".join(lines)
    lines.append("Confidence: medium")
    return "\n".join(lines)


# Mention detection (Matrix rich mentions + plain @atlas).
MENTION_TOKENS = [m.strip() for m in BOT_MENTIONS.split(",") if m.strip()]
MENTION_LOCALPARTS = [m.lstrip("@").split(":", 1)[0] for m in MENTION_TOKENS]
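
For reference, the new _ensure_confidence helper normalizes an existing confidence line (any casing or spacing accepted by CONFIDENCE_RE) and appends a default "Confidence: medium" when the model omitted one. A quick illustrative check, not part of the commit:

assert _ensure_confidence("All nodes are Ready.\nconfidence: HIGH") == "All nodes are Ready.\nConfidence: high"
assert _ensure_confidence("All nodes are Ready.") == "All nodes are Ready.\nConfidence: medium"
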
@@ -710,6 +726,7 @@ def facts_context(
    workloads: list[dict[str, Any]] | None,
) -> str:
    inv = inventory or []
    nodes_in_query = _extract_titan_nodes(prompt)
    metrics = _snapshot_metrics(snapshot)
    nodes = snapshot.get("nodes") if isinstance(snapshot, dict) else {}
    summary = snapshot.get("nodes_summary") if isinstance(snapshot, dict) else {}
@@ -721,6 +738,12 @@ def facts_context(
    not_ready_names = summary.get("not_ready_names") if isinstance(summary, dict) else nodes.get("not_ready_names")
    by_hardware = _group_nodes(inv) if inv else {}
    by_arch = _nodes_by_arch(inv) if inv else {}
    control_plane_nodes = [
        node["name"]
        for node in inv
        if any(role in ("control-plane", "master") for role in (node.get("roles") or []))
    ]
    worker_nodes = [node["name"] for node in inv if node.get("is_worker") is True]

    lines: list[str] = ["Facts (live snapshot):"]
    if total is not None:
@@ -731,9 +754,16 @@ def facts_context(
        nodes_list = by_hardware.get(key) or []
        if nodes_list:
            lines.append(f"- {key}: {', '.join(nodes_list)}")
    non_rpi = sorted(set(by_hardware.get("jetson", [])) | set(by_hardware.get("amd64", [])))
    if non_rpi:
        lines.append(f"- non_raspberry_pi: {', '.join(non_rpi)}")
    for key, nodes_list in sorted(by_arch.items()):
        if nodes_list:
            lines.append(f"- arch {key}: {', '.join(nodes_list)}")
    if control_plane_nodes:
        lines.append(f"- control_plane_nodes: {', '.join(control_plane_nodes)}")
    if worker_nodes:
        lines.append(f"- worker_nodes: {', '.join(worker_nodes)}")
    if ready_workers or not_ready_workers:
        lines.append(f"- workers_ready: {', '.join(ready_workers) if ready_workers else 'none'}")
    if not_ready_workers:
@@ -753,7 +783,8 @@ def facts_context(
        node = entry.get("node")
        value = entry.get("value")
        if node and value is not None:
            lines.append(f"- hottest_{key}: {node} ({value})")
            value_fmt = _format_metric_value(str(value), percent=key in ("cpu", "ram"))
            lines.append(f"- hottest_{key}: {node} ({value_fmt})")

    postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {}
    if isinstance(postgres, dict) and postgres:
@@ -774,12 +805,25 @@ def facts_context(
            node = entry.get("node")
            if not node:
                continue
            cpu = entry.get("cpu")
            ram = entry.get("ram")
            net = entry.get("net")
            io_val = entry.get("io")
            cpu = _format_metric_value(str(entry.get("cpu")), percent=True) if entry.get("cpu") is not None else ""
            ram = _format_metric_value(str(entry.get("ram")), percent=True) if entry.get("ram") is not None else ""
            net = _format_metric_value(str(entry.get("net")), percent=False) if entry.get("net") is not None else ""
            io_val = _format_metric_value(str(entry.get("io")), percent=False) if entry.get("io") is not None else ""
            lines.append(f" - {node}: cpu={cpu}, ram={ram}, net={net}, io={io_val}")

    if nodes_in_query:
        lines.append("- node_details:")
        for name in nodes_in_query:
            detail = next((n for n in inv if n.get("name") == name), None)
            if not detail:
                lines.append(f" - {name}: not found in snapshot")
                continue
            roles = ",".join(detail.get("roles") or []) or "none"
            lines.append(
                f" - {name}: hardware={detail.get('hardware')}, arch={detail.get('arch')}, "
                f"ready={detail.get('ready')}, roles={roles}"
            )

    workload_entries = _workloads_for_prompt(prompt, workloads or [])
    if workload_entries:
        lines.append("- workloads:")
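
_format_metric_value itself is not shown in this diff; judging only from the call sites above, it takes a stringified metric plus a percent flag and returns a display string. A rough sketch of what such a helper might look like, as an assumption for illustration rather than the actual implementation:

def _format_metric_value_sketch(value: str, *, percent: bool) -> str:
    # Hypothetical stand-in: round numeric values and add a % suffix for percentages.
    try:
        num = float(value)
    except ValueError:
        return value  # leave non-numeric values untouched
    return f"{num:.1f}%" if percent else f"{num:.1f}"
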
@@ -1181,11 +1225,10 @@ def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]:
            if rendered:
                rendered_parts.append(rendered)
        if not rendered_parts:
            return "", f"{panel}: matched dashboard panel but VictoriaMetrics did not return data."
            return "", ""
        summary = "\n".join(rendered_parts)
        context = f"Metrics (from {dashboard} / {panel}):\n{summary}"
        fallback = _metrics_fallback_summary(panel, summary)
        return context, fallback
        return context, ""


def catalog_hints(query: str) -> tuple[str, list[tuple[str, str]]]:
    q = (query or "").strip()
@@ -1574,8 +1617,8 @@ def _normalize_reply(value: Any) -> str:
        try:
            return _normalize_reply(json.loads(text))
        except Exception:
            return text
    return text
            return _ensure_confidence(text)
    return _ensure_confidence(text)


# Internal HTTP endpoint for cluster answers (website uses this).
@@ -1634,10 +1677,10 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
            snapshot=snapshot,
            workloads=workloads,
        )
        metrics_context, metrics_fallback = metrics_query_context(prompt, allow_tools=True)
        metrics_context, _metrics_fallback = metrics_query_context(prompt, allow_tools=True)
        if metrics_context:
            context = (context + "\n\n" + metrics_context).strip() if context else metrics_context
        fallback = metrics_fallback or _context_fallback(context) or "I don't have enough data to answer that."
        fallback = "I don't have enough data to answer that."
        answer = ollama_reply(("http", "internal"), prompt, context=context, fallback=fallback)
        self._write_json(200, {"answer": answer})

@@ -1665,19 +1708,19 @@ def build_context(
) -> str:
    parts: list[str] = []

    kb = kb_retrieve(prompt)
    if not kb and _knowledge_intent(prompt):
        kb = kb_retrieve_titles(prompt, limit=4)
    if kb:
        parts.append(kb)
    facts = facts_context(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads)
    if facts:
        parts.append(facts)

    endpoints, edges = catalog_hints(prompt)
    if endpoints:
        parts.append(endpoints)

    facts = facts_context(prompt, inventory=inventory, snapshot=snapshot, workloads=workloads)
    if facts:
        parts.append(facts)
    kb = kb_retrieve(prompt)
    if not kb and _knowledge_intent(prompt):
        kb = kb_retrieve_titles(prompt, limit=4)
    if kb:
        parts.append(kb)

    if allow_tools:
        # Scope pod summaries to relevant namespaces/workloads when possible.
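
The reordering above makes the live facts lead the grounded context, with catalog endpoints next and knowledge-base excerpts last; tool output (the allow_tools branch) still follows all of these. Roughly, assuming the parts are later joined with blank-line separators (the join itself is outside this hunk):

# Illustrative only: effective ordering of a fully populated context.
parts = [facts, endpoints, kb]                 # facts first, KB last
context = "\n\n".join(p for p in parts if p)   # assumed joiner, not shown in the diff
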
@@ -1789,12 +1832,14 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str:
        "Never include or request secret values. "
        "Do not suggest commands unless explicitly asked. "
        "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. "
        "Translate metrics into natural language instead of echoing raw label/value pairs. "
        "Do not answer by only listing runbooks; summarize the cluster first and mention docs only if useful. "
        "If the answer is not grounded in the provided context or tool data, say you do not know. "
        "End every response with a line: 'Confidence: high|medium|low'."
    )
    transcript_parts = [system]
    if context:
        transcript_parts.append("Context (grounded):\n" + context[:MAX_KB_CHARS])
        transcript_parts.append("Context (grounded):\n" + context[:MAX_CONTEXT_CHARS])
    transcript_parts.extend(history[hist_key][-24:])
    transcript_parts.append(f"User: {prompt}")
    transcript = "\n".join(transcript_parts)
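
Switching the slice from MAX_KB_CHARS to MAX_CONTEXT_CHARS raises the cap on the grounded block from 2,500 to 12,000 characters (using the defaults at the top of the file), so a full facts section now fits without truncation. A quick sanity check against those defaults:

assert MAX_FACTS_CHARS <= MAX_CONTEXT_CHARS  # 8000 <= 12000: facts fit under the new cap
assert MAX_FACTS_CHARS > MAX_KB_CHARS        # 8000 > 2500: they did not fit under the old cap
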
@@ -1950,11 +1995,11 @@ def sync_loop(token: str, room_id: str):
                rendered = vm_render_result(res, limit=15) or "(no results)"
                extra = "VictoriaMetrics (PromQL result):\n" + rendered
                context = (context + "\n\n" + extra).strip() if context else extra
            metrics_context, metrics_fallback = metrics_query_context(body, allow_tools=allow_metrics)
            metrics_context, _metrics_fallback = metrics_query_context(body, allow_tools=allow_metrics)
            if metrics_context:
                context = (context + "\n\n" + metrics_context).strip() if context else metrics_context

            fallback = metrics_fallback or _context_fallback(context) or "I don't have enough data to answer that."
            fallback = "I don't have enough data to answer that."

            reply = ollama_reply_with_thinking(
                token,