atlasbot: answer cluster queries without llm

This commit is contained in:
Brad Stein 2026-01-27 15:30:43 -03:00
parent 241a8889ee
commit b7792d30f1
2 changed files with 263 additions and 23 deletions

View File

@ -16,7 +16,7 @@ spec:
labels: labels:
app: atlasbot app: atlasbot
annotations: annotations:
checksum/atlasbot-configmap: manual-atlasbot-47 checksum/atlasbot-configmap: manual-atlasbot-48
vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "comms" vault.hashicorp.com/role: "comms"
vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

View File

@ -532,7 +532,7 @@ def _detect_role_filters(q: str) -> set[str]:
return roles return roles
def _detect_entity(q: str) -> str | None: def _detect_entity(q: str) -> str | None:
if "node" in q or "nodes" in q or "worker" in q or TITAN_NODE_RE.search(q): if "node" in q or "nodes" in q or "worker" in q or "hardware" in q or "architecture" in q or TITAN_NODE_RE.search(q):
return "node" return "node"
if "pod" in q or "pods" in q: if "pod" in q or "pods" in q:
return "pod" return "pod"
@ -1152,6 +1152,15 @@ def snapshot_metric_answer(
if include_hw: if include_hw:
scope = f" among {' and '.join(sorted(include_hw))}" scope = f" among {' and '.join(sorted(include_hw))}"
answer = f"Hottest node{scope}: {node} ({value})." answer = f"Hottest node{scope}: {node} ({value})."
if allowed_nodes and len(allowed_nodes) != len(inventory):
overall = _node_usage_top(usage, allowed_nodes=None)
if overall and overall[0] != node:
overall_val = _format_metric_value(
str(overall[1]),
percent=percent,
rate=metric in {"net", "io"},
)
answer += f" Overall hottest: {overall[0]} ({overall_val})."
return _format_confidence(answer, "high") return _format_confidence(answer, "high")
if metric == "connections" or "postgres" in q: if metric == "connections" or "postgres" in q:
@ -1358,6 +1367,219 @@ def structured_answer(
return "" return ""
def _nodes_summary_line(inventory: list[dict[str, Any]], snapshot: dict[str, Any] | None) -> str:
summary = snapshot.get("nodes_summary") if isinstance(snapshot, dict) else {}
nodes = snapshot.get("nodes") if isinstance(snapshot, dict) else {}
total = summary.get("total") if isinstance(summary, dict) and summary.get("total") is not None else nodes.get("total")
ready = summary.get("ready") if isinstance(summary, dict) and summary.get("ready") is not None else nodes.get("ready")
not_ready = summary.get("not_ready") if isinstance(summary, dict) and summary.get("not_ready") is not None else nodes.get("not_ready")
if total is None:
total = len(inventory)
ready = len([n for n in inventory if n.get("ready") is True])
not_ready = len([n for n in inventory if n.get("ready") is False])
if total is None:
return ""
return f"Atlas cluster has {total} nodes ({ready} ready, {not_ready} not ready)."
def _hardware_mix_line(inventory: list[dict[str, Any]]) -> str:
if not inventory:
return ""
groups = _group_nodes(inventory)
parts: list[str] = []
for key in ("rpi5", "rpi4", "jetson", "amd64", "arm64-unknown", "unknown"):
nodes = groups.get(key) or []
if nodes:
parts.append(f"{key}={len(nodes)}")
if not parts:
return ""
return "Hardware mix: " + ", ".join(parts) + "."
def _os_mix_line(snapshot: dict[str, Any] | None) -> str:
if not snapshot:
return ""
details = snapshot.get("nodes_detail") if isinstance(snapshot.get("nodes_detail"), list) else []
counts: dict[str, int] = collections.Counter()
for node in details:
if not isinstance(node, dict):
continue
os_name = (node.get("os") or "").strip()
if os_name:
counts[os_name] += 1
if not counts:
return ""
parts = [f"{os_name}={count}" for os_name, count in sorted(counts.items(), key=lambda item: (-item[1], item[0]))]
return "OS mix: " + ", ".join(parts[:5]) + "."
def _pods_summary_line(metrics: dict[str, Any]) -> str:
if not metrics:
return ""
running = metrics.get("pods_running")
pending = metrics.get("pods_pending")
failed = metrics.get("pods_failed")
succeeded = metrics.get("pods_succeeded")
parts: list[str] = []
if running is not None:
parts.append(f"{running:.0f} running")
if pending is not None:
parts.append(f"{pending:.0f} pending")
if failed is not None:
parts.append(f"{failed:.0f} failed")
if succeeded is not None:
parts.append(f"{succeeded:.0f} succeeded")
if not parts:
return ""
return "Pods: " + ", ".join(parts) + "."
def _postgres_summary_line(metrics: dict[str, Any]) -> str:
if not metrics:
return ""
postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {}
if not postgres:
return ""
used = postgres.get("used")
max_conn = postgres.get("max")
hottest = postgres.get("hottest_db") if isinstance(postgres.get("hottest_db"), dict) else {}
parts: list[str] = []
if used is not None and max_conn is not None:
parts.append(f"{used:.0f}/{max_conn:.0f} connections")
if hottest.get("label"):
hot_val = hottest.get("value")
hot_val_str = _format_metric_value(str(hot_val), percent=False) if hot_val is not None else ""
parts.append(f"hottest {hottest.get('label')} ({hot_val_str})")
if not parts:
return ""
return "Postgres: " + ", ".join(parts) + "."
def _hottest_summary_line(metrics: dict[str, Any]) -> str:
if not metrics:
return ""
hottest = metrics.get("hottest_nodes") if isinstance(metrics.get("hottest_nodes"), dict) else {}
if not hottest:
return ""
parts: list[str] = []
for key in ("cpu", "ram", "net", "io"):
entry = hottest.get(key) if isinstance(hottest.get(key), dict) else {}
node = entry.get("node")
value = entry.get("value")
if node and value is not None:
value_fmt = _format_metric_value(
str(value),
percent=key in ("cpu", "ram"),
rate=key in ("net", "io"),
)
parts.append(f"{key.upper()} {node} ({value_fmt})")
if not parts:
return ""
return "Hottest nodes: " + "; ".join(parts) + "."
def cluster_overview_answer(
prompt: str,
*,
inventory: list[dict[str, Any]],
snapshot: dict[str, Any] | None,
) -> str:
if not inventory and not snapshot:
return ""
q = normalize_query(prompt)
metrics = _snapshot_metrics(snapshot)
lines: list[str] = []
nodes_line = _nodes_summary_line(inventory, snapshot)
if nodes_line:
lines.append(nodes_line)
if any(word in q for word in ("hardware", "architecture", "nodes", "node", "cluster", "atlas", "titan", "lab")):
hw_line = _hardware_mix_line(inventory)
if hw_line:
lines.append(hw_line)
os_line = _os_mix_line(snapshot)
if os_line:
lines.append(os_line)
if any(
word in q
for word in (
"interesting",
"status",
"health",
"overview",
"summary",
"tell me",
"what do you know",
"about",
"pods",
"postgres",
"connections",
"hottest",
"cpu",
"ram",
"memory",
"net",
"network",
"io",
"disk",
"busy",
"load",
"usage",
"utilization",
)
):
pods_line = _pods_summary_line(metrics)
if pods_line:
lines.append(pods_line)
hottest_line = _hottest_summary_line(metrics)
if hottest_line:
lines.append(hottest_line)
postgres_line = _postgres_summary_line(metrics)
if postgres_line:
lines.append(postgres_line)
if not lines:
return ""
return "Based on the snapshot, " + "\n".join(lines)
def cluster_answer(
    prompt: str,
    *,
    inventory: list[dict[str, Any]],
    snapshot: dict[str, Any] | None,
    workloads: list[dict[str, Any]] | None,
) -> str:
    """Answer a cluster question without the LLM, trying sources in order.

    Order of preference: a structured answer; the snapshot overview (plus KB
    titles when the query has knowledge intent); KB titles alone; the raw
    metrics summary. Returns ``""`` when none of these produced anything.
    """
    metrics_summary = snapshot_context(prompt, snapshot)
    direct = structured_answer(
        prompt,
        inventory=inventory,
        metrics_summary=metrics_summary,
        snapshot=snapshot,
        workloads=workloads,
    )
    if direct:
        return direct
    overview = cluster_overview_answer(prompt, inventory=inventory, snapshot=snapshot)
    if overview:
        titles = kb_retrieve_titles(prompt, limit=4) if _knowledge_intent(prompt) else ""
        body = overview + "\n" + titles if titles else overview
        return _format_confidence(body, "medium")
    titles = kb_retrieve_titles(prompt, limit=4)
    if titles:
        return _format_confidence(titles, "low")
    if metrics_summary:
        return _format_confidence(metrics_summary, "low")
    return ""
def _metric_tokens(entry: dict[str, Any]) -> str: def _metric_tokens(entry: dict[str, Any]) -> str:
parts: list[str] = [] parts: list[str] = []
for key in ("panel_title", "dashboard", "description"): for key in ("panel_title", "dashboard", "description"):
@ -1868,16 +2090,24 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
workloads=workloads, workloads=workloads,
) )
fallback = "I don't have enough data to answer that." fallback = "I don't have enough data to answer that."
llm_prompt = cleaned
if cluster_query: if cluster_query:
llm_prompt = f"Atlas cluster question (use the cluster snapshot context): {cleaned}" answer = cluster_answer(
answer = ollama_reply( cleaned,
("http", "internal"), inventory=inventory,
llm_prompt, snapshot=snapshot,
context=context, workloads=workloads,
fallback=fallback, )
use_history=False, if not answer:
) answer = fallback
else:
llm_prompt = cleaned
answer = ollama_reply(
("http", "internal"),
llm_prompt,
context=context,
fallback=fallback,
use_history=False,
)
self._write_json(200, {"answer": answer}) self._write_json(200, {"answer": answer})
@ -2044,6 +2274,7 @@ def _knowledge_intent(prompt: str) -> bool:
for phrase in ( for phrase in (
"what do you know", "what do you know",
"tell me about", "tell me about",
"interesting",
"overview", "overview",
"summary", "summary",
"describe", "describe",
@ -2312,21 +2543,30 @@ def sync_loop(token: str, room_id: str):
res = vm_query(promql, timeout=20) res = vm_query(promql, timeout=20)
rendered = vm_render_result(res, limit=15) or "(no results)" rendered = vm_render_result(res, limit=15) or "(no results)"
extra = "VictoriaMetrics (PromQL result):\n" + rendered extra = "VictoriaMetrics (PromQL result):\n" + rendered
context = (context + "\n\n" + extra).strip() if context else extra send_msg(token, rid, extra)
continue
fallback = "I don't have enough data to answer that." fallback = "I don't have enough data to answer that."
llm_prompt = cleaned_body
if cluster_query: if cluster_query:
llm_prompt = f"Atlas cluster question (use the cluster snapshot context): {cleaned_body}" reply = cluster_answer(
reply = ollama_reply_with_thinking( cleaned_body,
token, inventory=inventory,
rid, snapshot=snapshot,
hist_key, workloads=workloads,
llm_prompt, )
context=context, if not reply:
fallback=fallback, reply = fallback
use_history=cluster_query, else:
) llm_prompt = cleaned_body
reply = ollama_reply_with_thinking(
token,
rid,
hist_key,
llm_prompt,
context=context,
fallback=fallback,
use_history=False,
)
send_msg(token, rid, reply) send_msg(token, rid, reply)
def login_with_retry(): def login_with_retry():