atlasbot: refine cluster intent handling

2026-01-27 22:44:49 -03:00 · 2026-01-27 22:44:49 -03:00 · 23533e08ee
commit 23533e08ee
parent fc10eed704
1 changed file with 87 additions and 5 deletions
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@ -152,6 +152,16 @@ CLUSTER_HINT_WORDS = {
    "deployment",
    "daemonset",
    "statefulset",
+    "snapshot",
+    "anomaly",
+    "anomalies",
+    "monitor",
+    "monitoring",
+    "runbook",
+    "runbooks",
+    "documentation",
+    "docs",
+    "playbook",
    "grafana",
    "victoria",
    "prometheus",
@ -203,6 +213,12 @@ _INSIGHT_HINT_WORDS = {
    "favorite",
    "favourite",
    "trivia",
+    "anomaly",
+    "anomalies",
+    "monitor",
+    "monitoring",
+    "alert",
+    "alerts",
    "stand out",
    "stands out",
 }
@ -532,7 +548,14 @@ def _humanize_rate(value: str, *, unit: str) -> str:
    return f"{val:.2f} B/s"

 def _has_any(text: str, phrases: tuple[str, ...]) -> bool:
-    return any(p in text for p in phrases)
+    """Return True if any phrase occurs in text; single words must match as whole words."""
+    for phrase in phrases:
+        # Multi-word phrases match as plain substrings; single words are wrapped in \b
+        # (one backslash inside the raw string) so "top" does not fire inside "stop".
+        pattern = re.escape(phrase) if " " in phrase else rf"\b{re.escape(phrase)}\b"
+        if re.search(pattern, text):
+            return True
+    return False

 def _detect_operation(q: str) -> str | None:
    if _has_any(q, OPERATION_HINTS["top"]):
@ -552,6 +575,8 @@ def _detect_metric(q: str) -> str | None:
            part = part.strip()
            if len(part) >= 2:
                expanded.add(part)
+            if part.endswith("s") and len(part) >= 4:
+                expanded.add(part[:-1])
    tokens = expanded
    for metric, phrases in METRIC_HINTS.items():
        for phrase in phrases:
@ -565,6 +590,8 @@ def _detect_metric(q: str) -> str | None:
 def _detect_hardware_filters(q: str) -> tuple[set[str], set[str]]:
    include: set[str] = set()
    exclude: set[str] = set()
+    if any(term in q for term in ("gpu", "gpus", "accelerator", "accelerators", "cuda", "nvidia")):
+        include.add("jetson")
    rpi_specific = any(
        phrase in q
        for phrase in (
@ -1287,6 +1314,10 @@ def snapshot_metric_answer(
        failed = metrics.get("pods_failed")
        succeeded = metrics.get("pods_succeeded")
        status_terms = ("running", "pending", "failed", "succeeded", "completed")
+        if "not running" in q or "not in running" in q or "non running" in q:
+            parts = [v for v in (pending, failed, succeeded) if isinstance(v, (int, float))]
+            if parts:
+                return _format_confidence(f"Pods not running: {sum(parts):.0f}.", "high")
        if sum(1 for term in status_terms if term in q) > 1:
            parts = []
            if running is not None:
@ -1350,6 +1381,8 @@ def structured_answer(
        op = "top"
    entity = _detect_entity(q)
    include_hw, exclude_hw = _detect_hardware_filters(q)
+    if entity is None and (include_hw or exclude_hw):
+        entity = "node"
    nodes_in_query = _extract_titan_nodes(q)
    only_workers = "worker" in q or "workers" in q
    role_filters = _detect_role_filters(q)
@ -1385,6 +1418,20 @@ def structured_answer(
        if hw_line:
            return _format_confidence(hw_line, "medium")

+    if (
+        entity == "node"
+        and any(term in q for term in ("arm64", "amd64"))
+        and any(term in q for term in ("mostly", "majority", "more"))
+    ):
+        arm64_count = len([n for n in inventory if n.get("arch") == "arm64"])
+        amd64_count = len([n for n in inventory if n.get("arch") == "amd64"])
+        if arm64_count or amd64_count:
+            majority = "arm64" if arm64_count >= amd64_count else "amd64"
+            return _format_confidence(
+                f"arm64 nodes: {arm64_count}, amd64 nodes: {amd64_count}. Mostly {majority}.",
+                "high",
+            )
+
    if op == "top" and metric is None and not any(word in q for word in ("hardware", "architecture", "class")):
        metric = "cpu"

@ -1491,6 +1538,27 @@ def structured_answer(
            )

    if op == "count":
+        if only_workers and "ready" in q and ("total" in q or "vs" in q or "versus" in q):
+            total_workers = _inventory_filter(
+                inventory,
+                include_hw=include_hw,
+                exclude_hw=exclude_hw,
+                only_workers=True,
+                only_ready=None,
+                nodes_in_query=nodes_in_query,
+            )
+            ready_workers = _inventory_filter(
+                inventory,
+                include_hw=include_hw,
+                exclude_hw=exclude_hw,
+                only_workers=True,
+                only_ready=True,
+                nodes_in_query=nodes_in_query,
+            )
+            return _format_confidence(
+                f"Worker nodes ready: {len(ready_workers)} / {len(total_workers)} total.",
+                "high",
+            )
        if expected_workers and ("expected" in q or "should" in q):
            missing = sorted(set(expected_workers) - {n["name"] for n in inventory})
            msg = f"Grafana inventory expects {len(expected_workers)} worker nodes."
@ -1711,6 +1779,15 @@ def _doc_intent(query: str) -> bool:
            "how to",
            "instructions",
            "playbook",
+            "next step",
+            "next steps",
+            "what should",
+            "what do i",
+            "what to do",
+            "troubleshoot",
+            "triage",
+            "recover",
+            "remediate",
        )
    )

@ -2615,10 +2692,13 @@ def _candidate_note(candidate: dict[str, Any]) -> str:
 def _ensure_scores(answer: str) -> str:
    text = answer.strip()
    lines = [line.strip() for line in text.splitlines() if line.strip()]
-    has_relevance = any(line.lower().startswith("relevance") for line in lines)
-    has_satisfaction = any(line.lower().startswith("satisfaction") for line in lines)
-    has_confidence = any(line.lower().startswith("confidence") for line in lines)
-    has_risk = any(line.lower().startswith("hallucinationrisk") for line in lines)
+    def _score_key(line: str) -> str:
+        cleaned = line.strip().lstrip("-•* ").strip()
+        return cleaned.lower()
+    has_relevance = any(_score_key(line).startswith("relevance") for line in lines)
+    has_satisfaction = any(_score_key(line).startswith("satisfaction") for line in lines)
+    has_confidence = any(_score_key(line).startswith("confidence") for line in lines)
+    has_risk = any(_score_key(line).startswith("hallucinationrisk") for line in lines)
    if not has_confidence:
        lines.append("Confidence: medium")
    if not has_relevance:
@ -3004,6 +3084,7 @@ class _AtlasbotHandler(BaseHTTPRequestHandler):
                _is_subjective_query(cleaned)
                or _knowledge_intent(cleaned)
                or _is_overview_query(cleaned)
+                or _doc_intent(cleaned)
            )
            if open_ended:
                answer = open_ended_answer(
@ -3558,6 +3639,7 @@ def sync_loop(token: str, room_id: str):
                        _is_subjective_query(cleaned_body)
                        or _knowledge_intent(cleaned_body)
                        or _is_overview_query(cleaned_body)
+                        or _doc_intent(cleaned_body)
                    )
                    if open_ended:
                        reply = open_ended_with_thinking(