atlasbot: improve node inventory reasoning

This commit is contained in:
Brad Stein 2026-01-26 19:53:11 -03:00
parent a61091c052
commit b27c80d5c0
2 changed files with 122 additions and 34 deletions

View File

@ -16,7 +16,7 @@ spec:
labels:
app: atlasbot
annotations:
checksum/atlasbot-configmap: manual-atlasbot-21
checksum/atlasbot-configmap: manual-atlasbot-22
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "comms"
vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"

View File

@ -89,9 +89,17 @@ METRIC_HINT_WORDS = {
"latency",
}
CODE_FENCE_RE = re.compile(r"^```(?:json)?\\s*(.*?)\\s*```$", re.DOTALL)
TITAN_NODE_RE = re.compile(r"\\btitan-[0-9a-z]{2}\\b", re.IGNORECASE)
TITAN_RANGE_RE = re.compile(r"\\btitan-([0-9a-z]{2})/([0-9a-z]{2})\\b", re.IGNORECASE)
CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL)
TITAN_NODE_RE = re.compile(r"\btitan-[0-9a-z]{2}\b", re.IGNORECASE)
TITAN_RANGE_RE = re.compile(r"\btitan-([0-9a-z]{2})/([0-9a-z]{2})\b", re.IGNORECASE)
# Unicode dash/hyphen code points that users may type instead of "-".
_DASH_CHARS = "\u2010\u2011\u2012\u2013\u2014\u2015\u2212\uFE63\uFF0D"

# Translation table folding every dash variant above to the ASCII hyphen.
_DASH_TRANSLATION = str.maketrans(_DASH_CHARS, "-" * len(_DASH_CHARS))


def normalize_query(text: str) -> str:
    """Canonicalize a free-form query string for keyword matching.

    Lowercases the text, folds Unicode dash variants to ``-``, and collapses
    every run of whitespace to a single space, trimming both ends. ``None``
    and empty input normalize to the empty string.
    """
    lowered = (text or "").lower()
    dashed = lowered.translate(_DASH_TRANSLATION)
    # str.split() with no argument splits on (and discards) any whitespace
    # run, so join+split collapses and trims in one pass.
    return " ".join(dashed.split())
def _tokens(text: str) -> list[str]:
toks = [t.lower() for t in TOKEN_RE.findall(text or "")]
@ -267,14 +275,15 @@ def kb_retrieve(query: str, *, limit: int = 3) -> str:
return "\n".join(parts).strip()
def _extract_titan_nodes(text: str) -> list[str]:
names = {n.lower() for n in TITAN_NODE_RE.findall(text or "") if n}
for match in re.finditer(r"titan-([0-9a-z]{2}(?:[/,][0-9a-z]{2})+)", text or "", re.IGNORECASE):
cleaned = normalize_query(text)
names = {n.lower() for n in TITAN_NODE_RE.findall(cleaned) if n}
for match in re.finditer(r"titan-([0-9a-z]{2}(?:[/,][0-9a-z]{2})+)", cleaned, re.IGNORECASE):
tail = match.group(1)
for part in re.split(r"[/,]", tail):
part = part.strip()
if part:
names.add(f"titan-{part.lower()}")
for match in TITAN_RANGE_RE.finditer(text or ""):
for match in TITAN_RANGE_RE.finditer(cleaned):
left, right = match.groups()
if left:
names.add(f"titan-{left.lower()}")
@ -323,6 +332,7 @@ def node_inventory_live() -> list[dict[str, Any]]:
"arch": labels.get("kubernetes.io/arch") or labels.get("beta.kubernetes.io/arch") or "",
"hardware": _hardware_class(labels),
"roles": _node_roles(labels),
"is_worker": _node_is_worker(node),
"ready": _node_ready_status(node),
}
)
@ -335,7 +345,7 @@ def _group_nodes(inventory: list[dict[str, Any]]) -> dict[str, list[str]]:
return {k: sorted(v) for k, v in grouped.items()}
def node_inventory_context(query: str, inventory: list[dict[str, Any]] | None = None) -> str:
q = (query or "").lower()
q = normalize_query(query)
if not any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster")):
return ""
if inventory is None:
@ -372,7 +382,7 @@ def node_inventory_context(query: str, inventory: list[dict[str, Any]] | None =
return "\n".join(lines)
def node_inventory_for_prompt(prompt: str) -> list[dict[str, Any]]:
q = (prompt or "").lower()
q = normalize_query(prompt)
if any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster", "worker")):
return node_inventory_live()
return []
@ -382,10 +392,14 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]:
ready = [node["name"] for node in inventory if node.get("ready") is True]
not_ready = [node["name"] for node in inventory if node.get("ready") is False]
groups = _group_nodes(inventory)
workers = [node for node in inventory if "worker" in (node.get("roles") or [])]
workers = [node for node in inventory if node.get("is_worker") is True]
worker_names = [node["name"] for node in workers]
worker_ready = [node["name"] for node in workers if node.get("ready") is True]
worker_not_ready = [node["name"] for node in workers if node.get("ready") is False]
expected_workers = expected_worker_nodes_from_metrics()
expected_ready = [n for n in expected_workers if n in ready] if expected_workers else []
expected_not_ready = [n for n in expected_workers if n in not_ready] if expected_workers else []
expected_missing = [n for n in expected_workers if n not in names] if expected_workers else []
return {
"names": sorted(names),
"ready": sorted(ready),
@ -394,10 +408,14 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]:
"worker_names": sorted(worker_names),
"worker_ready": sorted(worker_ready),
"worker_not_ready": sorted(worker_not_ready),
"expected_workers": expected_workers,
"expected_ready": sorted(expected_ready),
"expected_not_ready": sorted(expected_not_ready),
"expected_missing": sorted(expected_missing),
}
def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_summary: str) -> str:
q = (prompt or "").lower()
q = normalize_query(prompt)
if metrics_summary and any(word in q for word in ("postgres", "connection", "connections", "db")):
return metrics_summary
@ -412,29 +430,75 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s
worker_names = sets["worker_names"]
worker_ready = sets["worker_ready"]
worker_not_ready = sets["worker_not_ready"]
expected_workers = sets["expected_workers"]
expected_ready = sets["expected_ready"]
expected_not_ready = sets["expected_not_ready"]
expected_missing = sets["expected_missing"]
total = len(names)
nodes_in_query = _extract_titan_nodes(q)
rpi_nodes = set(groups.get("rpi4", [])) | set(groups.get("rpi5", []))
non_rpi = set(groups.get("jetson", [])) | set(groups.get("amd64", []))
unknown_hw = set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", []))
for node in _extract_titan_nodes(q):
if node and ("is" in q or "part of" in q or "in atlas" in q or "in cluster" in q):
if nodes_in_query and ("raspberry" in q or "rpi" in q):
parts: list[str] = []
for node in nodes_in_query:
if node in rpi_nodes:
parts.append(f"{node} is a Raspberry Pi node.")
elif node in non_rpi:
parts.append(f"{node} is not a Raspberry Pi node.")
elif node in names:
parts.append(f"{node} is in Atlas but hardware is unknown.")
else:
parts.append(f"{node} is not in the Atlas cluster.")
return " ".join(parts)
if nodes_in_query and "jetson" in q:
jets = set(groups.get("jetson", []))
parts = []
for node in nodes_in_query:
if node in jets:
parts.append(f"{node} is a Jetson node.")
elif node in names:
parts.append(f"{node} is not a Jetson node.")
else:
parts.append(f"{node} is not in the Atlas cluster.")
return " ".join(parts)
if nodes_in_query and ("is" in q or "part of" in q or "in atlas" in q or "in cluster" in q or "present" in q or "exist" in q):
parts: list[str] = []
for node in nodes_in_query:
if node in names:
return f"Yes. {node} is in the Atlas cluster."
return f"No. {node} is not in the Atlas cluster."
parts.append(f"Yes. {node} is in the Atlas cluster.")
else:
parts.append(f"No. {node} is not in the Atlas cluster.")
return " ".join(parts)
if "non-raspberry" in q or "non raspberry" in q or "not raspberry" in q:
non_rpi = sorted(set(groups.get("jetson", [])) | set(groups.get("amd64", [])))
if "besides" in q:
amd = groups.get("amd64", [])
if any(term in q for term in ("non-raspberry", "non raspberry", "not raspberry", "non-rpi", "non rpi")):
non_rpi_sorted = sorted(non_rpi)
if any(word in q for word in ("how many", "count", "number")):
return f"Atlas has {len(non_rpi_sorted)} nonRaspberry Pi nodes."
if any(phrase in q for phrase in ("besides jetson", "excluding jetson", "without jetson", "non jetson")):
amd = sorted(groups.get("amd64", []))
return f"NonRaspberry Pi nodes (excluding Jetson): {', '.join(amd)}." if amd else "No nonRaspberry Pi nodes outside Jetson."
return f"NonRaspberry Pi nodes: {', '.join(non_rpi)}." if non_rpi else "No nonRaspberry Pi nodes found."
return f"NonRaspberry Pi nodes: {', '.join(non_rpi_sorted)}." if non_rpi_sorted else "No nonRaspberry Pi nodes found."
if "jetson" in q:
jets = groups.get("jetson", [])
if any(word in q for word in ("how many", "count", "number")):
return f"Atlas has {len(jets)} Jetson nodes."
return f"Jetson nodes: {', '.join(jets)}." if jets else "No Jetson nodes found."
if "amd64" in q or "x86" in q:
amd = groups.get("amd64", [])
if any(word in q for word in ("how many", "count", "number")):
return f"Atlas has {len(amd)} amd64 nodes."
return f"amd64 nodes: {', '.join(amd)}." if amd else "No amd64 nodes found."
if "arm64" in q and "node" in q and any(word in q for word in ("how many", "count", "number")):
count = sum(1 for node in inventory if node.get("arch") == "arm64")
return f"Atlas has {count} arm64 nodes."
if "rpi4" in q:
rpi4 = groups.get("rpi4", [])
if any(word in q for word in ("how many", "count", "number")):
@ -448,28 +512,52 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s
return f"rpi5 nodes: {', '.join(rpi5)}." if rpi5 else "No rpi5 nodes found."
if "raspberry" in q or "rpi" in q:
rpi = sorted(set(groups.get("rpi4", [])) | set(groups.get("rpi5", [])))
rpi = sorted(rpi_nodes)
if any(word in q for word in ("how many", "count", "number")):
return f"Atlas has {len(rpi)} Raspberry Pi nodes."
return f"Raspberry Pi nodes: {', '.join(rpi)}." if rpi else "No Raspberry Pi nodes found."
if "arm64-unknown" in q or "unknown" in q or "no hardware" in q:
unknown = sorted(set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", [])))
unknown = sorted(unknown_hw)
return f"Unknown hardware nodes: {', '.join(unknown)}." if unknown else "No unknown hardware labels."
if "worker" in q and "node" in q:
if any(word in q for word in ("missing", "expected", "should")):
expected_workers = expected_worker_nodes_from_metrics()
missing = sorted(set(expected_workers) - set(worker_ready + worker_not_ready)) if expected_workers else []
if "missing" in q and missing:
return "Missing worker nodes: " + ", ".join(missing) + "."
if expected_workers:
msg = f"Grafana inventory expects {len(expected_workers)} workers."
if missing:
msg += f" Missing: {', '.join(missing)}."
if ("notready" in q or "not ready" in q or "unready" in q) and ("node" in q or "nodes" in q):
return "Not ready nodes: " + (", ".join(not_ready) if not_ready else "none") + "."
if "worker" in q and ("node" in q or "nodes" in q or "workers" in q):
not_ready_query = "not ready" in q or "unready" in q or "down" in q or ("not" in q and "ready" in q)
if expected_workers:
if "missing" in q:
return "Missing worker nodes: " + (", ".join(expected_missing) if expected_missing else "none") + "."
if "ready" in q and ("not ready" in q or "vs" in q or "versus" in q):
return (
f"Expected workers: {len(expected_ready)} ready, "
f"{len(expected_not_ready)} not ready (expected {len(expected_workers)})."
)
if any(word in q for word in ("how many", "count", "number")) and ("expect" in q or "expected" in q or "should" in q):
msg = f"Grafana inventory expects {len(expected_workers)} worker nodes."
if expected_missing:
msg += f" Missing: {', '.join(expected_missing)}."
return msg
return "No expected worker inventory found; using live cluster state."
if "not ready" in q or "unready" in q or "down" in q:
if not_ready_query:
if expected_not_ready or expected_missing:
detail = []
if expected_not_ready:
detail.append(f"Not ready: {', '.join(expected_not_ready)}")
if expected_missing:
detail.append(f"Missing: {', '.join(expected_missing)}")
return "Worker nodes needing attention. " + " ".join(detail) + "."
return "All expected worker nodes are Ready."
if any(word in q for word in ("expected", "expect", "should")):
msg = f"Grafana inventory expects {len(expected_workers)} worker nodes."
if expected_missing:
msg += f" Missing: {', '.join(expected_missing)}."
return msg
if any(word in q for word in ("how many", "count", "number")):
return f"Worker nodes: {len(expected_ready)} ready, {len(expected_not_ready)} not ready (expected {len(expected_workers)})."
if "ready" in q:
return f"Ready worker nodes ({len(expected_ready)}): {', '.join(expected_ready)}."
if not_ready_query:
return "Worker nodes not ready: " + (", ".join(worker_not_ready) if worker_not_ready else "none") + "."
if any(word in q for word in ("how many", "count", "number")):
return f"Worker nodes: {len(worker_ready)} ready, {len(worker_not_ready)} not ready."