From b27c80d5c0f7221c663634491d37c08f1cccaa83 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 19:53:11 -0300 Subject: [PATCH] atlasbot: improve node inventory reasoning --- services/comms/atlasbot-deployment.yaml | 2 +- services/comms/scripts/atlasbot/bot.py | 154 +++++++++++++++++++----- 2 files changed, 122 insertions(+), 34 deletions(-) diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index c723d22..7cc66b3 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-21 + checksum/atlasbot-configmap: manual-atlasbot-22 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index d06645a..6993db2 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -89,9 +89,17 @@ METRIC_HINT_WORDS = { "latency", } -CODE_FENCE_RE = re.compile(r"^```(?:json)?\\s*(.*?)\\s*```$", re.DOTALL) -TITAN_NODE_RE = re.compile(r"\\btitan-[0-9a-z]{2}\\b", re.IGNORECASE) -TITAN_RANGE_RE = re.compile(r"\\btitan-([0-9a-z]{2})/([0-9a-z]{2})\\b", re.IGNORECASE) +CODE_FENCE_RE = re.compile(r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL) +TITAN_NODE_RE = re.compile(r"\btitan-[0-9a-z]{2}\b", re.IGNORECASE) +TITAN_RANGE_RE = re.compile(r"\btitan-([0-9a-z]{2})/([0-9a-z]{2})\b", re.IGNORECASE) +_DASH_CHARS = "\u2010\u2011\u2012\u2013\u2014\u2015\u2212\uFE63\uFF0D" + +def normalize_query(text: str) -> str: + cleaned = (text or "").lower() + for ch in _DASH_CHARS: + cleaned = cleaned.replace(ch, "-") + cleaned = re.sub(r"\s+", " ", cleaned).strip() + return cleaned def _tokens(text: str) -> list[str]: toks = [t.lower() for t in TOKEN_RE.findall(text or "")] @@ -267,14 +275,15 @@ def kb_retrieve(query: str, *, limit: int = 3) -> str: return "\n".join(parts).strip() def _extract_titan_nodes(text: str) -> list[str]: - names = {n.lower() for n in TITAN_NODE_RE.findall(text or "") if n} - for match in re.finditer(r"titan-([0-9a-z]{2}(?:[/,][0-9a-z]{2})+)", text or "", re.IGNORECASE): + cleaned = normalize_query(text) + names = {n.lower() for n in TITAN_NODE_RE.findall(cleaned) if n} + for match in re.finditer(r"titan-([0-9a-z]{2}(?:[/,][0-9a-z]{2})+)", cleaned, re.IGNORECASE): tail = match.group(1) for part in re.split(r"[/,]", tail): part = part.strip() if part: names.add(f"titan-{part.lower()}") - for match in TITAN_RANGE_RE.finditer(text or ""): + for match in TITAN_RANGE_RE.finditer(cleaned): left, right = match.groups() if left: names.add(f"titan-{left.lower()}") @@ -323,6 +332,7 @@ def node_inventory_live() -> list[dict[str, Any]]: "arch": labels.get("kubernetes.io/arch") or labels.get("beta.kubernetes.io/arch") or "", "hardware": _hardware_class(labels), "roles": _node_roles(labels), + "is_worker": _node_is_worker(node), "ready": _node_ready_status(node), } ) @@ -335,7 +345,7 @@ def _group_nodes(inventory: list[dict[str, Any]]) -> dict[str, list[str]]: return {k: sorted(v) for k, v in grouped.items()} def node_inventory_context(query: str, inventory: list[dict[str, Any]] | None = None) -> str: - q = (query or "").lower() + q = normalize_query(query) if not any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster")): return "" if inventory is None: @@ -372,7 +382,7 @@ def node_inventory_context(query: str, inventory: list[dict[str, Any]] | None = return "\n".join(lines) def node_inventory_for_prompt(prompt: str) -> list[dict[str, Any]]: - q = (prompt or "").lower() + q = normalize_query(prompt) if any(word in q for word in ("node", "nodes", "raspberry", "rpi", "jetson", "amd64", "hardware", "cluster", "worker")): return node_inventory_live() return [] @@ -382,10 +392,14 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]: ready = [node["name"] for node in inventory if node.get("ready") is True] not_ready = [node["name"] for node in inventory if node.get("ready") is False] groups = _group_nodes(inventory) - workers = [node for node in inventory if "worker" in (node.get("roles") or [])] + workers = [node for node in inventory if node.get("is_worker") is True] worker_names = [node["name"] for node in workers] worker_ready = [node["name"] for node in workers if node.get("ready") is True] worker_not_ready = [node["name"] for node in workers if node.get("ready") is False] + expected_workers = expected_worker_nodes_from_metrics() + expected_ready = [n for n in expected_workers if n in ready] if expected_workers else [] + expected_not_ready = [n for n in expected_workers if n in not_ready] if expected_workers else [] + expected_missing = [n for n in expected_workers if n not in names] if expected_workers else [] return { "names": sorted(names), "ready": sorted(ready), @@ -394,10 +408,14 @@ def _inventory_sets(inventory: list[dict[str, Any]]) -> dict[str, Any]: "worker_names": sorted(worker_names), "worker_ready": sorted(worker_ready), "worker_not_ready": sorted(worker_not_ready), + "expected_workers": expected_workers, + "expected_ready": sorted(expected_ready), + "expected_not_ready": sorted(expected_not_ready), + "expected_missing": sorted(expected_missing), } def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_summary: str) -> str: - q = (prompt or "").lower() + q = normalize_query(prompt) if metrics_summary and any(word in q for word in ("postgres", "connection", "connections", "db")): return metrics_summary @@ -412,29 +430,75 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s worker_names = sets["worker_names"] worker_ready = sets["worker_ready"] worker_not_ready = sets["worker_not_ready"] + expected_workers = sets["expected_workers"] + expected_ready = sets["expected_ready"] + expected_not_ready = sets["expected_not_ready"] + expected_missing = sets["expected_missing"] total = len(names) + nodes_in_query = _extract_titan_nodes(q) + rpi_nodes = set(groups.get("rpi4", [])) | set(groups.get("rpi5", [])) + non_rpi = set(groups.get("jetson", [])) | set(groups.get("amd64", [])) + unknown_hw = set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", [])) - for node in _extract_titan_nodes(q): - if node and ("is" in q or "part of" in q or "in atlas" in q or "in cluster" in q): + if nodes_in_query and ("raspberry" in q or "rpi" in q): + parts: list[str] = [] + for node in nodes_in_query: + if node in rpi_nodes: + parts.append(f"{node} is a Raspberry Pi node.") + elif node in non_rpi: + parts.append(f"{node} is not a Raspberry Pi node.") + elif node in names: + parts.append(f"{node} is in Atlas but hardware is unknown.") + else: + parts.append(f"{node} is not in the Atlas cluster.") + return " ".join(parts) + + if nodes_in_query and "jetson" in q: + jets = set(groups.get("jetson", [])) + parts = [] + for node in nodes_in_query: + if node in jets: + parts.append(f"{node} is a Jetson node.") + elif node in names: + parts.append(f"{node} is not a Jetson node.") + else: + parts.append(f"{node} is not in the Atlas cluster.") + return " ".join(parts) + + if nodes_in_query and ("is" in q or "part of" in q or "in atlas" in q or "in cluster" in q or "present" in q or "exist" in q): + parts: list[str] = [] + for node in nodes_in_query: if node in names: - return f"Yes. {node} is in the Atlas cluster." - return f"No. {node} is not in the Atlas cluster." + parts.append(f"Yes. {node} is in the Atlas cluster.") + else: + parts.append(f"No. {node} is not in the Atlas cluster.") + return " ".join(parts) - if "non-raspberry" in q or "non raspberry" in q or "not raspberry" in q: - non_rpi = sorted(set(groups.get("jetson", [])) | set(groups.get("amd64", []))) - if "besides" in q: - amd = groups.get("amd64", []) + if any(term in q for term in ("non-raspberry", "non raspberry", "not raspberry", "non-rpi", "non rpi")): + non_rpi_sorted = sorted(non_rpi) + if any(word in q for word in ("how many", "count", "number")): + return f"Atlas has {len(non_rpi_sorted)} non‑Raspberry Pi nodes." + if any(phrase in q for phrase in ("besides jetson", "excluding jetson", "without jetson", "non jetson")): + amd = sorted(groups.get("amd64", [])) return f"Non‑Raspberry Pi nodes (excluding Jetson): {', '.join(amd)}." if amd else "No non‑Raspberry Pi nodes outside Jetson." - return f"Non‑Raspberry Pi nodes: {', '.join(non_rpi)}." if non_rpi else "No non‑Raspberry Pi nodes found." + return f"Non‑Raspberry Pi nodes: {', '.join(non_rpi_sorted)}." if non_rpi_sorted else "No non‑Raspberry Pi nodes found." if "jetson" in q: jets = groups.get("jetson", []) + if any(word in q for word in ("how many", "count", "number")): + return f"Atlas has {len(jets)} Jetson nodes." return f"Jetson nodes: {', '.join(jets)}." if jets else "No Jetson nodes found." if "amd64" in q or "x86" in q: amd = groups.get("amd64", []) + if any(word in q for word in ("how many", "count", "number")): + return f"Atlas has {len(amd)} amd64 nodes." return f"amd64 nodes: {', '.join(amd)}." if amd else "No amd64 nodes found." + if "arm64" in q and "node" in q and any(word in q for word in ("how many", "count", "number")): + count = sum(1 for node in inventory if node.get("arch") == "arm64") + return f"Atlas has {count} arm64 nodes." + if "rpi4" in q: rpi4 = groups.get("rpi4", []) if any(word in q for word in ("how many", "count", "number")): @@ -448,28 +512,52 @@ def structured_answer(prompt: str, *, inventory: list[dict[str, Any]], metrics_s return f"rpi5 nodes: {', '.join(rpi5)}." if rpi5 else "No rpi5 nodes found." if "raspberry" in q or "rpi" in q: - rpi = sorted(set(groups.get("rpi4", [])) | set(groups.get("rpi5", []))) + rpi = sorted(rpi_nodes) if any(word in q for word in ("how many", "count", "number")): return f"Atlas has {len(rpi)} Raspberry Pi nodes." return f"Raspberry Pi nodes: {', '.join(rpi)}." if rpi else "No Raspberry Pi nodes found." if "arm64-unknown" in q or "unknown" in q or "no hardware" in q: - unknown = sorted(set(groups.get("arm64-unknown", [])) | set(groups.get("unknown", []))) + unknown = sorted(unknown_hw) return f"Unknown hardware nodes: {', '.join(unknown)}." if unknown else "No unknown hardware labels." - if "worker" in q and "node" in q: - if any(word in q for word in ("missing", "expected", "should")): - expected_workers = expected_worker_nodes_from_metrics() - missing = sorted(set(expected_workers) - set(worker_ready + worker_not_ready)) if expected_workers else [] - if "missing" in q and missing: - return "Missing worker nodes: " + ", ".join(missing) + "." - if expected_workers: - msg = f"Grafana inventory expects {len(expected_workers)} workers." - if missing: - msg += f" Missing: {', '.join(missing)}." + if ("notready" in q or "not ready" in q or "unready" in q) and ("node" in q or "nodes" in q): + return "Not ready nodes: " + (", ".join(not_ready) if not_ready else "none") + "." + + if "worker" in q and ("node" in q or "nodes" in q or "workers" in q): + not_ready_query = "not ready" in q or "unready" in q or "down" in q or ("not" in q and "ready" in q) + if expected_workers: + if "missing" in q: + return "Missing worker nodes: " + (", ".join(expected_missing) if expected_missing else "none") + "." + if "ready" in q and ("not ready" in q or "vs" in q or "versus" in q): + return ( + f"Expected workers: {len(expected_ready)} ready, " + f"{len(expected_not_ready)} not ready (expected {len(expected_workers)})." + ) + if any(word in q for word in ("how many", "count", "number")) and ("expect" in q or "expected" in q or "should" in q): + msg = f"Grafana inventory expects {len(expected_workers)} worker nodes." + if expected_missing: + msg += f" Missing: {', '.join(expected_missing)}." return msg - return "No expected worker inventory found; using live cluster state." - if "not ready" in q or "unready" in q or "down" in q: + if not_ready_query: + if expected_not_ready or expected_missing: + detail = [] + if expected_not_ready: + detail.append(f"Not ready: {', '.join(expected_not_ready)}") + if expected_missing: + detail.append(f"Missing: {', '.join(expected_missing)}") + return "Worker nodes needing attention. " + " ".join(detail) + "." + return "All expected worker nodes are Ready." + if any(word in q for word in ("expected", "expect", "should")): + msg = f"Grafana inventory expects {len(expected_workers)} worker nodes." + if expected_missing: + msg += f" Missing: {', '.join(expected_missing)}." + return msg + if any(word in q for word in ("how many", "count", "number")): + return f"Worker nodes: {len(expected_ready)} ready, {len(expected_not_ready)} not ready (expected {len(expected_workers)})." + if "ready" in q: + return f"Ready worker nodes ({len(expected_ready)}): {', '.join(expected_ready)}." + if not_ready_query: return "Worker nodes not ready: " + (", ".join(worker_not_ready) if worker_not_ready else "none") + "." if any(word in q for word in ("how many", "count", "number")): return f"Worker nodes: {len(worker_ready)} ready, {len(worker_not_ready)} not ready."