comms: handle arch node counts and extend LLM timeout

2026-01-26 09:36:08 -03:00 · 2026-01-26 09:36:08 -03:00 · 352d4991f4
commit 352d4991f4
parent 14d18048d5
1 changed files with 38 additions and 1 deletions
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@ -16,6 +16,7 @@ ROOM_ALIAS = "#othrys:live.bstein.dev"
 OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/")
 MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b-instruct-q4_0")
 API_KEY = os.environ.get("CHAT_API_KEY", "")
+OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "90"))  # timeout handed to urlopen() for the LLM request; env-overridable

 KB_DIR = os.environ.get("KB_DIR", "")
 VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428")
@ -525,6 +526,29 @@ def nodes_names_summary(cluster_name: str) -> str:
    shown = ", ".join(names[:30])
    return f"{cluster_name} node names: {shown}, … (+{len(names) - 30} more)."

+
+def nodes_arch_summary(cluster_name: str, arch: str) -> str:
+    """Return a one-line count of nodes whose kubernetes.io/arch label matches arch, or "" if listing fails or is empty."""
+    try:
+        data = k8s_get("/api/v1/nodes?limit=500")  # NOTE(review): single page only; the continue token is not followed
+    except Exception:
+        return ""  # best effort: an empty string signals "no summary" to the caller
+    items = data.get("items") if isinstance(data, dict) else None
+    if not isinstance(items, list) or not items:
+        return ""
+    # Fold common aliases onto the values that appear in the node label; unknown values pass through unchanged.
+    aliases = {"aarch64": "arm64", "x86_64": "amd64", "x86-64": "amd64"}
+    normalized = (arch or "").strip().lower()
+    arch_label = aliases.get(normalized, normalized)
+    total = 0
+    for node in items:
+        # Tolerate malformed entries (non-dict node/metadata/labels) instead of raising mid-count.
+        meta = node.get("metadata") if isinstance(node, dict) else None
+        labels = meta.get("labels") if isinstance(meta, dict) else None
+        if isinstance(labels, dict) and labels.get("kubernetes.io/arch") == arch_label:
+            total += 1
+    return f"{cluster_name} cluster has {total} {arch_label} node{'' if total == 1 else 's'}."
+
 def _strip_code_fence(text: str) -> str:
    cleaned = (text or "").strip()
    match = CODE_FENCE_RE.match(cleaned)
@ -622,7 +646,7 @@ def ollama_reply(hist_key, prompt: str, *, context: str) -> str:
        if API_KEY:
            headers["x-api-key"] = API_KEY
        r = request.Request(OLLAMA_URL, data=json.dumps(payload).encode(), headers=headers)
-        with request.urlopen(r, timeout=20) as resp:
+        with request.urlopen(r, timeout=OLLAMA_TIMEOUT_SEC) as resp:
            data = json.loads(resp.read().decode())
            raw_reply = data.get("message") or data.get("response") or data.get("reply") or data
            reply = _normalize_reply(raw_reply) or "I'm here to help."
@ -692,6 +716,19 @@ def sync_loop(token: str, room_id: str):
                            continue
                        send_msg(token, rid, summary)
                        continue
+                if "node" in lower_body and any(word in lower_body for word in ("arm64", "aarch64", "amd64", "x86_64", "x86-64")):
+                    if any(word in lower_body for word in ("cluster", "atlas", "titan")):
+                        arch = "arm64" if "arm64" in lower_body or "aarch64" in lower_body else "amd64"
+                        summary = nodes_arch_summary("Atlas", arch)
+                        if not summary:
+                            send_msg(
+                                token,
+                                rid,
+                                "I couldn’t reach the cluster API to count nodes by architecture. Try again in a moment.",
+                            )
+                            continue
+                        send_msg(token, rid, summary)
+                        continue
                if re.search(r"\bnode names?\b|\bnodes? named\b|\bnaming\b", lower_body):
                    if any(word in lower_body for word in ("cluster", "atlas", "titan")):
                        names_summary = nodes_names_summary("Atlas")