diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml
index aa91dcb..d5d8f06 100644
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@@ -16,7 +16,7 @@ spec:
       labels:
         app: atlasbot
       annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-13
+        checksum/atlasbot-configmap: manual-atlasbot-14
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "comms"
         vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py
index f4182cd..57549b3 100644
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@@ -441,7 +441,7 @@ def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]:
         return "", f"{panel}: matched dashboard panel but VictoriaMetrics did not return data."
     summary = "\n".join(rendered_parts)
     context = f"Metrics (from {dashboard} / {panel}):\n{summary}"
-    fallback = f"{panel}: {summary}"
+    fallback = _metrics_fallback_summary(panel, summary)
     return context, fallback
 
 def jetson_nodes_from_kb() -> list[str]:
@@ -654,6 +654,143 @@ def vm_render_result(res: dict | None, limit: int = 12) -> str:
         out.append(f"- {labels}: {val}")
     return "\n".join(out)
 
+def _parse_metric_lines(summary: str) -> dict[str, str]:
+    """Parse rendered ``- <labels>: <value>`` lines into ``{labels: value}``.
+
+    Lines that do not start with ``-`` or have no ``:`` separator are skipped.
+    The split is made on the last ``:`` so that labels which themselves contain
+    a colon (for example ``instance=10.0.0.1:9187``) are kept intact.
+    """
+    parsed: dict[str, str] = {}
+    for line in (summary or "").splitlines():
+        line = line.strip()
+        if not line.startswith("-"):
+            continue
+        label, sep, value = line.lstrip("-").rpartition(":")
+        if not sep:
+            continue
+        parsed[label.strip()] = value.strip()
+    return parsed
+
+def _metrics_fallback_summary(panel: str, summary: str) -> str:
+    """Build a short plain-text answer from a rendered metrics summary.
+
+    Panels whose name starts with "postgres connections" or "postgres hottest"
+    get a dedicated sentence when their values can be parsed; otherwise the
+    raw summary is returned behind a short prefix.
+    """
+    parsed = _parse_metric_lines(summary)
+    panel_l = (panel or "").lower()
+    if panel_l.startswith("postgres connections"):
+        used = parsed.get("conn=used")
+        maxv = parsed.get("conn=max")
+        if used and maxv:
+            try:
+                used_i = int(float(used))
+                max_i = int(float(maxv))
+            except (ValueError, OverflowError):
+                # int() raises ValueError for NaN and OverflowError for +/-Inf.
+                return f"Postgres connections: {summary}"
+            free = max_i - used_i
+            return f"Postgres connections: {used_i}/{max_i} used ({free} free)."
+    if panel_l.startswith("postgres hottest"):
+        if parsed:
+            label, value = next(iter(parsed.items()))
+            return f"Most Postgres connections: {label} = {value}."
+    return f"{panel}: {summary}"
+
+def _node_ready_status(node: dict) -> bool | None:
+    """Return the node's ``Ready`` condition as a bool, or None if not True/False."""
+    conditions = (node.get("status") or {}).get("conditions") or []
+    for cond in conditions if isinstance(conditions, list) else []:
+        if cond.get("type") == "Ready":
+            if cond.get("status") == "True":
+                return True
+            if cond.get("status") == "False":
+                return False
+            return None
+    return None
+
+def _node_is_worker(node: dict) -> bool:
+    """Return False for nodes labelled control-plane or master, True otherwise."""
+    labels = (node.get("metadata") or {}).get("labels") or {}
+    if labels.get("node-role.kubernetes.io/control-plane") is not None:
+        return False
+    if labels.get("node-role.kubernetes.io/master") is not None:
+        return False
+    return True
+
+def worker_nodes_status() -> tuple[list[str], list[str]]:
+    """Return sorted ``(ready, not_ready)`` worker node names from the cluster API.
+
+    A node whose ``Ready`` condition is ``Unknown`` or absent is listed as not
+    ready, because an unreachable node reports ``Unknown`` rather than ``False``.
+    Both lists are empty when the node list cannot be fetched.
+    """
+    try:
+        data = k8s_get("/api/v1/nodes?limit=500")
+    except Exception:
+        return ([], [])
+    items = data.get("items") or []
+    ready_nodes: list[str] = []
+    not_ready_nodes: list[str] = []
+    for node in items if isinstance(items, list) else []:
+        if not _node_is_worker(node):
+            continue
+        name = (node.get("metadata") or {}).get("name") or ""
+        if not name:
+            continue
+        if _node_ready_status(node) is True:
+            ready_nodes.append(name)
+        else:
+            not_ready_nodes.append(name)
+    return (sorted(ready_nodes), sorted(not_ready_nodes))
+
+def expected_nodes_from_kb() -> set[str]:
+    """Return names in ``_NODE_CLASS_INDEX`` that are not in ``_NODE_CLASS_EXTERNAL``."""
+    if not _NODE_CLASS_INDEX:
+        return set()
+    nodes = set().union(*_NODE_CLASS_INDEX.values())
+    return {n for n in nodes if n and n not in _NODE_CLASS_EXTERNAL}
+
+def missing_nodes_answer(cluster_name: str) -> str:
+    """Describe which expected (KB) nodes are absent from the live node list.
+
+    Returns an empty string when the KB yields no expected nodes or the node
+    list cannot be fetched, so the caller can fall through to other handlers.
+    """
+    expected = expected_nodes_from_kb()
+    if not expected:
+        return ""
+    current = set()
+    try:
+        data = k8s_get("/api/v1/nodes?limit=500")
+        items = data.get("items") or []
+        for node in items if isinstance(items, list) else []:
+            name = (node.get("metadata") or {}).get("name") or ""
+            if name:
+                current.add(name)
+    except Exception:
+        return ""
+    missing = sorted(expected - current)
+    if not missing:
+        return f"{cluster_name}: no missing nodes versus KB inventory."
+    return f"{cluster_name} missing nodes versus KB inventory: {', '.join(missing)}."
+
+def _should_short_circuit(prompt: str, fallback: str) -> bool:
+    """Return True when ``fallback`` should be sent directly as the reply.
+
+    An empty fallback, or a prompt containing an explanatory keyword such as
+    "why" or "explain", never short-circuits.
+    """
+    if not fallback:
+        return False
+    lower = (prompt or "").lower()
+    for word in ("why", "explain", "architecture", "breakdown", "root cause", "plan"):
+        if word in lower:
+            return False
+    return True
+
 def vm_top_restarts(hours: int = 1) -> str:
     q = f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{hours}h])))"
     res = vm_query(q)
@@ -984,6 +1121,32 @@ def sync_loop(token: str, room_id: str):
                         continue
                     send_msg(token, rid, summary)
                     continue
+                if "worker" in lower_body and "node" in lower_body:
+                    ready_nodes, not_ready_nodes = worker_nodes_status()
+                    total = len(ready_nodes) + len(not_ready_nodes)
+                    if total:
+                        if any(word in lower_body for word in ("ready", "not ready", "unready")):
+                            if not_ready_nodes:
+                                send_msg(
+                                    token,
+                                    rid,
+                                    f"Worker nodes not Ready: {', '.join(not_ready_nodes)}.",
+                                )
+                            else:
+                                send_msg(token, rid, f"All {len(ready_nodes)} worker nodes are Ready.")
+                            continue
+                        if any(word in lower_body for word in ("how many", "should")):
+                            send_msg(
+                                token,
+                                rid,
+                                f"Atlas has {total} worker nodes; {len(ready_nodes)} Ready, {len(not_ready_nodes)} NotReady.",
+                            )
+                            continue
+                if "missing" in lower_body and "node" in lower_body:
+                    missing = missing_nodes_answer("Atlas")
+                    if missing:
+                        send_msg(token, rid, missing)
+                        continue
                 inventory_answer = node_inventory_answer("Atlas", lower_body)
                 if inventory_answer:
                     send_msg(token, rid, inventory_answer)
@@ -1046,6 +1209,9 @@ def sync_loop(token: str, room_id: str):
                     fallback = node_inventory_answer("Atlas", lower_body)
                 if metrics_fallback and not fallback:
                     fallback = metrics_fallback
+                if _should_short_circuit(body, fallback):
+                    send_msg(token, rid, fallback)
+                    continue
                 reply = ollama_reply_with_thinking(
                     token,
                     rid,