atlasbot: improve worker readiness and metrics replies

2026-01-26 18:16:14 -03:00 · 2026-01-26 18:16:14 -03:00 · 7bb1bd96fc
commit 7bb1bd96fc
parent be7846572f
2 changed files with 140 additions and 2 deletions
--- a/services/comms/atlasbot-deployment.yaml
+++ b/services/comms/atlasbot-deployment.yaml
@ -16,7 +16,7 @@ spec:
      labels:
        app: atlasbot
      annotations:
-        checksum/atlasbot-configmap: manual-atlasbot-13
+        checksum/atlasbot-configmap: manual-atlasbot-14
        vault.hashicorp.com/agent-inject: "true"
        vault.hashicorp.com/role: "comms"
        vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret"
--- a/services/comms/scripts/atlasbot/bot.py
+++ b/services/comms/scripts/atlasbot/bot.py
@ -441,7 +441,7 @@ def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]:
        return "", f"{panel}: matched dashboard panel but VictoriaMetrics did not return data."
    summary = "\n".join(rendered_parts)
    context = f"Metrics (from {dashboard} / {panel}):\n{summary}"
-    fallback = f"{panel}: {summary}"
+    fallback = _metrics_fallback_summary(panel, summary)
    return context, fallback
 def jetson_nodes_from_kb() -> list[str]:
@ -654,6 +654,115 @@ def vm_render_result(res: dict | None, limit: int = 12) -> str:
        out.append(f"- {labels}: {val}")
    return "\n".join(out)
 def _parse_metric_lines(summary: str) -> dict[str, str]:
    parsed: dict[str, str] = {}
    for line in (summary or "").splitlines():
        line = line.strip()
        if not line.startswith("-"):
            continue
        try:
            label, value = line.lstrip("-").split(":", 1)
        except ValueError:
            continue
        parsed[label.strip()] = value.strip()
    return parsed
 def _metrics_fallback_summary(panel: str, summary: str) -> str:
    """Build a short, human-readable reply for a matched metrics panel.

    Two panel families get a tailored sentence, selected by a
    case-insensitive prefix match on the panel title:

    * ``postgres connections`` -> ``used/max used (free free)`` when both the
      ``conn=used`` and ``conn=max`` series are present and numeric.
    * ``postgres hottest`` -> the first parsed ``label = value`` pair.

    Anything else (or a tailored case whose data is missing) falls back to
    ``"{panel}: {summary}"``.

    Args:
        panel: Dashboard panel title; ``None`` is treated as empty.
        summary: Rendered metric lines in ``- label: value`` form.

    Returns:
        The reply text.
    """
    parsed = _parse_metric_lines(summary)
    panel_l = (panel or "").lower()
    if panel_l.startswith("postgres connections"):
        used = parsed.get("conn=used")
        maxv = parsed.get("conn=max")
        if used and maxv:
            try:
                used_i = int(float(used))
                max_i = int(float(maxv))
            except (ValueError, OverflowError):
                # ValueError: non-numeric text or NaN; OverflowError: +/-Inf.
                # Either way, hand back the raw summary rather than crash.
                return f"Postgres connections: {summary}"
            free = max_i - used_i
            return f"Postgres connections: {used_i}/{max_i} used ({free} free)."
    if panel_l.startswith("postgres hottest"):
        if parsed:
            # Dicts preserve insertion order, so this is the first rendered line.
            label, value = next(iter(parsed.items()))
            return f"Most Postgres connections: {label} = {value}."
    return f"{panel}: {summary}"
 def _node_ready_status(node: dict) -> bool | None:
    conditions = node.get("status", {}).get("conditions") or []
    for cond in conditions if isinstance(conditions, list) else []:
        if cond.get("type") == "Ready":
            if cond.get("status") == "True":
                return True
            if cond.get("status") == "False":
                return False
            return None
    return None
 def _node_is_worker(node: dict) -> bool:
    labels = (node.get("metadata") or {}).get("labels") or {}
    if labels.get("node-role.kubernetes.io/control-plane") is not None:
        return False
    if labels.get("node-role.kubernetes.io/master") is not None:
        return False
    if labels.get("node-role.kubernetes.io/worker") is not None:
        return True
    return True
 def worker_nodes_status() -> tuple[list[str], list[str]]:
    """List worker node names grouped by readiness.

    Fetches ``/api/v1/nodes?limit=500`` through ``k8s_get`` and keeps only
    named nodes that ``_node_is_worker`` accepts. Nodes whose readiness is
    unknown (``_node_ready_status`` returned ``None``) land in neither list.

    Returns:
        ``(ready, not_ready)``, each a sorted list of node names. Both lists
        are empty when the ``k8s_get`` call raises.
    """
    try:
        payload = k8s_get("/api/v1/nodes?limit=500")
    except Exception:
        return ([], [])
    nodes = payload.get("items") or []
    if not isinstance(nodes, list):
        nodes = []
    # Bucket names by their boolean readiness; unknown readiness is dropped.
    by_state: dict[bool, list[str]] = {True: [], False: []}
    for entry in nodes:
        if not _node_is_worker(entry):
            continue
        node_name = (entry.get("metadata") or {}).get("name") or ""
        if not node_name:
            continue
        state = _node_ready_status(entry)
        if state is None:
            continue
        by_state[state].append(node_name)
    return (sorted(by_state[True]), sorted(by_state[False]))
 def expected_nodes_from_kb() -> set[str]:
    """Collect the node names the knowledge-base index expects.

    Merges every value collection of ``_NODE_CLASS_INDEX`` and drops falsy
    names as well as any name found in ``_NODE_CLASS_EXTERNAL``.

    Returns:
        The set of expected node names; empty when the index is empty/unset.
    """
    if not _NODE_CLASS_INDEX:
        return set()
    merged: set[str] = set()
    for members in _NODE_CLASS_INDEX.values():
        merged.update(members)
    return {
        name
        for name in merged
        if name and name not in _NODE_CLASS_EXTERNAL
    }
 def missing_nodes_answer(cluster_name: str) -> str:
    """Report which KB-inventory nodes are absent from the live node list.

    Compares ``expected_nodes_from_kb()`` with the node names returned by
    ``k8s_get("/api/v1/nodes?limit=500")``.

    Args:
        cluster_name: Name used as the prefix of the reply sentence.

    Returns:
        A one-sentence reply, or ``""`` when the KB yields no expected nodes
        or when fetching / reading the node list raises.
    """
    inventory = expected_nodes_from_kb()
    if not inventory:
        return ""
    try:
        payload = k8s_get("/api/v1/nodes?limit=500")
        listed = payload.get("items") or []
        if not isinstance(listed, list):
            listed = []
        # Built inside the try so malformed entries also yield "" (best effort).
        present = {
            node_name
            for node_name in (
                (entry.get("metadata") or {}).get("name") or "" for entry in listed
            )
            if node_name
        }
    except Exception:
        return ""
    absent = sorted(inventory - present)
    if absent:
        return f"{cluster_name} missing nodes versus KB inventory: {', '.join(absent)}."
    return f"{cluster_name}: no missing nodes versus KB inventory."
 def _should_short_circuit(prompt: str, fallback: str) -> bool:
    if not fallback:
        return False
    lower = (prompt or "").lower()
    for word in ("why", "explain", "architecture", "breakdown", "root cause", "plan"):
        if word in lower:
            return False
    return True
 def vm_top_restarts(hours: int = 1) -> str:
    q = f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{hours}h])))"
    res = vm_query(q)
@ -984,6 +1093,32 @@ def sync_loop(token: str, room_id: str):
                            continue
                        send_msg(token, rid, summary)
                        continue
                if "worker" in lower_body and "node" in lower_body:
                    ready_nodes, not_ready_nodes = worker_nodes_status()
                    total = len(ready_nodes) + len(not_ready_nodes)
                    if total:
                        if any(word in lower_body for word in ("ready", "not ready", "unready")):
                            if not_ready_nodes:
                                send_msg(
                                    token,
                                    rid,
                                    f"Worker nodes not Ready: {', '.join(not_ready_nodes)}.",
                                )
                            else:
                                send_msg(token, rid, f"All {len(ready_nodes)} worker nodes are Ready.")
                            continue
                        if any(word in lower_body for word in ("how many", "should")):
                            send_msg(
                                token,
                                rid,
                                f"Atlas has {total} worker nodes; {len(ready_nodes)} Ready, {len(not_ready_nodes)} NotReady.",
                            )
                            continue
                if "missing" in lower_body and "node" in lower_body:
                    missing = missing_nodes_answer("Atlas")
                    if missing:
                        send_msg(token, rid, missing)
                        continue
                inventory_answer = node_inventory_answer("Atlas", lower_body)
                if inventory_answer:
                    send_msg(token, rid, inventory_answer)
@ -1046,6 +1181,9 @@ def sync_loop(token: str, room_id: str):
                    fallback = node_inventory_answer("Atlas", lower_body)
                if metrics_fallback and not fallback:
                    fallback = metrics_fallback
                if _should_short_circuit(body, fallback):
                    send_msg(token, rid, fallback)
                    continue
                reply = ollama_reply_with_thinking(
                    token,
                    rid,