atlasbot: add PromQL + cluster snapshot

2026-01-06 14:58:29 -03:00 · 2026-01-06 14:58:29 -03:00 · 221fda50a6
commit 221fda50a6
parent b313569e2f
2 changed files with 93 additions and 4 deletions
--- a/services/communication/atlasbot-configmap.yaml
+++ b/services/communication/atlasbot-configmap.yaml
@ -61,6 +61,23 @@ data:
        "othrys",
    }

+    # Keywords suggesting a prompt is about cluster health or metrics.
+    # The context builder tests each word with `w in p_l` against the
+    # lowercased prompt, i.e. as a substring, so "restart" already matches
+    # "restarts" and "crash" already matches "crashloop".
+    METRIC_HINT_WORDS = {
+        "health",
+        "status",
+        "down",
+        "slow",
+        "error",
+        "unknown_error",
+        "timeout",
+        "crash",
+        "crashloop",
+        "restart",
+        "restarts",
+        "pending",
+        "unreachable",
+        "latency",
+    }
+
    def _tokens(text: str) -> list[str]:
        """Return the lowercase tokens of *text*, minus stopwords and 1-char tokens.

        Tokens are whatever ``TOKEN_RE.findall`` yields; a falsy *text* is
        treated as the empty string.
        """
        lowered = [raw.lower() for raw in TOKEN_RE.findall(text or "")]
        kept: list[str] = []
        for tok in lowered:
            if tok in STOPWORDS or len(tok) < 2:
                continue
            kept.append(tok)
        return kept
@ -357,6 +374,42 @@ data:
        except Exception:
            return None

+    def _vm_value_series(res: dict) -> list[dict]:
+        """Return the ``data.result`` list of a successful query response.
+
+        Any other input yields an empty list: a falsy or non-dict response,
+        a ``status`` other than ``"success"``, a ``data`` member that is not
+        a dict, or a ``result`` member that is not a list. Callers can
+        therefore iterate or index the return value without type checks.
+        """
+        if not isinstance(res, dict) or res.get("status") != "success":
+            return []
+        data = res.get("data")
+        # A truthy non-dict ``data`` used to raise AttributeError on ``.get``.
+        if not isinstance(data, dict):
+            return []
+        result = data.get("result")
+        return result if isinstance(result, list) else []
+
+    def vm_render_result(res: dict | None, limit: int = 12) -> str:
+        """Render a query response as one ``- labels: value`` line per series.
+
+        Only the first *limit* entries of the result list are considered and
+        non-dict entries among them are skipped. Returns an empty string when
+        there is nothing to render.
+        """
+        preferred = ("namespace", "pod", "container", "node", "instance", "job", "phase")
+        series = _vm_value_series(res) if res else []
+        lines: list[str] = []
+        for entry in series[:limit]:
+            if not isinstance(entry, dict):
+                continue
+            metric = entry.get("metric") or {}
+            sample = entry.get("value") or []
+            # The rendered value is the second element of the "value" list.
+            shown = sample[1] if isinstance(sample, list) and len(sample) > 1 else ""
+            pairs: list[str] = []
+            if isinstance(metric, dict):
+                # Prefer common labels if present.
+                pairs = [f"{key}={metric.get(key)}" for key in preferred if metric.get(key)]
+                if not pairs:
+                    # Otherwise show the first four labels (sorted by name)
+                    # whose names do not start with a double underscore.
+                    visible = [key for key in sorted(metric.keys()) if not key.startswith("__")]
+                    pairs = [f"{key}={metric.get(key)}" for key in visible[:4]]
+            labels = ", ".join(pairs) if pairs else "series"
+            lines.append(f"- {labels}: {shown}")
+        return "\n".join(lines)
+
    def vm_top_restarts(hours: int = 1) -> str:
        q = f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{hours}h])))"
        res = vm_query(q)
@ -375,6 +428,26 @@ data:
                out.append(f"- restarts({hours}h): {ns}/{pod} = {val}")
        return "\n".join(out)

+    def vm_cluster_snapshot() -> str:
+        """Return a short text snapshot: node readiness and pod counts by phase.
+
+        Runs three queries through ``vm_query``. A section is left out when
+        its query yields nothing usable; the result may be an empty string.
+        """
+        lines: list[str] = []
+
+        # Node readiness (kube-state-metrics).
+        ready_res = vm_query('sum(kube_node_status_condition{condition="Ready",status="true"})')
+        unready_res = vm_query('sum(kube_node_status_condition{condition="Ready",status="false"})')
+        if ready_res and unready_res:
+            try:
+                ready_count = _vm_value_series(ready_res)[0]["value"][1]
+                unready_count = _vm_value_series(unready_res)[0]["value"][1]
+                lines.append(f"- nodes ready: {ready_count} (not ready: {unready_count})")
+            except Exception:
+                # Best effort: drop the readiness line if either response
+                # does not have the expected shape.
+                pass
+
+        phase_res = vm_query("sum by (phase) (kube_pod_status_phase)")
+        phase_lines = vm_render_result(phase_res, limit=8)
+        if phase_lines:
+            lines.extend(["Pod phases:", phase_lines])
+        return "\n".join(lines).strip()
+

    # Conversation state.
    history = collections.defaultdict(list)  # (room_id, sender|None) -> list[str] (short transcript)
@ -411,9 +484,14 @@ data:
            if flux_bad:
                parts.append("Flux (not ready):\n" + flux_bad)

-            restarts = vm_top_restarts(1)
-            if restarts:
-                parts.append("VictoriaMetrics (top restarts 1h):\n" + restarts)
+            p_l = (prompt or "").lower()
+            if any(w in p_l for w in METRIC_HINT_WORDS):
+                restarts = vm_top_restarts(1)
+                if restarts:
+                    parts.append("VictoriaMetrics (top restarts 1h):\n" + restarts)
+                snap = vm_cluster_snapshot()
+                if snap:
+                    parts.append("VictoriaMetrics (cluster snapshot):\n" + snap)

        return "\n\n".join([p for p in parts if p]).strip()

@ -500,6 +578,12 @@ data:
                    # Only do live cluster/metrics introspection in DMs.
                    allow_tools = is_dm

+                    promql = ""
+                    if allow_tools:
+                        m = re.match(r"(?is)^\\s*promql\\s*(?:\\:|\\s)\\s*(.+?)\\s*$", body)
+                        if m:
+                            promql = m.group(1).strip()
+
                    # Attempt to scope tools to the most likely workloads when hostnames are mentioned.
                    targets: list[tuple[str, str]] = []
                    for m in HOST_RE.finditer(body.lower()):
@ -512,6 +596,11 @@ data:
                                    targets.append((ns, str(w["name"])))

                    context = build_context(body, allow_tools=allow_tools, targets=targets)
+                    if allow_tools and promql:
+                        res = vm_query(promql, timeout=20)
+                        rendered = vm_render_result(res, limit=15) or "(no results)"
+                        extra = "VictoriaMetrics (PromQL result):\n" + rendered
+                        context = (context + "\n\n" + extra).strip() if context else extra
                    reply = ollama_reply(hist_key, body, context=context)
                    send_msg(token, rid, reply)

--- a/services/communication/atlasbot-deployment.yaml
+++ b/services/communication/atlasbot-deployment.yaml
@ -16,7 +16,7 @@ spec:
      labels:
        app: atlasbot
      annotations:
-        checksum/atlasbot-configmap: edd1d61d8010197b948343dff3d7a8913017e79a0a0098008213452f50361b44
+        checksum/atlasbot-configmap: 80fa4d62ccafbfbcdeb63f0976cbea36aada12649f15f8570932296db5d48949
    spec:
      serviceAccountName: atlasbot
      nodeSelector: