diff --git a/services/communication/atlasbot-configmap.yaml b/services/communication/atlasbot-configmap.yaml index dfbdd2c..672c4f4 100644 --- a/services/communication/atlasbot-configmap.yaml +++ b/services/communication/atlasbot-configmap.yaml @@ -61,6 +61,23 @@ data: "othrys", } + METRIC_HINT_WORDS = { + "health", + "status", + "down", + "slow", + "error", + "unknown_error", + "timeout", + "crash", + "crashloop", + "restart", + "restarts", + "pending", + "unreachable", + "latency", + } + def _tokens(text: str) -> list[str]: toks = [t.lower() for t in TOKEN_RE.findall(text or "")] return [t for t in toks if t not in STOPWORDS and len(t) >= 2] @@ -357,6 +374,42 @@ data: except Exception: return None + def _vm_value_series(res: dict) -> list[dict]: + if not res or (res.get("status") != "success"): + return [] + data = res.get("data") or {} + result = data.get("result") or [] + return result if isinstance(result, list) else [] + + def vm_render_result(res: dict | None, limit: int = 12) -> str: + if not res: + return "" + series = _vm_value_series(res) + if not series: + return "" + out: list[str] = [] + for r in series[:limit]: + if not isinstance(r, dict): + continue + metric = r.get("metric") or {} + value = r.get("value") or [] + val = value[1] if isinstance(value, list) and len(value) > 1 else "" + # Prefer common labels if present. + label_parts = [] + for k in ("namespace", "pod", "container", "node", "instance", "job", "phase"): + if isinstance(metric, dict) and metric.get(k): + label_parts.append(f"{k}={metric.get(k)}") + if not label_parts and isinstance(metric, dict): + for k in sorted(metric.keys()): + if k.startswith("__"): + continue + label_parts.append(f"{k}={metric.get(k)}") + if len(label_parts) >= 4: + break + labels = ", ".join(label_parts) if label_parts else "series" + out.append(f"- {labels}: {val}") + return "\n".join(out) + def vm_top_restarts(hours: int = 1) -> str: q = f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{hours}h])))" res = vm_query(q) @@ -375,6 +428,26 @@ data: out.append(f"- restarts({hours}h): {ns}/{pod} = {val}") return "\n".join(out) + def vm_cluster_snapshot() -> str: + parts: list[str] = [] + # Node readiness (kube-state-metrics). + ready = vm_query('sum(kube_node_status_condition{condition="Ready",status="true"})') + not_ready = vm_query('sum(kube_node_status_condition{condition="Ready",status="false"})') + if ready and not_ready: + try: + r = _vm_value_series(ready)[0]["value"][1] + nr = _vm_value_series(not_ready)[0]["value"][1] + parts.append(f"- nodes ready: {r} (not ready: {nr})") + except Exception: + pass + + phases = vm_query("sum by (phase) (kube_pod_status_phase)") + pr = vm_render_result(phases, limit=8) + if pr: + parts.append("Pod phases:") + parts.append(pr) + return "\n".join(parts).strip() + # Conversation state. history = collections.defaultdict(list) # (room_id, sender|None) -> list[str] (short transcript) @@ -411,9 +484,14 @@ data: if flux_bad: parts.append("Flux (not ready):\n" + flux_bad) - restarts = vm_top_restarts(1) - if restarts: - parts.append("VictoriaMetrics (top restarts 1h):\n" + restarts) + p_l = (prompt or "").lower() + if any(w in p_l for w in METRIC_HINT_WORDS): + restarts = vm_top_restarts(1) + if restarts: + parts.append("VictoriaMetrics (top restarts 1h):\n" + restarts) + snap = vm_cluster_snapshot() + if snap: + parts.append("VictoriaMetrics (cluster snapshot):\n" + snap) return "\n\n".join([p for p in parts if p]).strip() @@ -500,6 +578,12 @@ data: # Only do live cluster/metrics introspection in DMs. allow_tools = is_dm + promql = "" + if allow_tools: + m = re.match(r"(?is)^\\s*promql\\s*(?:\\:|\\s)\\s*(.+?)\\s*$", body) + if m: + promql = m.group(1).strip() + # Attempt to scope tools to the most likely workloads when hostnames are mentioned. targets: list[tuple[str, str]] = [] for m in HOST_RE.finditer(body.lower()): @@ -512,6 +596,11 @@ data: targets.append((ns, str(w["name"]))) context = build_context(body, allow_tools=allow_tools, targets=targets) + if allow_tools and promql: + res = vm_query(promql, timeout=20) + rendered = vm_render_result(res, limit=15) or "(no results)" + extra = "VictoriaMetrics (PromQL result):\n" + rendered + context = (context + "\n\n" + extra).strip() if context else extra reply = ollama_reply(hist_key, body, context=context) send_msg(token, rid, reply) diff --git a/services/communication/atlasbot-deployment.yaml b/services/communication/atlasbot-deployment.yaml index 5c6c87e..528d4b2 100644 --- a/services/communication/atlasbot-deployment.yaml +++ b/services/communication/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: edd1d61d8010197b948343dff3d7a8913017e79a0a0098008213452f50361b44 + checksum/atlasbot-configmap: 80fa4d62ccafbfbcdeb63f0976cbea36aada12649f15f8570932296db5d48949 spec: serviceAccountName: atlasbot nodeSelector: