atlasbot: add PromQL + cluster snapshot
This commit is contained in:
parent
b313569e2f
commit
221fda50a6
@ -61,6 +61,23 @@ data:
|
|||||||
"othrys",
|
"othrys",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Words hinting that a prompt is about cluster health or metrics.
# They are tested as substrings of the lower-cased prompt before
# VictoriaMetrics data is appended to the context.
METRIC_HINT_WORDS = {
    # general health
    "health", "status", "down", "slow", "latency",
    # failures
    "error", "unknown_error", "timeout", "unreachable",
    # pod lifecycle
    "crash", "crashloop", "restart", "restarts", "pending",
}
|
||||||
|
|
||||||
def _tokens(text: str) -> list[str]:
    """Return the lower-cased TOKEN_RE matches of *text* worth keeping.

    Matches that appear in STOPWORDS or are shorter than two characters are
    dropped. A falsy *text* is treated as the empty string.
    """
    lowered = (match.lower() for match in TOKEN_RE.findall(text or ""))
    return [tok for tok in lowered if len(tok) >= 2 and tok not in STOPWORDS]
|
||||||
@ -357,6 +374,42 @@ data:
|
|||||||
except Exception:
|
except Exception:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def _vm_value_series(res: dict) -> list[dict]:
|
||||||
|
if not res or (res.get("status") != "success"):
|
||||||
|
return []
|
||||||
|
data = res.get("data") or {}
|
||||||
|
result = data.get("result") or []
|
||||||
|
return result if isinstance(result, list) else []
|
||||||
|
|
||||||
|
def vm_render_result(res: dict | None, limit: int = 12) -> str:
    """Render a query API response as a short bullet list.

    Each rendered line has the form ``- <labels>: <value>``.

    Args:
        res: Decoded JSON query response, or ``None`` when the query failed.
        limit: Maximum number of result entries to consider.

    Returns:
        One line per series joined by newlines, or ``""`` when there is
        nothing to show. Scalar and string results are rendered as a single
        line; for range (matrix) results the most recent sample of each series
        is shown.
    """
    if not res:
        return ""
    series = _vm_value_series(res)
    if not series:
        return ""

    data = res.get("data")
    result_type = data.get("resultType") if isinstance(data, dict) else None
    if result_type in ("scalar", "string"):
        # Scalar/string results are a bare [timestamp, value] pair rather than
        # a list of series, so the per-series loop below would skip them.
        return f"- {result_type}: {series[1]}" if len(series) > 1 else ""

    # Prefer common labels if present.
    preferred = ("namespace", "pod", "container", "node", "instance", "job", "phase")
    out: list[str] = []
    for r in series[:limit]:
        if not isinstance(r, dict):
            continue
        metric = r.get("metric")
        if not isinstance(metric, dict):
            metric = {}

        value = r.get("value")
        if not (isinstance(value, list) and len(value) > 1):
            # Matrix results carry their samples under "values"; use the latest.
            samples = r.get("values")
            value = samples[-1] if isinstance(samples, list) and samples else None
        val = value[1] if isinstance(value, list) and len(value) > 1 else ""

        label_parts = [f"{k}={metric[k]}" for k in preferred if metric.get(k)]
        if not label_parts:
            # Fall back to the first four labels in sorted order, skipping
            # internal double-underscore labels such as __name__.
            visible = [k for k in sorted(metric) if not k.startswith("__")][:4]
            label_parts = [f"{k}={metric[k]}" for k in visible]

        labels = ", ".join(label_parts) if label_parts else "series"
        out.append(f"- {labels}: {val}")
    return "\n".join(out)
|
||||||
|
|
||||||
def vm_top_restarts(hours: int = 1) -> str:
|
def vm_top_restarts(hours: int = 1) -> str:
|
||||||
q = f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{hours}h])))"
|
q = f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{hours}h])))"
|
||||||
res = vm_query(q)
|
res = vm_query(q)
|
||||||
@ -375,6 +428,26 @@ data:
|
|||||||
out.append(f"- restarts({hours}h): {ns}/{pod} = {val}")
|
out.append(f"- restarts({hours}h): {ns}/{pod} = {val}")
|
||||||
return "\n".join(out)
|
return "\n".join(out)
|
||||||
|
|
||||||
|
def vm_cluster_snapshot() -> str:
    """Build a short text summary of node readiness and pod phases.

    Runs three queries through ``vm_query``: the count of Ready nodes, the
    count of not-Ready nodes, and the pod count per phase. Sections whose
    data is unavailable are left out.

    Returns:
        The summary lines joined by newlines, or ``""`` when nothing could be
        gathered.
    """
    sections: list[str] = []

    # Node readiness (kube-state-metrics).
    ready_res = vm_query('sum(kube_node_status_condition{condition="Ready",status="true"})')
    not_ready_res = vm_query('sum(kube_node_status_condition{condition="Ready",status="false"})')
    if ready_res and not_ready_res:
        try:
            ready_count = _vm_value_series(ready_res)[0]["value"][1]
            not_ready_count = _vm_value_series(not_ready_res)[0]["value"][1]
        except Exception:
            # Best effort: omit the readiness line when either response lacks
            # a usable first sample.
            pass
        else:
            sections.append(f"- nodes ready: {ready_count} (not ready: {not_ready_count})")

    phase_lines = vm_render_result(vm_query("sum by (phase) (kube_pod_status_phase)"), limit=8)
    if phase_lines:
        sections.extend(("Pod phases:", phase_lines))

    return "\n".join(sections).strip()
|
||||||
|
|
||||||
|
|
||||||
# Conversation state.
|
# Conversation state.
|
||||||
history = collections.defaultdict(list) # (room_id, sender|None) -> list[str] (short transcript)
|
history = collections.defaultdict(list) # (room_id, sender|None) -> list[str] (short transcript)
|
||||||
@ -411,9 +484,14 @@ data:
|
|||||||
if flux_bad:
|
if flux_bad:
|
||||||
parts.append("Flux (not ready):\n" + flux_bad)
|
parts.append("Flux (not ready):\n" + flux_bad)
|
||||||
|
|
||||||
|
p_l = (prompt or "").lower()
|
||||||
|
if any(w in p_l for w in METRIC_HINT_WORDS):
|
||||||
restarts = vm_top_restarts(1)
|
restarts = vm_top_restarts(1)
|
||||||
if restarts:
|
if restarts:
|
||||||
parts.append("VictoriaMetrics (top restarts 1h):\n" + restarts)
|
parts.append("VictoriaMetrics (top restarts 1h):\n" + restarts)
|
||||||
|
snap = vm_cluster_snapshot()
|
||||||
|
if snap:
|
||||||
|
parts.append("VictoriaMetrics (cluster snapshot):\n" + snap)
|
||||||
|
|
||||||
return "\n\n".join([p for p in parts if p]).strip()
|
return "\n\n".join([p for p in parts if p]).strip()
|
||||||
|
|
||||||
@ -500,6 +578,12 @@ data:
|
|||||||
# Only do live cluster/metrics introspection in DMs.
|
# Only do live cluster/metrics introspection in DMs.
|
||||||
allow_tools = is_dm
|
allow_tools = is_dm
|
||||||
|
|
||||||
|
promql = ""
|
||||||
|
if allow_tools:
|
||||||
|
m = re.match(r"(?is)^\\s*promql\\s*(?:\\:|\\s)\\s*(.+?)\\s*$", body)
|
||||||
|
if m:
|
||||||
|
promql = m.group(1).strip()
|
||||||
|
|
||||||
# Attempt to scope tools to the most likely workloads when hostnames are mentioned.
|
# Attempt to scope tools to the most likely workloads when hostnames are mentioned.
|
||||||
targets: list[tuple[str, str]] = []
|
targets: list[tuple[str, str]] = []
|
||||||
for m in HOST_RE.finditer(body.lower()):
|
for m in HOST_RE.finditer(body.lower()):
|
||||||
@ -512,6 +596,11 @@ data:
|
|||||||
targets.append((ns, str(w["name"])))
|
targets.append((ns, str(w["name"])))
|
||||||
|
|
||||||
context = build_context(body, allow_tools=allow_tools, targets=targets)
|
context = build_context(body, allow_tools=allow_tools, targets=targets)
|
||||||
|
if allow_tools and promql:
|
||||||
|
res = vm_query(promql, timeout=20)
|
||||||
|
rendered = vm_render_result(res, limit=15) or "(no results)"
|
||||||
|
extra = "VictoriaMetrics (PromQL result):\n" + rendered
|
||||||
|
context = (context + "\n\n" + extra).strip() if context else extra
|
||||||
reply = ollama_reply(hist_key, body, context=context)
|
reply = ollama_reply(hist_key, body, context=context)
|
||||||
send_msg(token, rid, reply)
|
send_msg(token, rid, reply)
|
||||||
|
|
||||||
|
|||||||
@ -16,7 +16,7 @@ spec:
|
|||||||
labels:
|
labels:
|
||||||
app: atlasbot
|
app: atlasbot
|
||||||
annotations:
|
annotations:
|
||||||
checksum/atlasbot-configmap: edd1d61d8010197b948343dff3d7a8913017e79a0a0098008213452f50361b44
|
checksum/atlasbot-configmap: 80fa4d62ccafbfbcdeb63f0976cbea36aada12649f15f8570932296db5d48949
|
||||||
spec:
|
spec:
|
||||||
serviceAccountName: atlasbot
|
serviceAccountName: atlasbot
|
||||||
nodeSelector:
|
nodeSelector:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user